mirror of
https://github.com/public-apis/public-apis.git
synced 2025-05-10 15:46:48 +02:00
Filter the links after the index section
This commit is contained in:
parent
60245abb31
commit
f921f4ec5d
1 changed file with 11 additions and 5 deletions
|
@ -14,12 +14,19 @@ ignored_links = [
|
|||
|
||||
def parse_links(filename):
|
||||
"""Returns a list of URLs from text file"""
|
||||
with open(filename) as fp:
|
||||
data = fp.read()
|
||||
with open(filename, mode='r', encoding='utf-8') as fp:
|
||||
readme = fp.read()
|
||||
index_section = readme.find('## Index')
|
||||
content = readme[index_section:]
|
||||
|
||||
raw_links = re.findall(
|
||||
'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
|
||||
data)
|
||||
links = [raw_link[0] for raw_link in raw_links]
|
||||
content)
|
||||
|
||||
links = [
|
||||
str(raw_link[0]).rstrip('/') for raw_link in raw_links
|
||||
]
|
||||
|
||||
return links
|
||||
|
||||
def dup_links(links):
|
||||
|
@ -30,7 +37,6 @@ def dup_links(links):
|
|||
dupes = []
|
||||
|
||||
for link in links:
|
||||
link = link.rstrip('/')
|
||||
if link in ignored_links:
|
||||
continue
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue