Filter the links after the index section

This commit is contained in:
Matheus Felipe 2021-08-24 06:15:28 -03:00
parent 60245abb31
commit f921f4ec5d
No known key found for this signature in database
GPG key ID: AA785C523274872F

View file

@ -14,12 +14,19 @@ ignored_links = [
def parse_links(filename):
    """Returns a list of URLs from text file.

    Only the text from the '## Index' heading onwards is scanned, so links
    that appear before the index section are ignored. If the heading is not
    present, the whole file is scanned instead. A trailing '/' is stripped
    from every URL that is returned.

    Args:
        filename: Path of the UTF-8 encoded text file to scan.

    Returns:
        The matched URLs as a list of strings, in order of appearance.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    # str.find returns -1 when the marker is missing; slicing from -1 would
    # keep only the last character and silently yield no links, so clamp the
    # offset to the start of the file in that case.
    index_section = max(readme.find('## Index'), 0)
    content = readme[index_section:]

    # Raw string: the pattern relies on regex escapes such as \d and \s that
    # are not valid string-literal escapes.
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        content)

    # The pattern has several groups, so findall yields one tuple per match;
    # element 0 is the outermost group, i.e. the complete URL.
    return [raw_link[0].rstrip('/') for raw_link in raw_links]
def dup_links(links):
@ -30,7 +37,6 @@ def dup_links(links):
dupes = []
for link in links:
link = link.rstrip('/')
if link in ignored_links:
continue