mirror of
https://github.com/public-apis/public-apis.git
synced 2025-05-09 23:26:52 +02:00
Check if a link is working
This commit is contained in:
parent
d06a3717d4
commit
7be0512b54
1 changed files with 78 additions and 1 deletions
|
@ -1,9 +1,12 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import re
|
||||
import sys
|
||||
import random
|
||||
from typing import List, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def find_links_in_text(text: str) -> List[str]:
|
||||
"""Find links in a text and return a list of URLs."""
|
||||
|
@ -55,6 +58,80 @@ def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
|
|||
return (has_duplicate, duplicates)
|
||||
|
||||
|
||||
def fake_user_agent() -> str:
    """Return a randomly picked browser-like User-Agent string.

    Some hosting services reject requests whose User-Agent is not on their
    whitelist, so requests are sent with one of a few real browser strings.
    """

    # Fixed pool of desktop browser identifiers to pick from.
    browser_pool = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    )

    chosen = random.choice(browser_pool)
    return chosen
|
||||
|
||||
|
||||
def get_host_from_link(link: str) -> str:
    """Extract the host part of a link.

    The scheme (everything up to and including ``://``) is dropped when
    present, then the remainder is cut at the first ``/``, ``?`` or ``#``,
    whichever comes first, so routes, arguments and anchors are removed.

    Args:
        link: The URL to extract the host from. A link without a scheme is
            accepted and treated as starting directly with the host.

    Returns:
        The host portion of the link (userinfo and port, if any, are kept).
    """

    # Tolerate links without a scheme instead of raising IndexError.
    _, scheme_sep, remainder = link.partition('://')
    host = remainder if scheme_sep else link

    # Remove routes, arguments and anchors. Every delimiter is applied in
    # turn (not as an if/elif chain) so that a '/' appearing inside a query
    # string or anchor cannot hide an earlier '?' or '#'.
    for delimiter in ('/', '?', '#'):
        host = host.split(delimiter, 1)[0]

    return host
|
||||
|
||||
|
||||
def check_if_link_is_working(link: str) -> Tuple[bool, str]:
    """Check whether a link is working.

    A GET request is sent to the link with a 25 second timeout, a
    browser-like User-Agent and an explicit ``host`` header.

    Args:
        link: The URL to request. It is requested exactly as given.

    Returns:
        A ``(has_error, error_message)`` tuple.

        If an error is identified when the request for the link occurs,
        the first value is True and the second is a string containing the
        error message, prefixed with one of:

        * ``ERR:CLT`` - the response status code is 400 or above
        * ``ERR:TMO`` - the request timed out (connect or read)
        * ``ERR:SSL`` - an SSL error occurred
        * ``ERR:TMR`` - too many redirects
        * ``ERR:UKN`` - any other exception

        If no errors are identified, the first value is False and the
        second is an empty string.
    """

    has_error = False
    error_message = ''

    try:
        # Request the link unchanged: appending '/' would corrupt links that
        # end in a query string or anchor and can turn a valid file path
        # into a non-existent directory path.
        resp = requests.get(link, timeout=25, headers={
            'User-Agent': fake_user_agent(),
            'host': get_host_from_link(link)
        })

        code = resp.status_code

        if code >= 400:
            has_error = True
            error_message = f'ERR:CLT: {code} : {link}'

    # requests.exceptions.Timeout is the common base of ConnectTimeout and
    # ReadTimeout, so slow responses are reported as timeouts as well.
    except (TimeoutError, requests.exceptions.Timeout):
        has_error = True
        error_message = f'ERR:TMO: {link}'

    except requests.exceptions.SSLError as error:
        has_error = True
        error_message = f'ERR:SSL: {error} : {link}'

    except requests.exceptions.TooManyRedirects as error:
        has_error = True
        error_message = f'ERR:TMR: {error} : {link}'

    # Deliberate catch-all: any failure must be reported as a link error
    # rather than abort the check.
    except Exception as error:
        has_error = True
        error_message = f'ERR:UKN: {error} : {link}'

    return (has_error, error_message)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
num_args = len(sys.argv)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue