Spaces:
Build error
Build error
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
import argparse | |
import logging | |
import os | |
import pathlib | |
import re | |
import sys | |
from multiprocessing.dummy import Pool | |
from typing import NamedTuple, Optional, Tuple | |
import requests | |
from mmcv.utils import get_logger | |
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command-line options of the link checker.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, so callers that pass nothing
            behave exactly as before.

    Returns:
        The parsed namespace with the attributes ``num_threads``,
        ``https_proxy`` and ``out``.
    """
    parser = argparse.ArgumentParser(
        description='Goes through all the inline-links '
        'in markdown files and reports the breakages')
    parser.add_argument(
        '--num-threads',
        type=int,
        default=100,
        # The option counts threads, not processes.
        help='Number of threads used to confirm the links')
    parser.add_argument('--https-proxy', type=str, help='https proxy')
    parser.add_argument(
        '--out',
        type=str,
        default='link_reports.txt',
        help='output path of reports')
    return parser.parse_args(argv)
# HTTP status codes that are accepted as proof that a link target exists,
# in addition to whatever the response itself reports as OK.
OK_STATUS_CODES = (
    200,
    401,  # The resource exists but may require some sort of login.
    403,  # Same situation as 401.
    405,  # HEAD method not allowed.
    406,  # The resource exists, but our default 'Accept-' header may
          # not match what the server can provide.
)
class MatchTuple(NamedTuple):
    """One inline markdown link together with the file it was found in."""

    # Path of the markdown file that contains the link.
    source: str
    # Text captured from between the square brackets of the link.
    name: str
    # Target captured from between the parentheses of the link.
    link: str
def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: Optional[logging.Logger] = None
) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Check a single markdown link and report the outcome.

    Links that carry an explicit ``http://`` or ``https://`` scheme are
    probed with ``check_url``; every other link is handed to
    ``check_path``.

    Args:
        match_tuple: The link to check.
        http_session: Session used for the HTTP probe.
        logger: Where the one-line result is written. When ``None`` the
            result is printed instead.

    Returns:
        A tuple ``(match_tuple, result_ok, reason)``. ``reason`` is only
        set for URL links; it stays ``None`` for path links.
    """
    reason: Optional[str] = None
    # Match the full scheme so that a relative path which merely starts
    # with "http" (e.g. "httpd.md") is not mistaken for a URL.
    if match_tuple.link.startswith(('http://', 'https://')):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)

    # Distinct marks so passing and failing links can be told apart.
    status_line = f" {'✓' if result_ok else '✗'} {match_tuple.link}"
    if logger is None:
        print(status_line)
    else:
        logger.info(status_line)
    return match_tuple, result_ok, reason
def check_url(match_tuple: MatchTuple,
              http_session: requests.Session) -> Tuple[bool, str]:
    """Check if a URL is reachable.

    Sends a HEAD request (following redirects, 5 second timeout) to
    ``match_tuple.link``.

    Args:
        match_tuple: The link to probe; only its ``link`` field is read.
        http_session: Session used to send the request.

    Returns:
        A tuple ``(ok, reason)``. ``ok`` is true when the response is OK
        or its status code is listed in ``OK_STATUS_CODES``. ``reason``
        holds the status code, or a short description when the request
        itself failed.
    """
    try:
        result = http_session.head(
            match_tuple.link, timeout=5, allow_redirects=True)
    except (requests.ConnectionError, requests.Timeout):
        return False, 'connection error'
    except requests.RequestException as error:
        # Any other request failure (malformed URL, redirect loop, ...)
        # marks this link as broken instead of aborting the whole run.
        return False, f'request failed: {type(error).__name__}'
    return (
        result.ok or result.status_code in OK_STATUS_CODES,
        f'status code = {result.status_code}',
    )
def check_path(match_tuple: MatchTuple) -> bool:
    """Tell whether a link points at an existing path.

    The link is resolved relative to the directory of the markdown file
    it was found in.
    """
    # Only the part before the first '#' is looked up on disk.
    target, _, _ = match_tuple.link.partition('#')
    source_dir = os.path.dirname(str(match_tuple.source))
    return os.path.exists(os.path.join(source_dir, target))
def _collect_links(project_root):
    """Return the set of inline links in all markdown files under a root.

    Links whose target contains ``localhost`` are skipped.
    """
    url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
    all_matches = set()
    for markdown_file in project_root.glob('**/*.md'):
        # Decode explicitly so the scan does not depend on the locale;
        # undecodable bytes are replaced rather than aborting the run.
        with open(markdown_file, encoding='utf-8',
                  errors='replace') as handle:
            for line in handle:
                for name, link in url_regex.findall(line):
                    if 'localhost' not in link:
                        all_matches.add(
                            MatchTuple(
                                source=str(markdown_file),
                                name=name,
                                link=link))
    return all_matches


def main():
    """Check every inline markdown link of the project and report failures.

    Markdown files are searched recursively from the parent directory of
    the directory containing this script. All results are written through
    the logger created from ``--out``. The process exits with status 1
    when at least one link is unreachable.
    """
    args = parse_args()

    # setup logger
    logger = get_logger(name='mmdet', log_file=args.out)

    # setup https_proxy
    if args.https_proxy:
        os.environ['https_proxy'] = args.https_proxy

    logger.info('Finding all markdown files in the current directory...')
    project_root = (pathlib.Path(__file__).parent / '..').resolve()
    # Sorted so that the order of the report is reproducible.
    all_matches = sorted(_collect_links(project_root))
    # The collection holds links, not files.
    logger.info(f' {len(all_matches)} markdown links found')

    logger.info('Checking to make sure we can retrieve each link...')
    # The session is closed once all links have been checked.
    with requests.Session() as http_session:
        for resource_prefix in ('http://', 'https://'):
            http_session.mount(
                resource_prefix,
                requests.adapters.HTTPAdapter(
                    max_retries=5,
                    pool_connections=20,
                    pool_maxsize=args.num_threads),
            )
        with Pool(processes=args.num_threads) as pool:
            results = pool.starmap(check_link,
                                   [(match, http_session, logger)
                                    for match in all_matches])

    # collect unreachable results
    unreachable_results = [(match_tuple, reason)
                           for match_tuple, success, reason in results
                           if not success]

    if unreachable_results:
        logger.info('================================================')
        logger.info(f'Unreachable links ({len(unreachable_results)}):')
        for match_tuple, reason in unreachable_results:
            logger.info(' > Source: ' + match_tuple.source)
            logger.info(' Name: ' + match_tuple.name)
            logger.info(' Link: ' + match_tuple.link)
            if reason is not None:
                logger.info(' Reason: ' + reason)
        sys.exit(1)
    logger.info('No Unreachable link found.')
# Run the link check only when this file is executed as a script.
if __name__ == '__main__':
    main()