# html_to_markdown/url_utils.py
"""Helpers that replace long URLs in parsed markdown elements with short refs."""
from typing import Dict, Optional

# Lower-case file extensions (without the dot).  When the last path segment of
# an http(s) URL ends in one of these, process_url() keeps the file name and
# replaces only the part of the URL before it with a short reference.
media_suffixes = [
    "jpeg", "jpg", "png", "gif", "bmp", "tiff", "tif", "svg", "webp", "ico",
    "avi", "mov", "mp4", "mkv", "flv", "wmv", "webm", "mpeg", "mpg",
    "mp3", "wav", "aac", "ogg", "flac", "m4a",
    "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "txt",
    "css", "js", "xml", "json", "html", "htm"
]


def add_ref_prefix(prefix: str, prefixes_to_refs: Dict[str, str]) -> str:
    """Return the short reference name for *prefix*, registering it if new.

    Args:
        prefix: The string (URL or URL prefix) to be abbreviated.
        prefixes_to_refs: Mapping from already-seen prefixes to their
            reference names.  Mutated in place when *prefix* is new.

    Returns:
        The reference name, of the form ``ref<N>`` where ``N`` is the number
        of entries the mapping held when *prefix* was first registered.
    """
    if prefix not in prefixes_to_refs:
        prefixes_to_refs[prefix] = f'ref{len(prefixes_to_refs)}'
    return prefixes_to_refs[prefix]


def process_url(url: str, prefixes_to_refs: Dict[str, str]) -> str:
    """Shorten *url* by substituting a reference name where applicable.

    Rules, applied in order:

    * A value that is not a string, or does not start with ``'http'``, is
      returned unchanged.
    * If the text after the last ``'.'`` in the final ``'/'``-separated
      segment is (case-insensitively) one of ``media_suffixes``, everything
      before that segment is registered as a prefix and the result is
      ``"<ref>://<last segment>"``.
    * Otherwise, if the URL splits into more than four ``'/'``-separated
      parts, the whole URL is registered and its reference name is returned.
    * Otherwise the URL is returned unchanged.

    Args:
        url: The URL to process.
        prefixes_to_refs: Mapping of prefixes to reference names; new entries
            are added in place.

    Returns:
        The shortened form, or *url* itself when no rule applies.
    """
    # Guard against non-string values (e.g. an element whose 'href' is None)
    # so that a missing URL does not raise AttributeError.
    if not isinstance(url, str) or not url.startswith('http'):
        return url

    parts = url.split('/')
    filename = parts[-1]
    media_suffix = filename.split('.')[-1].lower()

    if media_suffix in media_suffixes:
        # Keep the file name visible; only the directory part is abbreviated.
        prefix = '/'.join(parts[:-1])
        ref_prefix = add_ref_prefix(prefix, prefixes_to_refs)
        return f"{ref_prefix}://{filename}"

    # "scheme:", "", "host", "segment" is four parts; anything deeper is
    # replaced wholesale by a reference.
    if len(parts) > 4:
        return add_ref_prefix(url, prefixes_to_refs)
    return url


def refify_urls(markdown_elements: list,
                prefixes_to_refs: Optional[Dict[str, str]] = None) -> Dict[str, str]:
    """Rewrite URLs inside *markdown_elements* in place and return the ref map.

    Only ``dict`` elements are inspected; their ``'type'`` key selects the
    handling:

    * ``'link'``: ``'href'`` is rewritten, then ``'content'`` is processed
      recursively.
    * ``'image'`` / ``'video'``: ``'src'`` is rewritten.
    * ``'list'``: the ``'content'`` of every entry in ``'items'`` is processed
      recursively.
    * ``'table'``: for every cell in every row, ``'content'`` is processed
      recursively when it is a list.
    * ``'blockquote'`` / ``'semanticHtml'``: ``'content'`` is processed
      recursively.

    Args:
        markdown_elements: Iterable of parsed elements; matching dicts are
            mutated in place.
        prefixes_to_refs: Existing prefix-to-reference mapping to extend.
            When omitted, a fresh mapping is created for this call.

    Returns:
        The mapping from original prefixes/URLs to reference names (the same
        object as *prefixes_to_refs* when one was supplied).
    """
    # A fresh dict per top-level call: a ``{}`` default would be shared by
    # every call and leak references between unrelated documents.  The
    # ``is None`` test (not truthiness) keeps a caller-supplied empty dict.
    if prefixes_to_refs is None:
        prefixes_to_refs = {}

    for element in markdown_elements:
        if not isinstance(element, dict):
            continue

        node_type = element.get('type')
        if node_type == 'link':
            original_href = element.get('href', '')
            element['href'] = process_url(original_href, prefixes_to_refs)
            refify_urls(element.get('content', []), prefixes_to_refs)
        elif node_type in ['image', 'video']:
            original_src = element.get('src', '')
            element['src'] = process_url(original_src, prefixes_to_refs)
        elif node_type == 'list':
            for item in element.get('items', []):
                refify_urls(item.get('content', []), prefixes_to_refs)
        elif node_type == 'table':
            for row in element.get('rows', []):
                for cell in row.get('cells', []):
                    # Cell content that is not a list is left untouched.
                    if isinstance(cell.get('content'), list):
                        refify_urls(cell['content'], prefixes_to_refs)
        elif node_type in ['blockquote', 'semanticHtml']:
            refify_urls(element.get('content', []), prefixes_to_refs)

    return prefixes_to_refs