# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
# Thanks to Microsoft researchers for open-sourcing this!
# type: ignore
import base64
import copy
import html
import json
import mimetypes
import os
import re
import shutil
import subprocess
import sys
import tempfile
import traceback
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse

import mammoth
import markdownify
import pandas as pd
import pdfminer
import pdfminer.high_level
import pptx

# File-format detection
import puremagic
import pydub
import requests
import speech_recognition as sr
from bs4 import BeautifulSoup
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter


class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
            return alt

        # Remove dataURIs
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        return super().convert_soup(soup)  # type: ignore
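

# Illustrative sketch (not part of the upstream module): exercising
# _CustomMarkdownify directly on parsed HTML. The sample fragment is invented
# for demonstration.
def _demo_custom_markdownify() -> str:
    soup = BeautifulSoup('<h1>Title</h1><a href="javascript:alert(1)">link</a>', "html.parser")
    # Headings come out in ATX style ("# Title"), and the javascript: link is
    # reduced to its plain text.
    return _CustomMarkdownify().convert_soup(soup)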
return "" % (alt, src, title_part) def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore class DocumentConverterResult: """The result of converting a document to text.""" def __init__(self, title: Union[str, None] = None, text_content: str = ""): self.title: Union[str, None] = title self.text_content: str = text_content class DocumentConverter: """Abstract superclass of all DocumentConverters.""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: raise NotImplementedError() class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: # Guess the content type from any file extension that might be around content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", "")) # Only accept text files if content_type is None: return None # elif "text/" not in content_type.lower(): # return None text_content = "" with open(local_path, "rt", encoding="utf-8") as fh: text_content = fh.read() return DocumentConverterResult( title=None, text_content=text_content, ) class JavaScriptConverter(DocumentConverter): """Handle JavaScript files (.js, .mjs) as plain text""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: # Check if it's a JavaScript file extension = kwargs.get("file_extension", "") if extension.lower() not in [".js", ".mjs"]: return None try: # Read the file as text with open(local_path, "rt", encoding="utf-8") as fh: text_content = fh.read() return DocumentConverterResult( title=f"JavaScript file: {os.path.basename(local_path)}", text_content=text_content, ) except Exception as e: # If UTF-8 fails, try with different encoding try: with open(local_path, "rt", encoding="latin-1") as fh: text_content = fh.read() return DocumentConverterResult( title=f"JavaScript file: {os.path.basename(local_path)}", text_content=text_content, ) except Exception: return None class HtmlConverter(DocumentConverter): """Anything with content type text/html""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: # Bail if not html extension = kwargs.get("file_extension", "") if extension.lower() not in [".html", ".htm"]: return None result = None with open(local_path, "rt", encoding="utf-8") as fh: result = self._convert(fh.read()) return result def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: """Helper function that converts and HTML string.""" # Parse the string soup = BeautifulSoup(html_content, "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): script.extract() # Print only the main content body_elm = soup.find("body") webpage_text = "" if body_elm: webpage_text = _CustomMarkdownify().convert_soup(body_elm) else: webpage_text = _CustomMarkdownify().convert_soup(soup) assert isinstance(webpage_text, str) return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text ) class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: # Bail if not Wikipedia extension = kwargs.get("file_extension", "") if extension.lower() not in [".html", ".htm"]: return None url = kwargs.get("url", "") if not 
re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): return None # Parse the file soup = None with open(local_path, "rt", encoding="utf-8") as fh: soup = BeautifulSoup(fh.read(), "html.parser") # Remove javascript and style blocks for script in soup(["script", "style"]): script.extract() # Print only the main content body_elm = soup.find("div", {"id": "mw-content-text"}) title_elm = soup.find("span", {"class": "mw-page-title-main"}) webpage_text = "" main_title = None if soup.title is None else soup.title.string if body_elm: # What's the title if title_elm and len(title_elm) > 0: main_title = title_elm.string # type: ignore assert isinstance(main_title, str) # Convert the page webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm) else: webpage_text = _CustomMarkdownify().convert_soup(soup) return DocumentConverterResult( title=main_title, text_content=webpage_text, ) class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: # Bail if not YouTube extension = kwargs.get("file_extension", "") if extension.lower() not in [".html", ".htm"]: return None url = kwargs.get("url", "") if not url.startswith("https://www.youtube.com/watch?"): return None # Parse the file soup = None with open(local_path, "rt", encoding="utf-8") as fh: soup = BeautifulSoup(fh.read(), "html.parser") # Read the meta tags assert soup.title is not None and soup.title.string is not None metadata: Dict[str, str] = {"title": soup.title.string} for meta in soup(["meta"]): for a in meta.attrs: if a in ["itemprop", "property", "name"]: metadata[meta[a]] = meta.get("content", "") break # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation try: for script in soup(["script"]): content = script.text if "ytInitialData" in content: lines = re.split(r"\r?\n", content) obj_start = lines[0].find("{") obj_end = lines[0].rfind("}") if obj_start >= 0 and obj_end >= 0: data = json.loads(lines[0][obj_start : obj_end + 1]) attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore if attrdesc: metadata["description"] = str(attrdesc["content"]) break except Exception: pass # Start preparing the page webpage_text = "# YouTube\n" title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore assert isinstance(title, str) if title: webpage_text += f"\n## {title}\n" stats = "" views = self._get(metadata, ["interactionCount"]) # type: ignore if views: stats += f"- **Views:** {views}\n" keywords = self._get(metadata, ["keywords"]) # type: ignore if keywords: stats += f"- **Keywords:** {keywords}\n" runtime = self._get(metadata, ["duration"]) # type: ignore if runtime: stats += f"- **Runtime:** {runtime}\n" if len(stats) > 0: webpage_text += f"\n### Video Metadata\n{stats}\n" description = self._get(metadata, ["description", "og:description"]) # type: ignore if description: webpage_text += f"\n### Description\n{description}\n" transcript_text = "" parsed_url = urlparse(url) # type: ignore params = parse_qs(parsed_url.query) # type: ignore if "v" in params: assert isinstance(params["v"][0], str) video_id = str(params["v"][0]) try: # Must be a single transcript. 


class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pdf":
            return None

        return DocumentConverterResult(
            title=None,
            text_content=pdfminer.high_level.extract_text(local_path),
        )


class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".docx":
            return None

        result = None
        with open(local_path, "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file)
            html_content = result.value
            result = self._convert(html_content)

        return result


class XlsxConverter(HtmlConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an XLSX
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xlsx", ".xls"]:
            return None

        sheets = pd.read_excel(local_path, sheet_name=None)
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
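

# Illustrative sketch (not part of the upstream module): the XLSX path goes
# DataFrame -> HTML -> Markdown, reusing HtmlConverter._convert for the table
# rendering. The file name is an assumption, and writing the workbook needs an
# Excel engine such as openpyxl installed.
def _demo_xlsx_roundtrip(tmp_path: str = "demo.xlsx") -> str:
    pd.DataFrame({"name": ["a", "b"], "value": [1, 2]}).to_excel(tmp_path, index=False)
    result = XlsxConverter().convert(tmp_path, file_extension=".xlsx")
    # One "## <sheet name>" heading per sheet, each followed by a Markdown table.
    return result.text_content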
" + html.escape(cell.text) + " | " else: html_table += "" + html.escape(cell.text) + " | " html_table += "
---|
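

# Illustrative sketch (not part of the upstream module): slides are emitted in
# order, separated by "<!-- Slide number: N -->" comments, with pictures
# rendered as Markdown image links and tables routed through the HTML table
# path above. The file path is an assumption.
def _demo_pptx(path: str = "deck.pptx") -> Union[None, DocumentConverterResult]:
    return PptxConverter().convert(path, file_extension=".pptx")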