import os
import re
from pathlib import Path

from litellm import completion
from litellm.exceptions import BadRequestError
from mistralai import Mistral
from mistralai.models import OCRPageObject, OCRResponse, OCRUsageInfo

# Size limit in bytes (49 MiB).
# NOTE(review): not referenced anywhere in this module's visible code.
MAX_SIZE_BYTES = 49 * 1024 * 1024

# Zero-width match at the start of every line that opens with "# ", i.e. a
# top-level markdown heading. "## " / "### " lines and "# " occurring in the
# middle of a line do not match.
_TOP_LEVEL_HEADING = re.compile(r"^(?=# )", flags=re.MULTILINE)


async def convert_pdf_to_markdown_async(
    pdf_path: Path,
    with_image_description: bool = False,
) -> OCRResponse:
    """Upload a PDF to Mistral and run OCR on it.

    The file is uploaded with purpose ``"ocr"``, a signed URL is requested for
    the uploaded file, and that URL is passed to the ``mistral-ocr-latest``
    model with base64 image payloads included in the response.

    Args:
        pdf_path: Path of the PDF file to process.
        with_image_description: Currently unused; kept for interface
            compatibility.

    Returns:
        The OCR response returned by the Mistral client.
    """
    # The API key is read from the MISTRAL_API_KEY environment variable.
    mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

    # Hold the file handle only for the duration of the upload so it is
    # always closed, even if the upload raises.
    with open(pdf_path, "rb") as pdf_file:
        uploaded_pdf = await mistral_client.files.upload_async(
            file={
                "file_name": "uploaded_file.pdf",
                "content": pdf_file,
            },
            purpose="ocr",
        )

    signed_url = await mistral_client.files.get_signed_url_async(
        file_id=uploaded_pdf.id
    )
    ocr_response = await mistral_client.ocr.process_async(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": signed_url.url},
        include_image_base64=True,
    )
    # NOTE: this message is emitted after the OCR call has returned.
    print(f"Processing PDF: {pdf_path.name}")
    return ocr_response


def convert_ocr_response_to_markdown(ocr_response: OCRResponse) -> str:
    """Join the markdown of every page, separated by a blank line.

    Args:
        ocr_response: The OCR response whose pages are concatenated.

    Returns:
        The pages' markdown joined with ``"\\n\\n"``, in page order.
    """
    return "\n\n".join(page.markdown for page in ocr_response.pages)


def get_markdown_by_page_numbers(
    markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False
) -> str:
    """Return the markdown of the selected pages, each prefixed by a label.

    Page numbers are zero-based indices into ``markdown.pages``. Duplicates
    are collapsed and pages are emitted in ascending order.

    Args:
        markdown: The OCR response to read pages from.
        page_numbers: Indices of the pages to include.
        get_full_content: When true, ``page_numbers`` is ignored and every
            page is included.

    Returns:
        One ``*Page N*`` section per selected page, joined with ``"\\n\\n"``.
    """
    if get_full_content:
        selected_pages = range(len(markdown.pages))
    else:
        # A set has no guaranteed iteration order, so sort after
        # de-duplicating to keep the output in document order.
        selected_pages = sorted(set(page_numbers))

    return "\n\n".join(
        f"*Page {page_number}*\n{markdown.pages[page_number].markdown}"
        for page_number in selected_pages
    )


def find_in_markdown(
    markdown: OCRResponse, search_queries: list[str] | str
) -> list[int]:
    """Find the pages whose markdown contains any of the search queries.

    Matching is a case-insensitive substring test.

    Args:
        markdown: The OCR response to search.
        search_queries: A single query or a list of queries.

    Returns:
        Zero-based indices of the matching pages, in ascending order. Each
        page appears at most once, even if several queries match it.
    """
    if isinstance(search_queries, str):
        search_queries = [search_queries]
    lowered_queries = [query.lower() for query in search_queries]

    page_numbers: list[int] = []
    for page_number, page in enumerate(markdown.pages):
        page_text = page.markdown.lower()
        # any() records the page once, however many queries match it.
        if any(query in page_text for query in lowered_queries):
            page_numbers.append(page_number)
    return page_numbers


def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
    """Build a table of contents from the heading lines of every page.

    Every line that starts with ``#`` after stripping whitespace is treated
    as a title. A title that occurs more than once is listed once, with the
    page of its first occurrence.

    Args:
        markdown: The OCR response to scan.

    Returns:
        One ``"<title> - Page <index>"`` line per distinct title, in order of
        first appearance; page indices are zero-based.
    """
    title_to_page_number: dict[str, int] = {}
    for page_number, page in enumerate(markdown.pages):
        for raw_line in page.markdown.split("\n"):
            title = raw_line.strip()
            if title.startswith("#"):
                # setdefault keeps the first page: a dict entry stays at its
                # first-insertion position, so its page number should match.
                title_to_page_number.setdefault(title, page_number)

    return "\n".join(
        f"{title} - Page {page_number}"
        for title, page_number in title_to_page_number.items()
    )


def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
    """Wrap raw markdown in an ``OCRResponse``, one page per top-level section.

    The text is split at every line that begins with ``"# "``. Text before
    the first such heading becomes its own page, and sections that contain
    only whitespace are dropped. Sub-headings (``##``, ``###``, ...) stay
    inside their enclosing section.

    NOTE(review): lines starting with ``"# "`` inside fenced code blocks are
    also treated as headings.

    Args:
        raw_markdown: The markdown text to split.

    Returns:
        An ``OCRResponse`` with an empty model name, zero pages processed in
        its usage info, and pages without images or dimensions.
    """
    sections = [
        section
        for section in _TOP_LEVEL_HEADING.split(raw_markdown)
        if section.strip()
    ]
    return OCRResponse(
        pages=[
            OCRPageObject(index=index, markdown=section, images=[], dimensions=None)
            for index, section in enumerate(sections)
        ],
        usage_info=OCRUsageInfo(pages_processed=0),
        model="",
    )


def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
    """Placeholder; always raises.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Not implemented")


def get_image_description_using_llm(
    base_64_str: str, model: str = "mistral/mistral-small-latest"
) -> str | None:
    """Ask an LLM, through LiteLLM, to describe a base64-encoded JPEG image.

    Args:
        base_64_str: The image as a data URL; must start with
            ``"data:image/jpeg;base64"``.
        model: Model identifier passed to ``litellm.completion``.

    Returns:
        The message content of the first choice, or ``""`` when the request
        is rejected with ``BadRequestError``.

    Raises:
        AssertionError: If ``base_64_str`` does not have the expected prefix.
    """
    # NOTE(review): assert statements are stripped under ``python -O``; kept
    # as an assert so the exception type seen by callers is unchanged.
    assert base_64_str.startswith(
        "data:image/jpeg;base64"
    ), "base_64_str must start with 'data:image/jpeg;base64'"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail:"},
                {"type": "image_url", "image_url": {"url": base_64_str}},
            ],
        }
    ]
    try:
        response = completion(
            model=model,  # LiteLLM naming convention
            messages=messages,
            temperature=0.0,
            stream=False,
        )
    except BadRequestError:
        # Best effort: a rejected request yields an empty description.
        return ""
    return dict(response)["choices"][0].message.content