import os
from pathlib import Path

from litellm import completion
from litellm.exceptions import BadRequestError
from mistralai import Mistral
from mistralai.models import OCRPageObject, OCRResponse, OCRUsageInfo

# Size limit of 49 MiB, expressed in bytes.
# NOTE(review): nothing in the visible code reads this constant; presumably it
# is meant as a cap on the PDF size accepted for upload -- confirm and enforce.
MAX_SIZE_BYTES = 49 * 1024 * 1024


async def convert_pdf_to_markdown_async(
    pdf_path: Path,
    with_image_description: bool = False,
) -> OCRResponse:
    """
    Run Mistral OCR on a local PDF and return the raw OCR response.

    The file is uploaded with purpose "ocr", a signed URL is requested for the
    uploaded file, and that URL is handed to the "mistral-ocr-latest" model
    with base64 image data requested. The API key is read from the
    MISTRAL_API_KEY environment variable.

    Args:
        pdf_path (Path): The path of the pdf file to upload.
        with_image_description (bool): Currently unused by this function; kept
            so existing callers keep working.

    Returns:
        OCRResponse: The response returned by the OCR call.
    """
    mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

    # The context manager guarantees the file handle is closed once the upload
    # finishes, including when the upload raises.
    with open(pdf_path, "rb") as pdf_file:
        uploaded_pdf = await mistral_client.files.upload_async(
            file={
                "file_name": "uploaded_file.pdf",
                "content": pdf_file,
            },
            purpose="ocr",
        )

    signed_url = await mistral_client.files.get_signed_url_async(
        file_id=uploaded_pdf.id
    )

    ocr_response = await mistral_client.ocr.process_async(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": signed_url.url},
        include_image_base64=True,
    )
    # Printed only after the OCR call has returned.
    print(f"Processing PDF: {pdf_path.name}")
    return ocr_response


def convert_ocr_response_to_markdown(ocr_response: OCRResponse) -> str:
    """
    Concatenate the markdown of every page of the OCR response.

    Args:
        ocr_response (OCRResponse): The OCR response whose pages are joined.

    Returns:
        str: The page markdowns in page order, separated by a blank line.
    """
    return "\n\n".join(page.markdown for page in ocr_response.pages)


def get_markdown_by_page_numbers(
    markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False
) -> str:
    """
    Get the markdown of the requested pages, each prefixed by its page number.

    Args:
        markdown (OCRResponse): The markdown of the pdf.
        page_numbers (list[int]): The indexes into ``markdown.pages`` to
            return. Duplicates are returned once.
        get_full_content (bool): When True, ``page_numbers`` is ignored and
            every page is returned.

    Returns:
        str: One "*Page N*" section per selected page, in ascending page
        order, separated by a blank line.

    Raises:
        IndexError: If a page number is outside ``markdown.pages``.
    """
    if get_full_content:
        selected_pages = range(len(markdown.pages))
    else:
        # Deduplicate, then sort: iterating a bare set yields hash order, which
        # is not the page order (e.g. {8, 1} iterates as 8, 1).
        selected_pages = sorted(set(page_numbers))

    return "\n\n".join(
        f"*Page {page_number}*\n{markdown.pages[page_number].markdown}"
        for page_number in selected_pages
    )


def find_in_markdown(
    markdown: OCRResponse, search_queries: list[str] | str
) -> list[int]:
    """
    Find the page numbers of the pdf that contain the search query.

    Matching is a case-insensitive substring test. A page is reported once,
    even when several queries match it.

    Args:
        markdown (OCRResponse): The markdown of the pdf.
        search_queries (list[str] | str): The search queries; a single string
            is treated as a one-element list.

    Returns:
        list[int]: The page numbers of the pdf that contain at least one
        search query, in ascending order.
    """
    if isinstance(search_queries, str):
        search_queries = [search_queries]
    # Lowercase the queries once instead of once per page.
    lowered_queries = [search_query.lower() for search_query in search_queries]

    page_numbers: list[int] = []
    for page_number, page in enumerate(markdown.pages):
        page_text = page.markdown.lower()
        # any() stops at the first hit, so a page is never appended twice.
        if any(query in page_text for query in lowered_queries):
            page_numbers.append(page_number)
    return page_numbers


def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
    """
    Get the table of contents of the pdf.

    Finds all the titles of the pdf to reconstruct the table of contents. A
    title is any line that, once stripped of surrounding whitespace, starts
    with "#". A title that appears several times is listed once, with the
    page of its first occurrence.

    Args:
        markdown (OCRResponse): The markdown of the pdf.

    Returns:
        str: The table of contents of the pdf, one "<title> - Page <n>" line
        per distinct title, in order of first appearance.
    """
    first_page_by_title: dict[str, int] = {}
    for page_number, page in enumerate(markdown.pages):
        for raw_line in page.markdown.split("\n"):
            title = raw_line.strip()
            if title.startswith("#"):
                # setdefault keeps the first page seen for a title; plain
                # assignment would keep the entry at its first position but
                # silently overwrite its page with the last occurrence.
                first_page_by_title.setdefault(title, page_number)

    return "\n".join(
        f"{title} - Page {page_number}"
        for title, page_number in first_page_by_title.items()
    )


def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
    """
    Wrap a raw markdown string in an OCRResponse, one page per top-level section.

    A new page starts at every line that begins with "# ". Text before the
    first such line becomes its own page and is left untouched. Joining the
    page markdowns back together reproduces ``raw_markdown`` exactly.

    Args:
        raw_markdown (str): The markdown text to split into pages.

    Returns:
        OCRResponse: A response with at least one page (a single empty page
        for empty input), no images, zero processed pages in the usage info
        and an empty model name.
    """
    sections: list[str] = []
    current_lines: list[str] = []
    # Splitting on the substring "# " would also cut inside "## Sub" headings
    # and mid-line text, and would leave an empty chunk before the first
    # heading; only a line that *starts* with "# " opens a new page here.
    # NOTE: "# " lines inside fenced code blocks are still treated as headings.
    for line in raw_markdown.splitlines(keepends=True):
        if line.startswith("# ") and current_lines:
            sections.append("".join(current_lines))
            current_lines = []
        current_lines.append(line)
    if current_lines:
        sections.append("".join(current_lines))
    if not sections:
        # Keep the "at least one page" shape for empty input.
        sections = [raw_markdown]

    usage_info_empty = OCRUsageInfo(pages_processed=0)
    return OCRResponse(
        pages=[
            OCRPageObject(index=i, markdown=section, images=[], dimensions=None)
            for i, section in enumerate(sections)
        ],
        usage_info=usage_info_empty,
        model="",
    )


def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
    """
    Placeholder for extracting images from a pdf; not implemented yet.

    Args:
        pdf_path (Path): The path of the pdf file.
        image_ids (list[str]): The ids of the images to get.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Not implemented")


# Previously this helper was defined inside get_images_from_pdf, after the
# unconditional raise, so it could never be reached. It now lives at module
# level so it can actually be called.
def get_image_description_using_llm(
    base_64_str: str, model: str = "mistral/mistral-small-latest"
) -> str | None:
    """
    Ask an LLM to describe an image given as a base64 JPEG data URL.

    Args:
        base_64_str (str): The image, which must start with
            "data:image/jpeg;base64".
        model (str): The model name passed to ``completion``.

    Returns:
        str | None: The message content of the first choice of the response,
        or an empty string when the request fails with BadRequestError.

    Raises:
        ValueError: If ``base_64_str`` does not start with the expected
            JPEG data-URL prefix.
    """
    # Explicit check instead of assert: asserts are stripped under ``python -O``.
    if not base_64_str.startswith("data:image/jpeg;base64"):
        raise ValueError(
            "base_64_str must start with 'data:image/jpeg;base64'"
        )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail:"},
                {"type": "image_url", "image_url": {"url": base_64_str}},
            ],
        }
    ]
    try:
        response = completion(
            model=model,  # LiteLLM naming convention
            messages=messages,
            temperature=0.0,
            stream=False,
        )
        output = dict(response)["choices"][0].message.content
    except BadRequestError:
        # Best effort: a rejected request yields an empty description.
        output = ""
    return output