Spaces:

not-lain
/

utils

Running

File size: 4,824 Bytes

import os
import warnings
from typing import List

import gradio as gr
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf

from base_utils import (
    convert_pdf_to_image,
    extract_text_from_pdf,
    convert_doc_to_text,
    extract_text_from_docx,
    extract_text_from_ppt,
    extract_text_from_pptx,
    sanitize_list_of_lists,
    parse_url,
)

# One gr.Interface per helper imported from base_utils. Each interface is
# registered under the `api_name` given here.

# --- PDF inputs -----------------------------------------------------------
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

# --- Word documents -------------------------------------------------------
doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)
docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

# --- Presentations --------------------------------------------------------
ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)
pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)

# --- Text to JSON ---------------------------------------------------------
# The single example is a list of [question, answer] pairs passed as a string.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
  ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
  ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
  ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
  ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
  ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
  ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
  ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
  ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

# --- URL parsing ----------------------------------------------------------
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)


class FileNotConvertedWarning(Warning):
    """Warning category for files skipped because their format cannot be converted to PDF."""


def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in *files* to PDF and collect the results.

    Routing is decided by the file extension, matched case-insensitively:

    * ``.pdf`` files are passed through unchanged.
    * ``.md`` files are handed to ``convert_markdown_to_pdf``.
    * ``.docx``, ``.html``, ``.pptx``, ``.csv`` and ``.xml`` files are handed
      to ``convert_to_pdf``.
    * Anything else emits a ``FileNotConvertedWarning`` and is skipped.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per accepted input, in input order: the original path for
        PDFs, otherwise the value returned by the converter for the target
        ``<path without extension>.pdf``.
    """
    pdfs = []
    for f in files:
        # splitext strips only the final suffix, so dots (or extension-like
        # text) elsewhere in the path cannot corrupt the output name.
        stem, ext = os.path.splitext(f)
        ext = ext.lower()

        if ext == ".pdf":
            pdfs.append(f)
            continue

        if ext == ".md":
            converter = convert_markdown_to_pdf
        elif ext in (".docx", ".html", ".pptx", ".csv", ".xml"):
            converter = convert_to_pdf
        else:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Third argument keeps the existing convention of passing the path
        # without its extension (presumably used as the document title --
        # confirm against the pdfitdown API).
        pdfs.append(converter(f, stem + ".pdf", stem))
    return pdfs


def convert(file: str) -> List[str]:
    """Convert a single uploaded file to PDF.

    Args:
        file: Path of the uploaded file. A falsy value (``None`` or an empty
            string, e.g. when nothing was uploaded) is treated as "no file".

    Returns:
        The list produced by ``to_pdf`` for this one file: a single entry
        when the file was accepted, or an empty list when there was no file
        or its format is not supported.
    """
    # Guard first: passing None on to to_pdf would fail on the extension check.
    if not file:
        return []
    return to_pdf([file])


# Upload form that runs `convert` on one file and offers the result for download.
pdf_converter = gr.Interface(
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)

# Combine all interfaces into one tabbed app; the two lists are positional
# pairs, so the n-th tab name labels the n-th interface.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        pdf_converter,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Convert to PDF",
    ],
)

# Start the server on port 7860, listening on all IPv4 interfaces.
# The address must be exactly "0.0.0.0"; a trailing dot is not a valid IPv4 literal.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)