File size: 4,824 Bytes
4b0678e
d5b5b0f
 
 
5f1077a
c577758
 
 
 
 
 
 
 
 
 
d9c1e67
5f1077a
 
 
 
 
 
 
 
 
 
d99955f
c577758
5d2e8ec
d99955f
c577758
5f1077a
 
ba611cd
 
 
 
 
 
 
0772fb4
 
 
 
 
 
59e60e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9c1e67
 
 
 
 
 
c577758
 
d5b5b0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0f3c82
d5b5b0f
 
 
 
 
 
 
 
 
 
 
5f1077a
c577758
 
 
 
 
 
 
 
 
d5b5b0f
c577758
59e60e9
 
 
5d2e8ec
 
ba611cd
0772fb4
d9c1e67
59e60e9
d5b5b0f
59e60e9
5f1077a
 
fac9a75
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import gradio as gr
import warnings
from typing import List
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf

from base_utils import (
    convert_pdf_to_image,
    extract_text_from_pdf,
    convert_doc_to_text,
    extract_text_from_docx,
    extract_text_from_ppt,
    extract_text_from_pptx,
    sanitize_list_of_lists,
    parse_url,
)

pdf_to_img = gr.Interface(
    convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
)
pdf_to_text = gr.Interface(
    extract_text_from_pdf,
    gr.File(),
    gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    convert_doc_to_text, gr.File(), gr.Textbox(), api_name="doc_to_text"
)
docx_to_text = gr.Interface(
    extract_text_from_docx, gr.File(), gr.Textbox(), api_name="docx_to_text"
)

ppt_to_text = gr.Interface(
    extract_text_from_ppt,
    gr.File(),
    gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    extract_text_from_pptx,
    gr.File(),
    gr.Textbox(),
    api_name="pptx_to_text",
)
str_to_json = gr.Interface(
    sanitize_list_of_lists,
    gr.Text(),
    gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
  ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
  ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
  ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
  ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
  ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
  ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
  ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
  ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

url_parser = gr.Interface(
    parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)


class FileNotConvertedWarning(Warning):
    """The file was not in one of the specified formats for conversion to PDF"""

    pass


def to_pdf(files: List[str]) -> List[str]:
    pdfs = []
    for f in files:
        if f.endswith(".docx"):
            newfile = f.replace(".docx", ".pdf")
            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        elif f.endswith(".pdf"):
            pdfs.append(f)
        elif f.endswith(".html"):
            newfile = f.replace(".html", ".pdf")
            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        elif f.endswith(".pptx"):
            newfile = f.replace(".pptx", ".pdf")
            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        elif f.endswith(".csv"):
            newfile = f.replace(".csv", ".pdf")
            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        elif f.endswith(".xml"):
            newfile = f.replace(".xml", ".pdf")
            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        elif f.endswith(".md"):
            newfile = f.replace(".md", ".pdf")
            file_to_add = convert_markdown_to_pdf(f, newfile, newfile.split(".")[0])
            pdfs.append(file_to_add)
        else:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue
    return pdfs


def convert(file: str) -> str:
    files = [file]
    pdfs = to_pdf(files)
    return pdfs


pdf_converter = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    api_name="convert_to_pdf",
)

demo = gr.TabbedInterface(
    [
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        pdf_converter,
    ],
    [
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Convert to PDF",
    ],
)

demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)