File size: 4,824 Bytes
4b0678e d5b5b0f 5f1077a c577758 d9c1e67 5f1077a d99955f c577758 5d2e8ec d99955f c577758 5f1077a ba611cd 0772fb4 59e60e9 d9c1e67 c577758 d5b5b0f e0f3c82 d5b5b0f 5f1077a c577758 d5b5b0f c577758 59e60e9 5d2e8ec ba611cd 0772fb4 d9c1e67 59e60e9 d5b5b0f 59e60e9 5f1077a fac9a75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import gradio as gr
import warnings
from typing import List
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
from base_utils import (
convert_pdf_to_image,
extract_text_from_pdf,
convert_doc_to_text,
extract_text_from_docx,
extract_text_from_ppt,
extract_text_from_pptx,
sanitize_list_of_lists,
parse_url,
)
# PDF tabs. Both take an uploaded file; `convert_pdf_to_image` feeds a gallery
# and `extract_text_from_pdf` feeds a textbox. Each is also exposed on the API
# under the given `api_name`.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
# Office-document tabs. Every interface has the same shape: an uploaded file
# goes to the matching helper from `base_utils`, and the helper's result is
# shown in a plain textbox.
doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
# Text-in / JSON-out tab backed by `sanitize_list_of_lists`. The single example
# is a list of [question, answer] pairs written as a string; it is offered to
# the user as a ready-made input.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)
# Text-in / text-out tab backed by `parse_url`. Note that the API endpoint name
# ("url_to_text") differs from the variable name.
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
class FileNotConvertedWarning(Warning):
    """Warning category for a file whose format cannot be converted to PDF.

    Issued by ``to_pdf`` when an input does not end in one of the extensions
    it knows how to handle; the file is skipped rather than raising.
    """
def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in *files* to PDF.

    Paths ending in ``.pdf`` are passed through untouched. Paths ending in
    ``.docx``, ``.html``, ``.pptx``, ``.csv`` or ``.xml`` are handed to
    ``convert_to_pdf``; paths ending in ``.md`` go to
    ``convert_markdown_to_pdf``. Anything else is skipped and reported with a
    ``FileNotConvertedWarning``.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per supported input, in input order: the original path for
        PDFs, otherwise the value returned by the converter.
    """
    generic_suffixes = (".docx", ".html", ".pptx", ".csv", ".xml")

    pdfs = []
    for f in files:
        if f.endswith(".pdf"):
            # Already a PDF: nothing to convert.
            pdfs.append(f)
            continue

        if f.endswith(".md"):
            suffix = ".md"
            converter = convert_markdown_to_pdf
        else:
            suffix = next((s for s in generic_suffixes if f.endswith(s)), None)
            if suffix is None:
                warnings.warn(
                    f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                    FileNotConvertedWarning,
                )
                continue
            converter = convert_to_pdf

        # Strip only the trailing extension. `str.replace` would also rewrite
        # the same text earlier in the path, and splitting on the first "."
        # truncates paths such as "./report.docx" or "/tmp/v1.2/report.docx".
        stem = f[: -len(suffix)]
        # Third argument is the extension-less path, as in the original call
        # (presumably used as the document title -- confirm against pdfitdown).
        pdfs.append(converter(f, stem + ".pdf", stem))
    return pdfs
def convert(file: str) -> List[str]:
    """Convert a single file to PDF by delegating to ``to_pdf``.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one input: a single entry
        when the file's extension is supported, an empty list when it was
        skipped.
    """
    return to_pdf([file])
# File-in / file-out tab backed by `convert`. The description enumerates the
# same extensions that `to_pdf` accepts.
pdf_converter = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
    title="File to PDF Converter",
    description=(
        "Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF."
    ),
)
# Assemble every interface into one tabbed app. The two lists are parallel:
# the i-th label in `tab_names` is the tab title for the i-th interface.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        pdf_converter,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Convert to PDF",
    ],
)
# Serve on port 7860, listening on all IPv4 interfaces. The host must be the
# literal "0.0.0.0"; the previous value carried a stray trailing dot
# ("0.0.0.0."), which is not a valid IPv4 address.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|