add string sanitization
Browse files- app.py +65 -6
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import pdfplumber
|
|
| 5 |
from docx import Document
|
| 6 |
import subprocess
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def extract_text_from_pptx(file_path):
|
|
@@ -20,11 +21,12 @@ def extract_text_from_pptx(file_path):
|
|
| 20 |
|
| 21 |
return "\n\n".join(text_content)
|
| 22 |
|
|
|
|
| 23 |
def extract_text_from_ppt(file_path):
|
| 24 |
try:
|
| 25 |
# Convert PPT to PPTX using unoconv
|
| 26 |
-
pptx_file_path = os.path.splitext(file_path)[0] +
|
| 27 |
-
subprocess.run([
|
| 28 |
|
| 29 |
# Extract text from PPTX
|
| 30 |
presentation = Presentation(pptx_file_path)
|
|
@@ -45,10 +47,11 @@ def extract_text_from_ppt(file_path):
|
|
| 45 |
print(f"Error extracting text from PPT file: {e}")
|
| 46 |
return "Error extracting text from PPT file"
|
| 47 |
|
|
|
|
| 48 |
def extract_text_from_ppt_or_pptx(file_path):
|
| 49 |
-
if file_path.endswith(
|
| 50 |
return extract_text_from_pptx(file_path)
|
| 51 |
-
elif file_path.endswith(
|
| 52 |
return extract_text_from_ppt(file_path)
|
| 53 |
else:
|
| 54 |
return "Unsupported file type. Please provide a .ppt or .pptx file."
|
|
@@ -103,6 +106,37 @@ def extract_text_from_doc_or_docx(file):
|
|
| 103 |
return "Unsupported file type. Please upload a .doc or .docx file."
|
| 104 |
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
pdf_to_img = gr.Interface(
|
| 107 |
convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
|
| 108 |
)
|
|
@@ -127,9 +161,34 @@ pptx_or_ppt_to_text = gr.Interface(
|
|
| 127 |
api_name="pptx_or_ppt_to_text",
|
| 128 |
)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
demo = gr.TabbedInterface(
|
| 131 |
-
[pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
|
| 132 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
|
| 135 |
demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
|
|
|
|
| 5 |
from docx import Document
|
| 6 |
import subprocess
|
| 7 |
import os
|
| 8 |
+
from typing import Optional, List
|
| 9 |
|
| 10 |
|
| 11 |
def extract_text_from_pptx(file_path):
|
|
|
|
| 21 |
|
| 22 |
return "\n\n".join(text_content)
|
| 23 |
|
| 24 |
+
|
| 25 |
def extract_text_from_ppt(file_path):
|
| 26 |
try:
|
| 27 |
# Convert PPT to PPTX using unoconv
|
| 28 |
+
pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
|
| 29 |
+
subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
|
| 30 |
|
| 31 |
# Extract text from PPTX
|
| 32 |
presentation = Presentation(pptx_file_path)
|
|
|
|
| 47 |
print(f"Error extracting text from PPT file: {e}")
|
| 48 |
return "Error extracting text from PPT file"
|
| 49 |
|
| 50 |
+
|
| 51 |
def extract_text_from_ppt_or_pptx(file_path):
    """Route a PowerPoint file to the extractor that matches its extension.

    Args:
        file_path: Path of the uploaded presentation, as a string.

    Returns:
        Whatever ``extract_text_from_pptx`` (for ``.pptx``) or
        ``extract_text_from_ppt`` (for ``.ppt``) returns, or a fixed
        "unsupported file type" message for any other extension.
    """
    # Compare on a lowercased copy so names such as "DECK.PPTX" are accepted;
    # the original path is still what gets handed to the extractors.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pptx"):
        return extract_text_from_pptx(file_path)
    if lowered_path.endswith(".ppt"):
        return extract_text_from_ppt(file_path)
    return "Unsupported file type. Please provide a .ppt or .pptx file."
|
|
|
|
| 106 |
return "Unsupported file type. Please upload a .doc or .docx file."
|
| 107 |
|
| 108 |
|
| 109 |
+
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Extract ``[front, back]`` pairs from text and return them as dicts.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    so the list may be embedded in surrounding chatter. The remaining text is
    parsed as JSON first and, failing that, as a Python literal (which also
    accepts single-quoted strings). The text is never executed as code.

    Args:
        text: Raw string expected to contain a list of two-item lists.

    Returns:
        A list of ``{"front": ..., "back": ...}`` dicts, one per pair. If an
        item cannot be unpacked into exactly two values, the pairs collected
        before it are returned, or ``None`` when there are none. ``None`` is
        also returned when no bracketed list is found, when parsing fails, or
        when the parsed value is not a list. Failures are reported via
        ``print``.
    """
    # Function-scope imports keep this block self-contained.
    import ast
    import json

    if not isinstance(text, str):
        print("Error parsing the list of lists: input is not a string")
        return None

    left = text.find("[")
    right = text.rfind("]")
    if left == -1 or right < left:
        print("Error parsing the list of lists: no bracketed list found")
        return None
    payload = text[left : right + 1]

    # SECURITY: this input comes straight from a public text box. It was
    # previously passed to eval(), which allows arbitrary code execution;
    # only data-only parsers are used here.
    try:
        parsed = json.loads(payload)
    except (ValueError, RecursionError):
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            print(f"Error parsing the list of lists: {e}")
            return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in parsed:
            out.append({"front": front, "back": back})
    except (TypeError, ValueError) as e:
        # The expected schema is not respected from this item onwards:
        # keep whatever was already parsed, or signal failure with None.
        print(e)
        return out or None
    return out
|
| 138 |
+
|
| 139 |
+
|
| 140 |
pdf_to_img = gr.Interface(
|
| 141 |
convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
|
| 142 |
)
|
|
|
|
| 161 |
api_name="pptx_or_ppt_to_text",
|
| 162 |
)
|
| 163 |
|
| 164 |
+
str_to_json = gr.Interface(
|
| 165 |
+
sanitize_list_of_lists,
|
| 166 |
+
gr.Text(),
|
| 167 |
+
gr.JSON(),
|
| 168 |
+
api_name="str_to_json",
|
| 169 |
+
examples=[
|
| 170 |
+
"""[
|
| 171 |
+
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
|
| 172 |
+
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
|
| 173 |
+
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
|
| 174 |
+
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
|
| 175 |
+
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
|
| 176 |
+
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
|
| 177 |
+
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
|
| 178 |
+
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
|
| 179 |
+
]"""
|
| 180 |
+
],
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
demo = gr.TabbedInterface(
|
| 184 |
+
[pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
|
| 185 |
+
[
|
| 186 |
+
"PDF to Image",
|
| 187 |
+
"Extract PDF Text",
|
| 188 |
+
"Extract DOC/DOCX Text",
|
| 189 |
+
"Extract PPTX/PPT Text",
|
| 190 |
+
"Extract Json",
|
| 191 |
+
],
|
| 192 |
)
|
| 193 |
|
| 194 |
# Bind to all interfaces; the address must be exactly "0.0.0.0" (no trailing dot).
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
pdf2image
|
| 2 |
gradio
|
| 3 |
pdfplumber
|
|
|
|
| 1 |
+
typing
|
| 2 |
pdf2image
|
| 3 |
gradio
|
| 4 |
pdfplumber
|