import os
import random
import re
import string
import subprocess
from typing import List, Optional

import gradio as gr
import pdfplumber
from docx import Document
from pdf2image import convert_from_path
from pptx import Presentation


def extract_text_from_pptx(file_path):
    """Return the text of every slide in a .pptx presentation.

    Args:
        file_path: Path of the .pptx file, passed straight to ``Presentation``.

    Returns:
        One string in which the texts of the shapes on a slide are joined by
        a newline and consecutive slides are separated by a blank line.
        Shapes that do not expose a ``text`` attribute are skipped.
    """
    prs = Presentation(file_path)
    slides_text = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in prs.slides
    ]
    return "\n\n".join(slides_text)


def extract_text_from_ppt(file_path):
    """Return the text of a legacy .ppt file by converting it to .pptx first.

    The conversion is delegated to the external ``unoconv`` command; the
    converted file is expected next to the input, with a ``.pptx`` extension.

    Args:
        file_path: Path of the .ppt file.

    Returns:
        The extracted text, or the literal string
        ``"Error extracting text from PPT file"`` when anything goes wrong
        (the underlying error is printed).
    """
    pptx_file_path = None
    try:
        pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
        return extract_text_from_pptx(pptx_file_path)
    except Exception as e:
        # Deliberate best-effort boundary: the caller gets a message, not a crash.
        print(f"Error extracting text from PPT file: {e}")
        return "Error extracting text from PPT file"
    finally:
        # Remove the intermediate file even when extraction failed after a
        # successful conversion, so failed requests do not leave files behind.
        if pptx_file_path is not None and os.path.exists(pptx_file_path):
            try:
                os.remove(pptx_file_path)
            except OSError as e:
                print(f"Could not remove temporary file {pptx_file_path}: {e}")


def extract_text_from_ppt_or_pptx(file_path):
    """Extract text from a .ppt or .pptx upload, dispatching on the extension.

    Args:
        file_path: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (the form the DOC/DOCX
            handlers in this file read from their upload argument).

    Returns:
        The extracted text, or an explanatory message when the extension is
        neither ``.ppt`` nor ``.pptx``.
    """
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = file_path.name

    # Compare case-insensitively so "SLIDES.PPTX" is not rejected.
    lowered = path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(path)
    return "Unsupported file type. Please provide a .ppt or .pptx file."
def _upload_path(file):
    """Return the filesystem path of an upload.

    Accepts a path (``str`` / ``os.PathLike``) or an object that exposes the
    path as ``.name``, so every handler below works with either form.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def convert_pdf_to_image(file):
    """Render the pages of a PDF upload with ``convert_from_path`` and return the result."""
    return convert_from_path(_upload_path(file))


def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one trailing newline per page.

    Args:
        file: Whatever ``pdfplumber.open`` accepts; passed through unchanged.

    Returns:
        The concatenated page texts. A page for which ``extract_text()``
        yields ``None`` contributes only its newline instead of raising
        ``TypeError``.
    """
    with pdfplumber.open(file) as pdf:
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


def extract_text_from_docx(file):
    """Return the paragraphs of a .docx upload, each followed by a newline."""
    doc = Document(_upload_path(file))
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)


def convert_doc_to_text(doc_path):
    """Convert a legacy .doc file to text with ``unoconv`` and return the text.

    Args:
        doc_path: Path of the .doc file. The converted file is expected next
            to it, with the extension replaced by ``.txt``.

    Returns:
        The converted text without a leading byte-order mark, or ``""`` when
        ``unoconv`` exits with an error (the error is printed).
    """
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the extension; str.replace(".doc", ".txt") would also rewrite
    # any other ".doc" occurring earlier in the path.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        with open(txt_file_path, "r") as f:
            text = f.read()
    finally:
        # Never leave the intermediate file behind, even if reading failed.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")


def extract_text_from_doc_or_docx(file):
    """Extract text from a .doc or .docx upload, dispatching on the extension.

    Args:
        file: A path, or an object exposing the path as ``.name``.

    Returns:
        The extracted text, or an explanatory message when the extension is
        neither ``.doc`` nor ``.docx``.
    """
    path = _upload_path(file)
    # Compare case-insensitively so "REPORT.DOCX" is not rejected.
    lowered = path.lower()
    if lowered.endswith(".docx"):
        return extract_text_from_docx(path)
    if lowered.endswith(".doc"):
        return convert_doc_to_text(path)
    return "Unsupported file type. Please upload a .doc or .docx file."


def generate_random_string(length=23):
    """Return ``length`` random characters drawn from ASCII letters and digits."""
    characters = string.ascii_letters + string.digits
    return "".join(random.choice(characters) for _ in range(length))


# A run of two or more underscores marks a blank to fill in (a "cloze").
_CLOZE_BLANK = re.compile(r"_{2,}")


def handle_json_output(json_list: list):
    """Add ``frontHTML``, ``backHTML`` and ``termType`` to every card.

    Each element must be a dict with ``frontText`` and ``backText``. A card
    whose front contains one or two blanks (runs of two or more underscores)
    becomes a ``"cloze"`` card: the back text is injected into the blanks of
    the front and the back is emptied. Every other card is ``"basic"``.

    Args:
        json_list: Cards to enrich; the dicts are modified in place.

    Returns:
        The same list object that was passed in.
    """
    for element in json_list:
        # NOTE(review): neither random string is referenced by the templates
        # below, and those templates contain nothing but newlines. Presumably
        # the HTML markup (with these strings as ids) was lost from this file;
        # restore it from the canonical source before relying on
        # frontHTML/backHTML.
        random_string1 = generate_random_string()
        random_string2 = generate_random_string()

        front = element["frontText"]
        back = element["backText"]

        element["frontHTML"] = (
            "\n\n"
            f"\n\n{front}\n\n"
        )
        element["backHTML"] = (
            "\n\n"
            f"\n\n{back}\n\n"
        )
        element["termType"] = "basic"

        # NOTE(review): the original comment said only a single blank should
        # be handled, but the condition has always accepted up to two; the
        # condition is kept as written.
        blank_count = len(_CLOZE_BLANK.findall(front))
        if not 0 < blank_count <= 2:
            continue

        element["termType"] = "cloze"

        # Callables are used as the replacement so that backslashes or group
        # references inside `back` are inserted literally.
        front_html = _CLOZE_BLANK.sub(lambda _match: f"\n\n{back}\n\n", front)
        element["frontHTML"] = (
            "\n\n"
            f"\n\n{front_html}\n\n"
        )
        element["frontText"] = _CLOZE_BLANK.sub(lambda _match: f" {back} ", front)
        element["backText"] = ""
        element["backHTML"] = (
            "\n\n"
            "\n\n"
        )
    return json_list


def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Parse a textual list of ``[front, back]`` pairs into card dicts.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    the remainder is parsed as a Python literal, and each pair becomes a card
    that is then enriched by ``handle_json_output``.

    Args:
        text: Free text that contains a list of two-item lists.

    Returns:
        The list of card dicts. If an item is malformed, the cards parsed
        before it are returned. ``None`` is returned when nothing usable
        could be parsed.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    left = text.find("[")
    right = text.rfind("]")
    if left == -1 or right < left:
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text comes from a network-exposed endpoint, so it is
        # parsed with ast.literal_eval (literals only) instead of eval(),
        # which would execute arbitrary expressions sent by a client.
        parsed = ast.literal_eval(text[left : right + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in parsed:
            out.append({"frontText": front, "backText": back})
    except (TypeError, ValueError) as e:
        # An item did not unpack into exactly two values: keep what was
        # already parsed, or give up if nothing was.
        print(e)
        if not out:
            return None

    try:
        return handle_json_output(out)
    except Exception as e:
        # Endpoint boundary: report the problem and signal failure with None.
        print(f"Error parsing the list of lists: {e}")
        return None


# One Gradio interface per conversion; api_name is the route each is served under.
pdf_to_img = gr.Interface(
    convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
)

pdf_to_text = gr.Interface(
    extract_text_from_pdf,
    gr.File(),
    gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    extract_text_from_doc_or_docx,
    gr.File(),
    gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
    api_name="doc_or_docx_to_text",
)

pptx_or_ppt_to_text = gr.Interface(
    extract_text_from_ppt_or_pptx,
    gr.File(),
    gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
    api_name="pptx_or_ppt_to_text",
)

str_to_json = gr.Interface(
    sanitize_list_of_lists,
    gr.Text(),
    gr.JSON(),
    api_name="str_to_json",
    examples=[
        # Each pair stays on one line: a line break inside a quoted item
        # would make the example unparseable.
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
    [
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC/DOCX Text",
        "Extract PPTX/PPT Text",
        "Extract Json",
    ],
)

# "0.0.0.0" binds every interface; the previous value had a stray trailing dot.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)