utils / app.py
not-lain's picture
extract from docx
0bce450
raw
history blame
1.1 kB
import gradio as gr
from pdf2image import convert_from_path
import pdfplumber
from docx import Document
def convert_pdf_to_image(file):
    """Render a PDF as images using ``pdf2image.convert_from_path``.

    Args:
        file: The PDF to render; handed to ``convert_from_path`` unchanged.

    Returns:
        Whatever ``convert_from_path`` produces for ``file`` (presumably one
        image per page -- confirm against the pdf2image documentation).
    """
    return convert_from_path(file)
def extract_text_from_pdf(file):
    """Extract the text of every page in a PDF.

    Args:
        file: The PDF to read; handed to ``pdfplumber.open`` unchanged.

    Returns:
        A single string with the text of each page in page order, every
        page followed by a newline. A page with no extractable text
        contributes only its newline.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() can come back as None for a page without a text
        # layer (observed with older pdfplumber releases, e.g. scanned
        # pages); coalesce to "" so the concatenation never hits None + str.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
def extract_text_from_docx(file):
    """Extract the paragraph text of a DOCX document.

    Args:
        file: Either a path string, or an upload/file object that exposes
            its path through a ``name`` attribute. An object without
            ``name`` is handed to ``Document`` as-is.

    Returns:
        A single string with the text of each paragraph in document order,
        every paragraph followed by a newline. Only ``doc.paragraphs`` is
        read.
    """
    # The upload may arrive as a plain path string or as a wrapper object
    # carrying the path in `.name` (NOTE(review): depends on the Gradio
    # version / File component settings -- confirm). Accept both instead of
    # assuming `.name` exists, which raised AttributeError for plain strings.
    if isinstance(file, str):
        source = file
    else:
        source = getattr(file, "name", file)

    doc = Document(source)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
# One Interface per tool: a file upload as input, and `api_name` naming the
# endpoint for each.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text from DOCX will appear here"),
    api_name="docx_to_text",
)

# Group the three tools as tabs of a single app; tab titles are matched to
# the interfaces by position.
demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, docx_to_text],
    ["PDF to Image", "Extract PDF Text", "Extract DOCX Text"],
)

# Start the app with debug mode enabled.
demo.launch(debug=True)