lincolnlegalbart

Sleeping

App Files Files Community

Ari committed on Sep 5, 2024

Commit

ac28e59

verified ·

1 Parent(s): 170c2bc

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -5

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from fpdf import FPDF
 from gtts import gTTS
 from pdfminer.high_level import extract_text
 nltk.download('punkt')
@@ -12,13 +13,35 @@ nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 # Main processing function
 def pdf_to_text(text, PDF, min_length=20):
     try:
-        # Extract text from PDF if no input text provided
-        if text == "":
             text = extract_text(PDF.name)
         # Tokenize text
         inputs = tokenizer([text], max_length=1024, return_tensors="pt")
         min_length = int(min_length)
@@ -48,9 +71,9 @@ def pdf_to_text(text, PDF, min_length=20):
 # Gradio interface
 iface = gr.Interface(
     fn=pdf_to_text,
-    inputs=[gr.Textbox(label="Input Text"), gr.File(label="Upload PDF"), gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")],
     outputs=[gr.Audio(label="Generated Audio"), gr.Textbox(label="Generated Summary"), gr.File(label="Summary PDF")]
 )
 if __name__ == "__main__":
-    iface.launch()

 from fpdf import FPDF
 from gtts import gTTS
 from pdfminer.high_level import extract_text
+from docx import Document
 nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+# Function to convert DOCX to PDF
+#
+# Reads the paragraphs of `docx_file` with python-docx, joins their text with
+# newlines, and writes the result onto a single-font PDF via FPDF.
+#
+# Parameters:
+#   docx_file  -- path (or whatever `Document` accepts) of the DOCX to read.
+#   output_pdf -- destination path for the generated PDF. The default is a
+#                 fixed file name, so every call that relies on it writes to
+#                 the same "converted_doc.pdf".
+# Returns:
+#   The `output_pdf` path that was written.
+def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
+    doc = Document(docx_file)
+    # Only `doc.paragraphs` is walked; one entry per paragraph, in order.
+    # NOTE(review): text held in tables/headers is presumably not included -- confirm.
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+    # Create a PDF and write the extracted text
+    pdf = FPDF()
+    pdf.add_page()
+    # NOTE(review): "Times" looks like a built-in (non-Unicode) font; text with
+    # characters outside its encoding may fail on output -- confirm against the
+    # installed fpdf version.
+    pdf.set_font("Times", size=12)
+    # Cell width 190, line height 10; align='C' requests centred text.
+    pdf.multi_cell(190, 10, txt="\n".join(full_text), align='C')
+    pdf.output(output_pdf)
+    return output_pdf
 # Main processing function
 def pdf_to_text(text, PDF, min_length=20):
     try:
+        # Determine whether the input is a PDF or DOCX
+        file_extension = os.path.splitext(PDF.name)[1].lower()
+        # If DOCX, first convert it to PDF
+        if file_extension == '.docx':
+            pdf_file_path = docx_to_pdf(PDF.name)  # Convert DOCX to PDF
+            text = extract_text(pdf_file_path)  # Extract text from the newly created PDF
+        # If PDF, extract text from it directly
+        elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
         # Tokenize text
         inputs = tokenizer([text], max_length=1024, return_tensors="pt")
         min_length = int(min_length)
 # Gradio interface
+# The three input components are passed positionally to
+# pdf_to_text(text, PDF, min_length): the textbox supplies `text`, the file
+# upload supplies `PDF`, and the slider (10-100 in steps of 10, default 20)
+# supplies `min_length`.
 iface = gr.Interface(
     fn=pdf_to_text,
+    inputs=[gr.Textbox(label="Input Text"), gr.File(label="Upload PDF or DOCX"), gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")],
     outputs=[gr.Audio(label="Generated Audio"), gr.Textbox(label="Generated Summary"), gr.File(label="Summary PDF")]
 )
+# Start the web UI only when this file is run as a script, not on import.
 if __name__ == "__main__":
+    iface.launch()