Spaces:

mrsk1883
/

testingwspace

Sleeping

App Files Community

mrsk1883 committed on Dec 9, 2023

Commit

67d721c

•

1 Parent(s): 6fbb403

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -20

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from gtts import gTTS
 from io import BytesIO
 import re
 model_name = "pszemraj/led-base-book-summary"
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -16,37 +17,58 @@ def extract_first_sentence(text):
     else:
         return text
-def summarize_pdf_abstract(pdf_file):
     try:
-        reader = PdfReader(pdf_file)
-        abstract_text = ""
-        for page in reader.pages:
-            if "Abstract" in page.extract_text() or "Introduction" in page.extract_text():
-                abstract_text = page.extract_text()
-                break
-        inputs = tokenizer(abstract_text, return_tensors="pt")
-        outputs = model.generate(**inputs)
-        summary = tokenizer.decode(outputs[0])
-        # Extract only the first sentence
-        summary_sentence = extract_first_sentence(summary)
-        # Generate audio
-        speech = gTTS(text=summary_sentence, lang="en")
-        speech_bytes = BytesIO()
-        speech.write_to_fp(speech_bytes)
-        # Return individual output values
-        return summary_sentence, speech_bytes.getvalue()
     except Exception as e:
         raise Exception(str(e))
 interface = gr.Interface(
-    fn=summarize_pdf_abstract,
     inputs=[gr.File(label="Upload PDF")],
     outputs=[gr.Textbox(label="Summary"), gr.Audio()],
 )
-interface.launch(share=True)

 from gtts import gTTS
 from io import BytesIO
 import re
+import os
 model_name = "pszemraj/led-base-book-summary"
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
     else:
         return text
+def extract_abstract_and_summarize(pdf_file):
     try:
+        with open(pdf_file, 'rb') as file:
+            pdf_reader = PdfReader(file)
+            abstract_text = ''
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
+                text = page.extract_text()
+                abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
+                if abstract_match:
+                    start_index = abstract_match.end()
+                    # Check for the next heading or section marker
+                    next_section_match = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', text[start_index:])
+                    if next_section_match:
+                        end_index = start_index + next_section_match.start()
+                        abstract_text = text[start_index:end_index]
+                    else:
+                        abstract_text = text[start_index:]
+                    break  # Exit loop once abstract is found
+            # Summarize the extracted abstract
+            inputs = tokenizer(abstract_text, return_tensors="pt")
+            outputs = model.generate(**inputs)
+            summary = tokenizer.decode(outputs[0])
+            # Extract only the first sentence
+            summary_sentence = extract_first_sentence(summary)
+            # Generate audio
+            speech = gTTS(text=summary_sentence, lang="en")
+            speech_bytes = BytesIO()
+            speech.write_to_fp(speech_bytes)
+            # Return individual output values
+            return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
     except Exception as e:
         raise Exception(str(e))
 interface = gr.Interface(
+    fn=extract_abstract_and_summarize,
     inputs=[gr.File(label="Upload PDF")],
     outputs=[gr.Textbox(label="Summary"), gr.Audio()],
+    title="PDF Summarization & Audio Tool",
+    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
+    Please read the README.MD for information about the app and sample PDFs.""",
 )
+interface.launch(share=True)