mrsk1883 committed on
Commit
67d721c
1 Parent(s): 6fbb403

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -20
app.py CHANGED
@@ -4,6 +4,7 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
  from io import BytesIO
6
  import re
 
7
 
8
  model_name = "pszemraj/led-base-book-summary"
9
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -16,37 +17,58 @@ def extract_first_sentence(text):
16
  else:
17
  return text
18
 
19
- def summarize_pdf_abstract(pdf_file):
20
  try:
21
- reader = PdfReader(pdf_file)
22
- abstract_text = ""
23
- for page in reader.pages:
24
- if "Abstract" in page.extract_text() or "Introduction" in page.extract_text():
25
- abstract_text = page.extract_text()
26
- break
27
 
28
- inputs = tokenizer(abstract_text, return_tensors="pt")
29
- outputs = model.generate(**inputs)
30
- summary = tokenizer.decode(outputs[0])
31
 
32
- # Extract only the first sentence
33
- summary_sentence = extract_first_sentence(summary)
34
 
35
- # Generate audio
36
- speech = gTTS(text=summary_sentence, lang="en")
37
- speech_bytes = BytesIO()
38
- speech.write_to_fp(speech_bytes)
39
 
40
- # Return individual output values
41
- return summary_sentence, speech_bytes.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  except Exception as e:
44
  raise Exception(str(e))
45
 
46
  interface = gr.Interface(
47
- fn=summarize_pdf_abstract,
48
  inputs=[gr.File(label="Upload PDF")],
49
  outputs=[gr.Textbox(label="Summary"), gr.Audio()],
 
 
 
50
  )
51
 
52
- interface.launch(share=True)
 
4
  from gtts import gTTS
5
  from io import BytesIO
6
  import re
7
+ import os
8
 
9
  model_name = "pszemraj/led-base-book-summary"
10
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
17
  else:
18
  return text
19
 
20
+ def extract_abstract_and_summarize(pdf_file):
21
  try:
22
+ with open(pdf_file, 'rb') as file:
23
+ pdf_reader = PdfReader(file)
24
+ abstract_text = ''
 
 
 
25
 
26
+ for page_num in range(len(pdf_reader.pages)):
27
+ page = pdf_reader.pages[page_num]
28
+ text = page.extract_text()
29
 
30
+ abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
 
31
 
32
+ if abstract_match:
33
+ start_index = abstract_match.end()
 
 
34
 
35
+ # Check for the next heading or section marker
36
+ next_section_match = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', text[start_index:])
37
+
38
+ if next_section_match:
39
+ end_index = start_index + next_section_match.start()
40
+ abstract_text = text[start_index:end_index]
41
+ else:
42
+ abstract_text = text[start_index:]
43
+
44
+ break # Exit loop once abstract is found
45
+
46
+ # Summarize the extracted abstract
47
+ inputs = tokenizer(abstract_text, return_tensors="pt")
48
+ outputs = model.generate(**inputs)
49
+ summary = tokenizer.decode(outputs[0])
50
+
51
+ # Extract only the first sentence
52
+ summary_sentence = extract_first_sentence(summary)
53
+
54
+ # Generate audio
55
+ speech = gTTS(text=summary_sentence, lang="en")
56
+ speech_bytes = BytesIO()
57
+ speech.write_to_fp(speech_bytes)
58
+
59
+ # Return individual output values
60
+ return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
61
 
62
  except Exception as e:
63
  raise Exception(str(e))
64
 
65
  interface = gr.Interface(
66
+ fn=extract_abstract_and_summarize,
67
  inputs=[gr.File(label="Upload PDF")],
68
  outputs=[gr.Textbox(label="Summary"), gr.Audio()],
69
+ title="PDF Summarization & Audio Tool",
70
+ description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
71
+ Please read the README.MD for information about the app and sample PDFs.""",
72
  )
73
 
74
+ interface.launch(share=True)