mrsk1883 committed on
Commit
97b49ea
1 Parent(s): 1636cb6

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +58 -0
utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ from gtts import gTTS
4
+ import os
5
+
6
+ # Load the pretrained summarization model and its tokenizer once, at module level; model_name is the checkpoint identifier passed to both from_pretrained calls.
7
+ model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
8
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+
11
def summarize_and_speak_pdf_abstract(
    pdf_path, *, language="en", audio_file_name="summary.mp3"
):
    """Summarize a PDF's abstract and save the summary as spoken audio.

    The summary text is obtained from ``summarize_pdf_abstract(pdf_path)``,
    converted to speech with gTTS, and written to ``audio_file_name``.

    Args:
        pdf_path: Path to the PDF file.
        language: Language code handed to gTTS. Defaults to ``"en"``.
        audio_file_name: Path of the audio file to write. Defaults to
            ``"summary.mp3"``, the name this function previously hard-coded.

    Returns:
        The path of the audio file that was written.

    Raises:
        ValueError: If the summary is empty or whitespace-only, so there is
            nothing to speak.
    """
    summary = summarize_pdf_abstract(pdf_path)

    # Fail early with a clear message instead of handing blank text to the
    # text-to-speech call.
    if not summary or not summary.strip():
        raise ValueError(f"No summary text could be produced for {pdf_path!r}")

    tts = gTTS(text=summary, lang=language)
    tts.save(audio_file_name)

    print(f"Audio file created: {audio_file_name}")
    return audio_file_name
37
+
38
+
39
+ def summarize_pdf_abstract(pdf_path):
40
+ """
41
+ Reads a PDF file, extracts the abstract, and summarizes it in one sentence.
42
+
43
+ Args:
44
+ pdf_path: Path to the PDF file.
45
+
46
+ Returns:
47
+ A string containing the one-sentence summary of the abstract.
48
+ """
49
+
50
+ # Read the PDF file
51
+ reader = PdfReader(open(pdf_path, "rb"))
52
+
53
+ # Extract the abstract
54
+ abstract_text = ""
55
+ for page in reader.pages:
56
+ # Search for keywords like "Abstract" or "Introduction"
57
+ if (
58
+ "Abstract" in page.extract_text