File size: 3,195 Bytes
c693e62
 
cdb128e
c693e62
 
 
564f95e
c693e62
cdb128e
 
 
c693e62
67d721c
6416b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9d4bd7
 
6416b3a
6900efd
6416b3a
e9d4bd7
6900efd
 
6416b3a
6900efd
6416b3a
 
6900efd
6416b3a
 
 
 
6900efd
6416b3a
 
 
c693e62
 
6416b3a
 
 
01a52e2
 
 
151b692
c693e62
2a10acb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os

# Build the summarization pipeline once at import time; the same Hugging Face
# checkpoint id supplies both the model weights and the tokenizer.
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)

# Splits after sentence-ending punctuation that is followed by whitespace, so
# a period inside a number such as "3.5" does not end the sentence.
_SENTENCE_END = re.compile(r"(?<=[.!?])\s+")


def _extract_abstract(pdf_path):
    """Return the raw abstract text of the PDF at *pdf_path*, or "" if none is found.

    Pages are scanned in order. On the first page containing the word
    "Abstract" (case-insensitive), everything after that word up to the next
    "Introduction" on the same page (or to the end of the page when there is
    no such word) is returned.
    """
    with open(pdf_path, "rb") as file:
        for page in PdfReader(file).pages:
            # Guard against a falsy extraction result so re.search never
            # receives a non-string.
            text = page.extract_text() or ""
            abstract_match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
            if not abstract_match:
                continue
            start_index = abstract_match.end()
            introduction_match = re.search(
                r"\bIntroduction\b", text[start_index:], re.IGNORECASE
            )
            end_index = (
                start_index + introduction_match.start()
                if introduction_match
                else None
            )
            return text[start_index:end_index]
    return ""


def _first_sentence(summary):
    """Return the first sentence of *summary*, always ending in punctuation."""
    summary = summary.strip()
    if not summary:
        return "Summary not available."
    sentence = _SENTENCE_END.split(summary, maxsplit=1)[0]
    if sentence[-1] not in ".!?":
        sentence += "."
    return sentence


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it to one sentence, and voice it.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``
            (presumably what some Gradio versions pass for ``gr.File`` --
            verify against the installed version).

    Returns:
        A 3-tuple ``(first_sentence, audio_bytes, abstract)``: the
        one-sentence summary, the gTTS audio of that sentence as raw bytes,
        and the stripped abstract text.

    Raises:
        Exception: For any failure (missing file, no abstract found,
            summarization or text-to-speech errors). The message is that of
            the underlying error, which is attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # Plain paths are used as-is; only non-path objects are asked for
        # .name (pathlib.Path.name would be just the basename).
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = _extract_abstract(pdf_path)
        if not abstract_text.strip():
            # Fail loudly instead of asking the model to summarize nothing.
            raise ValueError("No abstract was found in the PDF.")

        # Summarize the extracted abstract using the LED-large model with a specific max_length
        result = summarizer(abstract_text, max_length=81)

        if result and isinstance(result, list):
            summary = result[0].get('summary_text', 'Summary not available.')
            first_sentence = _first_sentence(summary)
        else:
            first_sentence = "Summary not available."

        # Generate audio in memory so no temporary file is left behind.
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return first_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Same exception type and message as before, but keep the original
        # error chained so the real traceback is not lost.
        raise Exception(str(e)) from e

# Gradio UI: one PDF upload in; summary text, spoken summary and the extracted
# abstract out.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # The handler returns three values (summary, audio bytes, abstract), so
    # declare one output component per value; previously the third value had
    # no component to land in.
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Generation Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model into one sentence summary, and generates an audio of it. Only upload PDFs with abstracts. Example 
    PDF's are given below, and please click on them to see the summarized text and audio generated. Please read the README.MD for more information about the app.""",
    # Example PDFs are resolved relative to this file so they load regardless
    # of the working directory.
    examples=[
        [os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
        [os.path.join(os.path.dirname(__file__), "Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf")],
    ],
    cache_examples=True,
)

interface.launch()