File size: 6,323 Bytes
8c468f3
05baaca
 
 
 
8c468f3
 
05baaca
c14f8d7
 
 
 
 
 
3b1b590
 
912db67
c14f8d7
 
 
 
 
 
3b1b590
c14f8d7
05baaca
47bdf28
 
 
ba7e56a
 
f76c355
2fa0266
c14f8d7
eed1bb3
41bb5ba
eed1bb3
 
70b9010
c14f8d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05baaca
c14f8d7
 
 
 
 
 
 
 
 
 
 
 
f76c355
 
 
 
 
 
c14f8d7
f76c355
 
c14f8d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b1b590
c14f8d7
3b1b590
c14f8d7
 
 
3b1b590
c14f8d7
 
3b1b590
c14f8d7
 
 
 
 
912db67
 
3b1b590
c14f8d7
c2b55f1
912db67
 
 
 
c14f8d7
 
 
 
 
2fa0266
c14f8d7
3b1b590
c14f8d7
 
 
 
912db67
c14f8d7
3b1b590
c14f8d7
 
3b1b590
e8cc3a6
47bdf28
eed1bb3
 
c14f8d7
 
3b1b590
c14f8d7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# https://huggingface.co/spaces/Alioth86/SpeechAbstractor
#Please note that I have recombined the functions I created for part 1 of the assessment.
#I have added a main function to connect them (for this main function I got some help from ChatGPT-4).
#I have created the input/output parts, the titles, and the description,
#and all the Gradio features according to the Gradio website instructions.
#Please note that I have uploaded it all through git and git LFS.

#Here are the imports
import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import re
import torch
import transformers
from transformers import pipeline
import nltk
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import sentencepiece as spm
import os
import tempfile
import gradio as gr

#Here is the code
# App title shown in the Gradio page header.
title="SpeechAbstractor"

# Long-form description rendered under the title in the Gradio UI.
description = """
This app enables users to upload academic articles in PDF format, specifically focusing on abstracts. 
It efficiently summarizes the abstract and provides an audio playback of the summarized content. 
Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!
(Please note: it works only with articles with an abstract)."""

# Example PDFs offered as one-click inputs (the files must ship alongside this script).
examples = [
    ["Article_7.pdf"],["Article_11.pdf"]
    ]

#reporting the functions created for the part 1 
def text_extraction(element):
    """Return the text of a pdfminer layout element plus its character formats.

    Args:
        element: a pdfminer ``LTTextContainer`` layout object.

    Returns:
        Tuple ``(line_text, format_per_line)``: the element's raw text and a
        de-duplicated list of the font names and font sizes observed among its
        characters (order not significant).
    """
    line_text = element.get_text()

    # Walk text lines, then characters, collecting fontname and size pairs.
    collected = [
        attribute
        for text_line in element
        if isinstance(text_line, LTTextContainer)
        for character in text_line
        if isinstance(character, LTChar)
        for attribute in (character.fontname, character.size)
    ]

    # Collapse duplicates exactly as the original did.
    format_per_line = list(set(collected))

    return (line_text, format_per_line)

def read_pdf(pdf_pathy):
    """Extract text and per-line character formats from every page of a PDF.

    Args:
        pdf_pathy: Filesystem path of the PDF to read.

    Returns:
        dict mapping 'Page_<n>' to ``[page_text, line_format, page_content]``,
        where page_text/page_content are lists of line strings and line_format
        holds the per-line font-name/size lists from ``text_extraction``.

    Note:
        The original version also opened the file with PyPDF2 and re-opened it
        with pdfplumber on EVERY page without ever closing or using those
        handles — a file-handle leak and dead work; both have been removed.
        The returned data is produced solely by pdfminer and is unchanged.
    """
    text_per_pagy = {}

    # pdfminer's extract_pages yields one LTPage layout object per page.
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))

        page_text = []
        line_format = []
        page_content = []

        # Sort elements top-to-bottom: PDF origin is bottom-left, so a larger
        # y1 coordinate means the element sits nearer the top of the page.
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)

        for _pos, element in page_elements:
            if isinstance(element, LTTextContainer):
                (line_text, format_per_line) = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)

        text_per_pagy['Page_' + str(pagenum)] = [page_text, line_format, page_content]

    return text_per_pagy


def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


def extract_abstract(text_per_pagy):
    """Locate and return the abstract section from a dict of per-page text.

    Pages are scanned in insertion order; the first page containing an
    'Abstract' heading (any of three casings) wins. The returned text spans
    from just past the heading (plus one separator character) to the first
    end marker found after it, or to the end of the page if none appears.

    Args:
        text_per_pagy: dict mapping page keys to that page's full text string.

    Returns:
        The extracted abstract prefixed with a single space, or "" when no
        heading is found on any page.
    """
    heading_variants = ("Abstract", "abstract", "ABSTRACT")
    section_markers = ("Introduction", "INTRODUCTION", "Background", "Contents", "Keywords")

    abstract_text = ""

    for _page_num, page_text in text_per_pagy.items():
        if not page_text:
            continue

        # Undo hyphenation artifacts left by the PDF extraction ("exam- ple").
        page_text = page_text.replace("- ", "")

        start_index = -1
        for variant in heading_variants:
            found_at = page_text.find(variant)
            if found_at != -1:
                # Skip the heading itself plus one separator character (':' or ' ').
                start_index = found_at + len(variant) + 1
                break

        if start_index == -1:
            continue

        end_index = -1
        for marker in section_markers:
            marker_at = page_text.find(marker, start_index)
            if marker_at != -1:
                end_index = marker_at
                break
        if end_index == -1:
            end_index = len(page_text)

        abstract_text += " " + page_text[start_index:end_index].strip()
        break

    return abstract_text

#let's define a main function that gets the uploaded file (pdf) to do the job
def main_function(uploaded_filepath):
    """Summarize the abstract of an uploaded PDF and synthesize it as speech.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or None when no file
            was provided (Gradio passes None for an empty file input).

    Returns:
        Tuple ``(text, audio_path)``: the one-sentence summary and the path of
        a WAV file with its spoken rendition, or ``(message, None)`` when no
        file was uploaded or no abstract could be found.
    """
    # Guard: nothing uploaded.
    if uploaded_filepath is None:
        return "No file loaded", None

    # Extract the raw text of every page with pdfminer.
    text_per_pagy = read_pdf(uploaded_filepath)

    # Flatten each page's line list into one cleaned string, then pull the abstract.
    for key, value in text_per_pagy.items():
        text_per_pagy[key] = clean_text(' '.join(value[0]))
    abstract_text = extract_abstract(text_per_pagy)

    # Robustness: bail out early rather than summarizing an empty string.
    if not abstract_text.strip():
        return "No abstract found in this document", None

    # Sentence tokenizer models required by sent_tokenize below.
    nltk.download('punkt')

    # Summarize the abstract, bounding the output length.
    summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summary = summarizer(abstract_text, max_length=100, min_length=50, do_sample=False)[0]['summary_text']

    # Keep only the first sentence, to be sure the audio stays short.
    sentences = nltk.tokenize.sent_tokenize(summary)
    first_sentence = sentences[0]

    # Text-to-speech with SpeechT5 and a fixed speaker embedding from the
    # CMU Arctic x-vector dataset.
    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = synthesiser(first_sentence, forward_params={"speaker_embeddings": speaker_embedding})

    # Write the audio to a real temporary file: the previous fixed
    # "summary.wav" in the CWD could be clobbered by concurrent requests.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file_path = tmp.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    # Gradio consumes the text plus the audio file path.
    return first_sentence, audio_file_path

#let's communicate with gradio what it has to put in
# Wire the pipeline into a Gradio interface: one PDF file in (as a filepath),
# the text summary and its spoken WAV rendition out.
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),  
    outputs=[gr.Textbox(label="Abstract Summary"), gr.Audio(label="Abstract Summary Audio", type="filepath")],
    title=title,
    description=description, 
    examples=examples
)

#launching the app
# Start the web server only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()