marcolorenzi98 commited on
Commit
c9d7e4e
1 Parent(s): 546303b

Upload abstractalk.py

Browse files
Files changed (1) hide show
  1. abstractalk.py +399 -0
abstractalk.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """AbstracTalk.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1SsbXdZC55VNVB3CVBntZ7ugyA3eqsVFp
8
+
9
+ #Assessment 3 Audio Processing and AI in Production
10
+ Part 2
11
+
12
+ What to Do: Create a Hugging Face Space and publish the code you generated in the previous notebook.
13
+
14
+ How to Do It: Create a comprehensive package with all required files to publish the app. Use Gradio to design the interface. In the interface, specify the app's name, provide a brief description, and mention that your app only accepts PDFs with abstracts. Include examples of working PDFs in the app. Upload your app to Hugging Face Space and ensure it remains accessible throughout the grading period.
15
+
16
+ What to Deliver: Upload a compressed folder with a .zip or .rar extension. The folder should contain all the files that you uploaded to your Hugging Face Space. Please ADD as first line of the app.py file the address of the Space running the app as a Python Comment (see the example below). The app should keep running in order to be tested at the moment of grading.
17
+
18
+ #Install and import
19
+ """
20
+
21
# NOTE: IPython "!" shell magics are not valid Python and crash a plain app.py.
# On Hugging Face Spaces, declare the pip packages in requirements.txt
# (gradio, torch, transformers, PyPDF2, pdfminer.six, pdfplumber, pdf2image,
# Pillow, pytesseract) and the system packages in packages.txt
# (poppler-utils, tesseract-ocr, libtesseract-dev).
# !pip install -q gradio torch transformers

from IPython.display import Audio
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr

import numpy as np

import os

"""# PDF Reader

## Libraries + Code
"""

# !pip install PyPDF2
# !pip install pdfminer.six
# !pip install pdfplumber
# !pip install pdf2image
# !pip install Pillow
# !pip install pytesseract
# !apt-get install poppler-utils
# !apt install tesseract-ocr
# !apt install libtesseract-dev

# To read the PDF
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images
import pytesseract
# To remove the additional created files
import os
62
+
63
# Extract the text of a pdfminer text element together with its fonts.
def text_extraction(element):
    """Return ``(text, formats)`` for a pdfminer ``LTTextContainer``.

    ``text`` is the raw text of the element; ``formats`` is the de-duplicated
    list of font names and font sizes (mixed together, order unspecified)
    seen on the element's characters.
    """
    extracted_text = element.get_text()

    # Collect the font name and size of every character in the element.
    seen_formats = []
    for row in element:
        if not isinstance(row, LTTextContainer):
            continue
        for ch in row:
            if isinstance(ch, LTChar):
                seen_formats.append(ch.fontname)
                seen_formats.append(ch.size)

    # De-duplicate via a set, exactly as the rest of the pipeline expects.
    return (extracted_text, list(set(seen_formats)))
86
+
87
# Crop an image element out of a PDF page and save it as its own file.
def crop_image(element, pageObj):
    """Shrink *pageObj*'s mediabox to *element*'s bounding box and write the
    resulting single-page PDF to ``cropped_image.pdf``.

    NOTE(review): the corner y-coordinates are assigned exactly as in the
    original code (lower_left gets y1, upper_right gets y0) — presumably
    PyPDF2 normalizes the rectangle; confirm before changing.
    """
    # Restrict the page's visible area to the element's box.
    pageObj.mediabox.lower_left = (element.x0, element.y1)
    pageObj.mediabox.upper_right = (element.x1, element.y0)

    # Write the cropped page out as a standalone one-page PDF.
    writer = PyPDF2.PdfWriter()
    writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        writer.write(cropped_pdf_file)
100
+
101
# Render the first page of a PDF to a PNG image.
def convert_to_images(input_file, output_file="PDF_image.png"):
    """Convert the first page of *input_file* (a PDF path) to a PNG.

    Parameters
    ----------
    input_file : str
        Path of the PDF to render.
    output_file : str, optional
        Destination PNG path. Defaults to ``"PDF_image.png"`` so existing
        callers keep their behavior; parameterized so the hard-coded name
        is no longer baked in.
    """
    images = convert_from_path(input_file)
    # Only the first page is needed by the pipeline (the cropped PDF has one page).
    images[0].save(output_file, "PNG")
107
+
108
# Run OCR on an image file and return the recognized text.
def image_to_text(image_path):
    """Return the text pytesseract recognizes in the image at *image_path*.

    Uses a context manager so the image file handle is closed deterministically
    (the original ``Image.open`` leaked the handle).
    """
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
115
+
116
# Extracting tables from the page
def extract_table(pdf_path, page_num, table_num):
    """Return table number *table_num* extracted from page *page_num* of
    the PDF at *pdf_path* (a list of rows, each a list of cell strings/None).

    Uses ``with`` so the pdfplumber handle is always closed — the original
    opened the PDF and never closed it, leaking a file handle per call.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Find the examined page and pick out the requested table.
        table_page = pdf.pages[page_num]
        return table_page.extract_tables()[table_num]
126
+
127
# Convert table into the appropriate format
def table_converter(table):
    """Serialize a pdfplumber table (list of rows of cells) to a string.

    Each row becomes ``|cell|cell|...|``; rows are separated by newlines.
    ``None`` cells become the literal string ``'None'``; embedded newlines
    inside a cell are replaced by spaces.
    """
    def _clean(cell):
        # Same three-way choice as the original conditional expression.
        if cell is not None and '\n' in cell:
            return cell.replace('\n', ' ')
        if cell is None:
            return 'None'
        return cell

    rendered_rows = ['|' + '|'.join(_clean(cell) for cell in row) + '|'
                     for row in table]
    # Joining with '\n' matches the original "append '\n' then strip the last one".
    return '\n'.join(rendered_rows)
140
+
141
def read_pdf(pdf_path):
    """Walk every page of the PDF at *pdf_path* and return a dict

        {'Page_0': [page_text, line_format, text_from_images,
                    text_from_tables, page_content], ...}

    where page_text/line_format hold per-line text and font info (with the
    placeholder strings 'image'/'table' where those elements occur),
    text_from_images holds OCR results, text_from_tables holds serialized
    tables, and page_content is the in-order mixture of all of them.

    Side effects: creates and deletes 'cropped_image.pdf' / 'PDF_image.png'
    in the working directory, and prints progress per page.
    """
    # create a PDF file object
    pdfFileObj = open(pdf_path, 'rb')
    # create a PDF reader object
    pdfReaded = PyPDF2.PdfReader(pdfFileObj)

    # Create the dictionary to extract text from each image
    text_per_page = {}
    # We extract the pages from the PDF (pdfminer layout pass)
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print("Elaborating Page_" +str(pagenum))
        # Initialize the variables needed for the text extraction from the page.
        # pageObj is the PyPDF2 page used for image cropping.
        pageObj = pdfReaded.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element= True
        table_extraction_flag= False
        # Open the pdf file with pdfplumber (only to locate tables on this page).
        # NOTE(review): re-opened on every page and never closed — leaks a handle per page.
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        # Find the number of tables on the page
        tables = page_tables.find_tables()


        # Find all the elements, keyed by their top (y1) coordinate
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort all the elements as they appear in the page (top to bottom)
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Find the elements that composed a page
        for i,component in enumerate(page_elements):
            # Extract the position of the top side of the element in the PDF
            pos= component[0]
            # Extract the element of the page layout
            element = component[1]

            # Check if the element is a text element
            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Use the function to extract the text and format for each text element
                    (line_text, format_per_line) = text_extraction(element)
                    # Append the text of each line to the page text
                    page_text.append(line_text)
                    # Append the format for each line containing text
                    line_format.append(format_per_line)
                    page_content.append(line_text)
                else:
                    # Omit the text that appeared in a table
                    pass

            # Check the elements for images
            if isinstance(element, LTFigure):
                # Crop the image from the PDF (writes 'cropped_image.pdf')
                crop_image(element, pageObj)
                # Convert the cropped pdf to an image (writes 'PDF_image.png')
                convert_to_images('cropped_image.pdf')
                # Extract the text from the image via OCR
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')

            # Check the elements for tables (LTRect is used as a table marker)
            if isinstance(element, LTRect):
                # If the first rectangular element and a table is still pending
                if first_element == True and (table_num+1) <= len(tables):
                    # Find the bounding box of the table (pdfplumber y grows down,
                    # pdfminer y grows up — hence the page-height subtraction)
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1
                    # Extract the information from the table
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table information in structured string format
                    table_string = table_converter(table)
                    # Append the table string into a list
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Set the flag as True to avoid the content again
                    table_extraction_flag = True
                    # Make it another element
                    first_element = False
                    # Add a placeholder in the text and format lists
                    page_text.append('table')
                    line_format.append('table')

                # Check if we already extracted the tables from the page.
                # NOTE(review): if the branch above never ran (e.g. an LTRect
                # appears but find_tables() found none), lower_side/upper_side
                # are unbound here and this raises NameError; also the i+1
                # lookup can raise IndexError when the rect is the last element.
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                elif not isinstance(page_elements[i+1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num+=1


        # Create the key of the dictionary
        dctkey = 'Page_'+str(pagenum)
        # Add the list of list as the value of the page key
        text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

    # Closing the pdf file object
    pdfFileObj.close()

    try:

        # Deleting the additional files created (best effort: the temp files
        # only exist if the PDF contained at least one figure)
        os.remove('cropped_image.pdf')
        os.remove('PDF_image.png')
    finally:
        # NOTE(review): returning from 'finally' swallows any exception raised
        # by os.remove (e.g. FileNotFoundError) — apparently intentional here.
        return text_per_page
258
+
259
+ """#Functions
260
+
261
+ ##Extract abstract
262
+ """
263
+
264
def upload_file(files):
    """Return the filesystem path (``.name``) of each uploaded file object."""
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
267
+
268
def extract_abstract(path):
    """Extract the abstract (or summary) section from the PDF at *path*.

    Scans the per-line text produced by ``read_pdf``: collection starts after
    the first line containing "abstract" or "summary" (case-insensitive) and
    stops at the first subsequent line containing "introduction".

    Returns the whitespace-normalized abstract string, or ``None`` when no
    abstract/summary heading was found (the original returned None implicitly).

    Fixes over the original:
    * stopping at "introduction" now breaks out of the page loop too — the
      original only broke the inner loop and kept collecting on later pages;
    * ``.replace(' ', ' ')`` was a no-op; whitespace is now collapsed properly;
    * the unused ``abstract_lenght`` local is removed.
    """
    text_per_page = read_pdf(path)

    abstract_found = False
    abstract_content = ""
    start_collecting = False
    finished = False

    for num_page in text_per_page:
        page_lines = text_per_page[num_page][0]

        for line in page_lines:
            lowered = line.lower()
            # Heading lines themselves are skipped, as in the original.
            if "abstract" in lowered or "summary" in lowered:
                abstract_found = True
                start_collecting = True
                continue

            if start_collecting:
                abstract_content += line + ' '
                # "Introduction" marks the end of the abstract.
                if "introduction" in lowered:
                    finished = True
                    break

        if finished:
            break

    # Collapse every run of whitespace (including newlines) to a single space.
    cleaned_abstract = ' '.join(abstract_content.split())

    if abstract_found:
        print("Abstract found")
        return cleaned_abstract
    print("Abstract not found")
    return None
300
+
301
def summarize_abstract(path):
    """Return a one-sentence simplified summary of the abstract of the PDF
    at *path*, using the ``haining/scientific_abstract_simplification`` model.

    The tokenizer and model are loaded once and cached on the function object
    — the original re-downloaded/re-initialized them on every button click.
    Sampling (``do_sample=True``) makes the output non-deterministic by design,
    so repeated clicks yield different summaries.
    """
    abstract_article = extract_abstract(path)
    if not abstract_article:
        # The original crashed inside the tokenizer on a missing abstract;
        # return a user-readable message instead (still a string, so the
        # Gradio Textbox output contract is unchanged).
        return "No abstract could be extracted from this PDF."

    INSTRUCTION = "summarize: "
    # Lazy, one-time model load cached across calls.
    if not hasattr(summarize_abstract, "_nlp"):
        tok = AutoTokenizer.from_pretrained("haining/scientific_abstract_simplification")
        mdl = AutoModelForSeq2SeqLM.from_pretrained("haining/scientific_abstract_simplification")
        summarize_abstract._nlp = (tok, mdl)
    tokenizer, model = summarize_abstract._nlp

    encoding = tokenizer(INSTRUCTION + abstract_article,
                         max_length=672,
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt')

    with torch.no_grad():
        decoded_ids = model.generate(input_ids=encoding['input_ids'],
                                     attention_mask=encoding['attention_mask'],
                                     max_length=512,
                                     top_p=.9,
                                     do_sample=True)

    summary = tokenizer.decode(decoded_ids[0], skip_special_tokens=True)

    # Keep only the first sentence of the generated summary.
    first_sentence = summary.split('.')[0] + '.'
    print(first_sentence)
    return first_sentence
328
+
329
def text_to_speech(sentence):
    """Synthesize *sentence* with ``suno/bark-small`` and return
    ``(sampling_rate, audio)`` where ``audio`` is a 1-D int16 numpy array —
    the tuple form ``gr.Audio(type="numpy")`` accepts.

    Fixes over the original:
    * the TTS pipeline is loaded once and cached on the function object
      instead of being rebuilt on every click;
    * samples are clipped to [-1, 1] before the int16 cast (a bare cast
      wraps out-of-range floats around, producing loud glitches);
    * flattening uses ``reshape(-1)`` so both (1, n) and (n,) model outputs
      work (``reshape(shape[1])`` crashed on 1-D output).
    """
    if not hasattr(text_to_speech, "_synthesiser"):
        text_to_speech._synthesiser = pipeline("text-to-speech", "suno/bark-small")

    speech = text_to_speech._synthesiser(sentence, forward_params={"do_sample": True})

    audio_float = speech["audio"]
    sr = speech["sampling_rate"]

    # gr.Audio only accepts a tuple (int, np.array(int16)).
    audio_int16 = (np.clip(audio_float, -1.0, 1.0) * 32767).astype(np.int16)
    return sr, audio_int16.reshape(-1)
345
+
346
+ """# Uploading PDF File"""
347
+
348
+ from google.colab import files
349
+ uploaded = files.upload()
350
+
351
+ """#Gradio interface"""
352
+
353
+ interface = gr.Blocks()
354
+
355
+
356
+ with interface:
357
+ gr.Markdown(
358
+ """
359
+ # AbstracTalk
360
+ This app let's you upload an article (you can only upload a PDF with an abstract).
361
+ It reads the abstract and does not only summarize it in just one sentence,
362
+ but also makes it simpler for anybody to understand. Moreover, it also provides
363
+ an additional layer of accessibility through spoken versions of the text.
364
+ If you are not satisfied with the given summary you can press again the button and have a new summary.
365
+ Have fun and master knowledge with AbstracTalk!
366
+ """)
367
+
368
+ #the interface architecture goes down here
369
+ with gr.Row():
370
+ with gr.Column():
371
+ uploaded_article = gr.File()
372
+ gr.Markdown("## PDF Examples")
373
+ gr.Examples(
374
+ examples=[[os.path.join(os.path.abspath(""), 'Article 7 Efficient Estimation of Word Representations in Vector Space.pdf')],
375
+ [os.path.join(os.path.abspath(""), "Article 9 Transformers in Speech Processing_ Survey.pdf")],
376
+ [os.path.join(os.path.abspath(""), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")]],
377
+ inputs=uploaded_article
378
+ )
379
+
380
+ with gr.Column():
381
+ summarized_abstract = gr.Textbox("One-sentence Abstract")
382
+ talked_abstract = gr.Audio(type="numpy")
383
+ with gr.Row():
384
+ summary_button = gr.Button(value="Summarize Abstract", size="lg")
385
+ tts_button = gr.Button(value="Speak Abstract", size="lg")
386
+
387
+
388
+ #the functionality goes down here
389
+
390
+ #first column
391
+
392
+
393
+ #second column
394
+ summary_button.click(summarize_abstract, inputs=uploaded_article, outputs=summarized_abstract)
395
+ tts_button.click(text_to_speech, inputs=summarized_abstract, outputs=talked_abstract)
396
+
397
+ if __name__ == "__main__":
398
+ interface.launch(debug=False)
399
+