wfranco commited on
Commit
63ea7df
1 Parent(s): 8d5533f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +391 -122
app.py CHANGED
@@ -1,132 +1,401 @@
1
- #!pip install gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def read_pdf(pdf_path):
4
- # create a PDF file object
5
- pdfFileObj = open(pdf_path, 'rb')
6
- # create a PDF reader object
7
- pdfReader = PyPDF2.PdfReader(pdfFileObj)
8
-
9
- # Create the dictionary to extract text from each page
10
- text_per_page = {}
11
- # We extract the pages from the PDF
12
- for pagenum, page in enumerate(extract_pages(pdf_path)):
13
- # Initialize the variables needed for the text extraction from the page
14
- pageObj = pdfReader.pages[pagenum]
15
- page_text = []
16
- line_format = []
17
- text_from_images = []
18
- text_from_tables = []
19
- page_content = []
20
- # Initialize the number of the examined tables
21
- table_num = 0
22
- first_element= True
23
- table_extraction_flag= False
24
- # Open the pdf file
25
- pdf = pdfplumber.open(pdf_path)
26
- # Find the examined page
27
- page_tables = pdf.pages[pagenum]
28
- # Find the number of tables on the page
29
- tables = page_tables.find_tables()
30
-
31
- # Find all the elements
32
- page_elements = [(element.y1, element) for element in page._objs]
33
- # Sort all the elements as they appear in the page
34
- page_elements.sort(key=lambda a: a[0], reverse=True)
35
-
36
- # Find the elements that composed a page
37
- for i, component in enumerate(page_elements):
38
- # Extract the position of the top side of the element in the PDF
39
- pos = component[0]
40
- # Extract the element of the page layout
41
- element = component[1]
42
-
43
- # Check if the element is a text element
44
- if isinstance(element, LTTextContainer):
45
- # Check if the text appeared in a table
46
- if table_extraction_flag == False:
47
- # Use the function to extract the text and format for each text element
48
- (line_text, format_per_line) = text_extraction(element)
49
- # Append the text of each line to the page text
50
- page_text.append(line_text)
51
- # Append the format for each line containing text
52
- line_format.append(format_per_line)
53
- page_content.append(line_text)
54
- else:
55
- # Omit the text that appeared in a table
56
- pass
57
-
58
- # Create the key of the dictionary
59
- dctkey = 'Page_'+str(pagenum)
60
- # Add the list of list as the value of the page key
61
- text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
62
-
63
- # Closing the pdf file object
64
- pdfFileObj.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  return text_per_page
66
- pdf_path = '/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf'
67
-
68
- text_per_page = read_pdf(pdf_path)
69
-
70
- Page_0 = text_per_page['Page_0']
71
-
72
- def nested_list_to_string(nested_list):
73
- result = ''
74
- for element in nested_list:
75
- if isinstance(element, list): # Check if the element is a list
76
- result += nested_list_to_string(element) # Recursively process the list
77
- elif isinstance(element, str): # Check if the element is a string
78
- result += element # Append the string to the result
79
- return result
80
-
81
- Page_0 = text_per_page['Page_0']
82
- string_result = nested_list_to_string(Page_0)
83
-
84
- def extract_abstract(page_0):
85
- def nested_list_to_string(nested_list):
86
- result = ''
87
- for element in nested_list:
88
- if isinstance(element, list): # Check if the element is a list
89
- result += nested_list_to_string(element) # Recursively process the list
90
- elif isinstance(element, str): # Check if the element is a string
91
- result += element # Append the string to the result
92
- return result
93
-
94
- # Convert the nested list into a single string
95
- full_text = nested_list_to_string(page_0)
96
-
97
- # Find the start of the 'Abstract' section and the end of it (start of 'Introduction')
98
- start_index = full_text.find('Abstract')
99
- end_index = full_text.find('Introduction')
100
-
101
- # If both 'Abstract' and 'Introduction' are found, extract the text in between
102
- if start_index != -1 and end_index != -1:
103
- # Extract the text and remove the word 'Abstract'
104
- abstract_text = full_text[start_index + len('Abstract'):end_index]
105
- return abstract_text.strip()
106
  else:
107
- return "Abstract or Introduction section not found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # Example usage
110
- Page_0 = text_per_page['Page_0']
111
- abstract_text = extract_abstract(Page_0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- wall_of_text = abstract_text
114
 
115
- result = summarizer(
116
- wall_of_text,
117
- min_length=1,
118
- max_length=30,
119
- no_repeat_ngram_size=3,
120
- encoder_no_repeat_ngram_size=3,
121
- repetition_penalty=3.5,
122
- num_beams=4,
123
- early_stopping=True,
124
- )
125
 
126
- # Access the first element of the list (which is the dictionary) and then the value of 'summary_text'
127
- summary_string = result[0]['summary_text']
128
 
129
- print(summary_string)
 
 
130
 
131
- app = gra.Interface(fn = user_greeting, inputs=summary_string, outputs=summary_string)
132
- app.launch()
 
1
+ # https://huggingface.co/spaces/marcolorenzi98/AAI-projects
2
+
3
+ # -*- coding: utf-8 -*-
4
+ """AbstracTalk.ipynb
5
+ Automatically generated by Colaboratory.
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1SsbXdZC55VNVB3CVBntZ7ugyA3eqsVFp
8
+ #Assessment 3 Audio Processing and AI in Production
9
+ Part 2
10
+ What to Do: Create a Hugging Face Space and publish the code you generated in the previous notebook.
11
+ How to Do It: Create a comprehensive package with all required files to publish the app. Use Gradio to design the interface. In the interface, specify the app's name, provide a brief description, and mention that your app only accepts PDFs with abstracts. Include examples of working PDFs in the app. Upload your app to Hugging Face Space and ensure it remains accessible throughout the grading period.
12
+ What to Deliver: Upload a compressed folder with a .zip or .rar extension. The folder should contain all the files that you uploaded to your Hugging Face Space. Please ADD as first line of the app.py file the address of the Space running the app as a Python Comment (see the example below). The app should keep running in order to be tested at the moment of grading.
13
+ #Install and import
14
+ """
15
+
16
+
17
+ #from IPython.display import Audio
18
+ from transformers import pipeline
19
+ import torch
20
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
21
  import gradio as gr
22
+
23
+ import numpy as np
24
+
25
+ import os
26
+
27
+ """# PDF Reader
28
+ ## Libraries + Code
29
+ """
30
+
31
+ # To read the PDF
32
+ import PyPDF2
33
+ # To analyze the PDF layout and extract text
34
+ from pdfminer.high_level import extract_pages, extract_text
35
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
36
+ # To extract text from tables in PDF
37
+ import pdfplumber
38
+ # To extract the images from the PDFs
39
+ from PIL import Image
40
+ from pdf2image import convert_from_path
41
+ # To perform OCR to extract text from images
42
+ import pytesseract
43
+ # To remove the additional created files
44
+ import os
45
+
46
+ # Create a function to extract text
47
+
48
def text_extraction(element):
    """Extract the text of a pdfminer text element together with its formatting.

    Returns a tuple ``(line_text, format_per_line)`` where ``line_text`` is the
    raw text of the element and ``format_per_line`` is the de-duplicated list of
    font names and font sizes seen among its characters.
    """
    # Raw text of the in-line text element.
    line_text = element.get_text()

    # Collect every font name and font size appearing in the element.
    seen_formats = []
    for inner_line in element:
        if isinstance(inner_line, LTTextContainer):
            for ch in inner_line:
                if isinstance(ch, LTChar):
                    seen_formats.append(ch.fontname)
                    seen_formats.append(ch.size)

    # Keep only the unique formats (order is not significant downstream).
    format_per_line = list(set(seen_formats))

    # Return the text of the line along with its formats.
    return (line_text, format_per_line)
69
+
70
+ # Create a function to crop the image elements from PDFs
71
def crop_image(element, pageObj):
    """Crop *pageObj* down to the bounding box of *element* and write the
    result to ``cropped_image.pdf``.

    Args:
        element: pdfminer layout object exposing x0/y0/x1/y1 coordinates.
        pageObj: PyPDF2 page whose media box is shrunk in place.
    """
    # pdfminer coordinates: y0 is the bottom edge, y1 the top edge.
    # (The original assigned y0/y1 to swapped "top"/"bottom" names, which
    # produced an inverted media-box rectangle; this normalized box selects
    # the same region without relying on viewers normalizing it.)
    left, bottom, right, top = element.x0, element.y0, element.x1, element.y1
    # Shrink the page's media box to the figure's rectangle.
    pageObj.mediabox.lower_left = (left, bottom)
    pageObj.mediabox.upper_right = (right, top)
    # Persist the cropped page as a standalone one-page PDF for rasterizing.
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)
83
+
84
+ # Create a function to convert the PDF to images
85
def convert_to_images(input_file, output_file="PDF_image.png"):
    """Render the first page of the PDF at *input_file* to a PNG image.

    Args:
        input_file: Path of the (single-page, cropped) PDF to rasterize.
        output_file: Path of the PNG to write. Defaults to "PDF_image.png",
            the name the rest of the pipeline reads back, so existing callers
            are unaffected.
    """
    images = convert_from_path(input_file)
    # Only the first page is needed: the cropped PDFs are single-page.
    images[0].save(output_file, "PNG")
90
+
91
+ # Create a function to read text from images
92
def image_to_text(image_path):
    """Run OCR over the image at *image_path* and return the extracted text.

    The image is opened in a context manager so its file handle is released
    (the original left it open).
    """
    with Image.open(image_path) as img:
        # Extract the text from the image via Tesseract OCR.
        return pytesseract.image_to_string(img)
98
+
99
+ # Extracting tables from the page
100
+
101
def extract_table(pdf_path, page_num, table_num):
    """Return table number *table_num* from page *page_num* of *pdf_path*.

    The table is in pdfplumber's native format: a list of rows, each row a
    list of cell strings (``None`` for empty cells).
    """
    # Context manager guarantees the underlying file handle is released
    # (the original left the pdfplumber document open).
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        # Extract the requested table while the document is still open.
        return table_page.extract_tables()[table_num]
109
+
110
+ # Convert table into the appropriate format
111
def table_converter(table):
    """Render a pdfplumber table (list of rows of cells) as a pipe-delimited string.

    ``None`` cells become the literal string ``'None'``; embedded newlines in a
    cell are replaced by spaces. Rows are joined with ``'\n'`` with no trailing
    newline; an empty table yields ``''``.
    """
    rendered_rows = []
    for row in table:
        # Normalize each cell: None -> 'None', wrapped text -> single line.
        cells = ['None' if cell is None else cell.replace('\n', ' ') for cell in row]
        rendered_rows.append('|' + '|'.join(cells) + '|')
    # join() avoids the original's quadratic += build and its [:-1] trim.
    return '\n'.join(rendered_rows)
123
+
124
def read_pdf(pdf_path):
    """Extract the content of every page of the PDF at *pdf_path*.

    Returns a dict mapping ``'Page_<n>'`` to a five-element list:
    ``[page_text, line_format, text_from_images, text_from_tables,
    page_content]``, where text lines, ``'image'`` and ``'table'``
    placeholders follow the page's top-to-bottom layout order.

    Fixes over the original: both file handles are closed via context
    managers (pdfplumber was re-opened per page and never closed), table
    bounds are guarded against NameError when a rectangle precedes any
    table, the look-ahead no longer raises IndexError on the last element,
    and temp-file cleanup no longer uses ``finally: return`` (which
    silently swallowed every exception).
    """
    text_per_page = {}
    # Open both readers once for the whole document.
    with open(pdf_path, 'rb') as pdfFileObj, pdfplumber.open(pdf_path) as plumber_pdf:
        pdfReaded = PyPDF2.PdfReader(pdfFileObj)

        # Walk the pages as laid out by pdfminer.
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" + str(pagenum))
            # Per-page state.
            pageObj = pdfReaded.pages[pagenum]
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            table_num = 0
            first_element = True
            table_extraction_flag = False
            # Tables detected on this page by pdfplumber.
            tables = plumber_pdf.pages[pagenum].find_tables()
            # Vertical bounds of the current table; None until one is found.
            lower_side = None
            upper_side = None

            # Sort all layout elements top-to-bottom.
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            for i, component in enumerate(page_elements):
                element = component[1]

                # Text element: collect it unless it belongs to a table.
                if isinstance(element, LTTextContainer) and not table_extraction_flag:
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)

                # Figure element: crop it, rasterize it, OCR it.
                if isinstance(element, LTFigure):
                    crop_image(element, pageObj)
                    convert_to_images('cropped_image.pdf')
                    image_text = image_to_text('PDF_image.png')
                    text_from_images.append(image_text)
                    page_content.append(image_text)
                    # Placeholders keep positional alignment in the lists.
                    page_text.append('image')
                    line_format.append('image')

                # Rectangle element: a table border.
                if isinstance(element, LTRect):
                    # First rectangle of a not-yet-extracted table.
                    if first_element and (table_num + 1) <= len(tables):
                        # Bounding box of the table in pdfminer coordinates.
                        lower_side = page.bbox[3] - tables[table_num].bbox[3]
                        upper_side = element.y1
                        # Extract and serialize the table once.
                        table_string = table_converter(extract_table(pdf_path, pagenum, table_num))
                        text_from_tables.append(table_string)
                        page_content.append(table_string)
                        # Suppress the table's text elements from the text pass.
                        table_extraction_flag = True
                        first_element = False
                        page_text.append('table')
                        line_format.append('table')

                    # Still inside the current table's vertical span: skip.
                    if lower_side is not None and element.y0 >= lower_side and element.y1 <= upper_side:
                        pass
                    # Past the table (or at the page's last element): reset.
                    elif i + 1 >= len(page_elements) or not isinstance(page_elements[i + 1][1], LTRect):
                        table_extraction_flag = False
                        first_element = True
                        table_num += 1

            # Store the page's five parallel result lists.
            text_per_page['Page_' + str(pagenum)] = [page_text, line_format,
                                                     text_from_images,
                                                     text_from_tables,
                                                     page_content]

    # Best-effort cleanup of the temp files created for figures; missing
    # files (pages without images) are simply ignored.
    for tmp_file in ('cropped_image.pdf', 'PDF_image.png'):
        try:
            os.remove(tmp_file)
        except OSError:
            pass
    return text_per_page
241
+
242
+ """#Functions
243
+ ##Extract abstract
244
+ """
245
+
246
def upload_file(files):
    """Return the filesystem paths of the uploaded Gradio file objects."""
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
249
+
250
def extract_abstract(path):
    """Locate and return the abstract of the PDF at *path*.

    Scans the extracted text lines page by page: collection starts after a
    line containing "abstract" or "summary" and stops once a line containing
    "introduction" has been collected.

    Returns the whitespace-normalized abstract text, or None (after printing
    a diagnostic) when no abstract marker is found.

    Fixes over the original: the dead ``abstract_lenght`` variable is
    removed; the no-op ``.replace(' ', ' ')`` is replaced by a real
    whitespace collapse; and collection actually stops at "Introduction"
    instead of resuming on the following pages (the inner ``break`` alone
    left ``start_collecting`` True).
    """
    text_per_page = read_pdf(path)

    abstract_found = False
    abstract_content = ""
    start_collecting = False
    done = False

    for num_page in text_per_page:
        if done:
            break
        # Index 0 holds the page's extracted text lines.
        page_lines = text_per_page[num_page][0]

        for line in page_lines:
            lowered = line.lower()
            # The marker line itself is not part of the abstract.
            if "abstract" in lowered or "summary" in lowered:
                abstract_found = True
                start_collecting = True
                continue

            if start_collecting:
                abstract_content += line + ' '
                # "Introduction" marks the end of the abstract.
                if "introduction" in lowered:
                    done = True
                    break

    # Collapse all runs of whitespace (including newlines) to single spaces.
    cleaned_abstract = ' '.join(abstract_content.split())

    if abstract_found:
        print("Abstract found")
        return cleaned_abstract
    else:
        print("Abstract not found")
282
+
283
def summarize_abstract(path):
    """Summarize the abstract of the PDF at *path* into one plain-language sentence.

    Uses the 'haining/scientific_abstract_simplification' seq2seq model. The
    tokenizer/model pair is loaded lazily once and cached on the function
    object, so repeated button presses don't reload the weights on every call
    (the original re-instantiated both every time).

    Returns the first sentence of the generated simplification.
    """
    abstract_article = extract_abstract(path)

    INSTRUCTION = "summarize, simplify, and contextualize in one sentence: "

    # One-time lazy load of the model pair; cached across calls.
    if not hasattr(summarize_abstract, "_cached"):
        summarize_abstract._cached = (
            AutoTokenizer.from_pretrained("haining/scientific_abstract_simplification"),
            AutoModelForSeq2SeqLM.from_pretrained("haining/scientific_abstract_simplification"),
        )
    tokenizer, model = summarize_abstract._cached

    encoding = tokenizer(INSTRUCTION + abstract_article,
                         max_length=672,
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt')

    # Sampling (top-p) is intentional: pressing the button again can yield a
    # different summary, as advertised in the UI description.
    decoded_ids = model.generate(input_ids=encoding['input_ids'],
                                 attention_mask=encoding['attention_mask'],
                                 max_length=512,
                                 top_p=.9,
                                 do_sample=True)

    summary = tokenizer.decode(decoded_ids[0], skip_special_tokens=True)

    # Keep only the first sentence of the generated text.
    first_sentence = summary.split('.')[0] + '.'
    print(first_sentence)
    return first_sentence
309
+
310
def text_to_speech(sentence):
    """Synthesize *sentence* with suno/bark-small.

    Returns ``(sampling_rate, samples)`` where ``samples`` is a 1-D
    ``np.int16`` array — the tuple format ``gr.Audio`` accepts.

    Fixes over the original: the TTS pipeline is built once and cached
    (it was rebuilt on every call); samples are clipped before the int16
    cast so out-of-range floats saturate instead of wrapping around; and
    ``reshape(-1)`` flattens regardless of the exact output rank.
    """
    # One-time lazy construction of the TTS pipeline; cached across calls.
    if not hasattr(text_to_speech, "_synthesiser"):
        text_to_speech._synthesiser = pipeline("text-to-speech", "suno/bark-small")

    speech = text_to_speech._synthesiser(sentence, forward_params={"do_sample": True})

    audio_float32 = speech["audio"]
    sr = speech["sampling_rate"]

    # gr.Audio only accepts a tuple(int, np.array(int16)); clip first so
    # overshooting samples saturate rather than overflow-wrap.
    audio_int16 = (np.clip(audio_float32, -1.0, 1.0) * 32767).astype(np.int16)
    audio_reshaped = audio_int16.reshape(-1)

    return sr, audio_reshaped
326
+
327
def sum_audio(path):
    """Summarize the abstract of the PDF at *path* and synthesize the summary.

    Returns ``(sentence, (sampling_rate, samples))`` — the combined outputs
    used to pre-cache the Gradio examples.

    Delegates to the existing single-purpose helpers instead of duplicating
    the whole body of ``text_to_speech`` (as the original did), so the two
    code paths cannot drift apart.
    """
    sentence = summarize_abstract(path)
    sr, audio = text_to_speech(sentence)
    return sentence, (sr, audio)
345
+
346
+ """# Uploading PDF File"""
347
+
348
+ #from google.colab import files
349
+ #uploaded = files.upload()
350
+
351
+
352
+ """#Gradio interface"""
353
+
354
interface = gr.Blocks()


with interface:
    # App title and description shown at the top of the Space.
    gr.Markdown(
        """
        # AbstracTalk
        This app lets you upload an article (you can only upload a PDF with an abstract).
        It reads the abstract and does not only summarize it in just one sentence,
        but also makes it simpler for anybody to understand. Moreover, it also provides
        an additional layer of accessibility through spoken versions of the text.
        If you are not satisfied with the given summary you can press again the button and have a new summary.
        Have fun and master knowledge with AbstracTalk!
        """)

    # Layout: file input on the left, summary text + audio on the right.
    with gr.Row():
        with gr.Column():
            uploaded_article = gr.File()

        with gr.Column():
            # label= (not the positional value) so the caption is a label,
            # not pre-filled content the user would have to delete.
            summarized_abstract = gr.Textbox(label="One-sentence Abstract")
            talked_abstract = gr.Audio(type="numpy")
    with gr.Row():
        summary_button = gr.Button(value="Summarize Abstract", size="lg")
        tts_button = gr.Button(value="Speak Abstract", size="lg")

    # Pre-cached example PDFs living next to app.py in the Space.
    gr.Markdown("## PDF Examples")
    gr.Examples(
        examples=[[os.path.join(os.path.abspath(""), "Article 7 Efficient Estimation of Word Representations in Vector Space.pdf")],
                  [os.path.join(os.path.abspath(""), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")]],
        inputs=uploaded_article,
        outputs=[summarized_abstract, talked_abstract],
        fn=sum_audio,
        cache_examples=True,
    )

    # Wire the buttons: summarize from the uploaded PDF, then speak the summary.
    summary_button.click(summarize_abstract, inputs=uploaded_article, outputs=summarized_abstract)
    tts_button.click(text_to_speech, inputs=summarized_abstract, outputs=talked_abstract)

if __name__ == "__main__":
    interface.launch(debug=False)