Update app.py
app.py CHANGED
@@ -1,49 +1,26 @@
-# https://huggingface.co/spaces/marcolorenzi98/AAI-projects
-
-# -*- coding: utf-8 -*-
-"""AbstracTalk.ipynb
-Automatically generated by Colaboratory.
-Original file is located at
-    https://colab.research.google.com/drive/1SsbXdZC55VNVB3CVBntZ7ugyA3eqsVFp
-#Assessment 3 Audio Processing and AI in Production
-Part 2
-What to Do: Create a Hugging Face Space and publish the code you generated in the previous notebook.
-How to Do It: Create a comprehensive package with all required files to publish the app. Use Gradio to design the interface. In the interface, specify the app's name, provide a brief description, and mention that your app only accepts PDFs with abstracts. Include examples of working PDFs in the app. Upload your app to Hugging Face Space and ensure it remains accessible throughout the grading period.
-What to Deliver: Upload a compressed folder with a .zip or .rar extension. The folder should contain all the files that you uploaded to your Hugging Face Space. Please ADD as first line of the app.py file the address of the Space running the app as a Python Comment (see the example below). The app should keep running in order to be tested at the moment of grading.
-#Install and import
-"""
-
-
-#from IPython.display import Audio
-from transformers import pipeline
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import gradio as gr
-
-import numpy as np
-
-import os
-
-"""# PDF Reader
-## Libraries + Code
-"""
-
 # To read the PDF
 import PyPDF2
-# To analyze the PDF layout and extract text
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
-# To extract text from tables in PDF
 import pdfplumber
-# To extract the images from the PDFs
 from PIL import Image
 from pdf2image import convert_from_path
-# To perform OCR to extract text from images
 import pytesseract
-# To remove the additional created files
 import os
+import torch
+import soundfile as sf
+from IPython.display import Audio
+from datasets import load_dataset
+from transformers import pipeline
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
+
+hf_name = 'pszemraj/led-large-book-summary'

-
+summarizer = pipeline(
+    "summarization",
+    hf_name,
+    device=0 if torch.cuda.is_available() else -1,
+)

 def text_extraction(element):
     # Extracting the text from the in-line text element
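Note on this hunk: the old Colab header and the `haining/scientific_abstract_simplification` and Bark imports are dropped in favor of a `pszemraj/led-large-book-summary` summarization pipeline, but `import gradio as gr` is also deleted even though `gr.Blocks` is still used further down. That alone raises a `NameError` at startup and is the most likely cause of the Space's runtime-error status. Below is a minimal standalone check of the new summarizer setup; the `sample` text is made up for illustration.

```python
# Smoke test for the summarization pipeline configured in this hunk.
import torch
from transformers import pipeline

hf_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    "summarization",
    hf_name,
    device=0 if torch.cuda.is_available() else -1,  # GPU if present, else CPU
)

# Hypothetical input, just to confirm the pipeline loads and runs.
sample = (
    "Large language models are trained on vast corpora of text and can "
    "perform many tasks without task-specific training."
)
print(summarizer(sample, min_length=1, max_length=30)[0]["summary_text"])
```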
@@ -67,60 +44,6 @@ def text_extraction(element):
     # Return a tuple with the text in each line along with its format
     return (line_text, format_per_line)

-# Create a function to crop the image elements from PDFs
-def crop_image(element, pageObj):
-    # Get the coordinates to crop the image from the PDF
-    [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
-    # Crop the page using coordinates (left, bottom, right, top)
-    pageObj.mediabox.lower_left = (image_left, image_bottom)
-    pageObj.mediabox.upper_right = (image_right, image_top)
-    # Save the cropped page to a new PDF
-    cropped_pdf_writer = PyPDF2.PdfWriter()
-    cropped_pdf_writer.add_page(pageObj)
-    # Save the cropped PDF to a new file
-    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
-        cropped_pdf_writer.write(cropped_pdf_file)
-
-# Create a function to convert the PDF to images
-def convert_to_images(input_file,):
-    images = convert_from_path(input_file)
-    image = images[0]
-    output_file = "PDF_image.png"
-    image.save(output_file, "PNG")
-
-# Create a function to read text from images
-def image_to_text(image_path):
-    # Read the image
-    img = Image.open(image_path)
-    # Extract the text from the image
-    text = pytesseract.image_to_string(img)
-    return text
-
-# Extracting tables from the page
-
-def extract_table(pdf_path, page_num, table_num):
-    # Open the pdf file
-    pdf = pdfplumber.open(pdf_path)
-    # Find the examined page
-    table_page = pdf.pages[page_num]
-    # Extract the appropriate table
-    table = table_page.extract_tables()[table_num]
-    return table
-
-# Convert table into the appropriate format
-def table_converter(table):
-    table_string = ''
-    # Iterate through each row of the table
-    for row_num in range(len(table)):
-        row = table[row_num]
-        # Remove the line breaker from the wrapped texts
-        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
-        # Convert the table into a string
-        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
-    # Removing the last line break
-    table_string = table_string[:-1]
-    return table_string
-
 def read_pdf(pdf_path):
     # create a PDF file object
     pdfFileObj = open(pdf_path, 'rb')
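This hunk deletes the figure-cropping, OCR, and table helpers wholesale; the matching `LTFigure`/`LTRect` branches go in the next hunk. For reference, the deleted `table_converter` rendered each table as pipe-delimited rows. A self-contained reproduction of its logic follows; the sample table is made up.

```python
# Reference copy of the deleted table_converter helper.
def table_converter(table):
    table_string = ''
    for row in table:
        # Replace line breaks inside wrapped cells and map None cells to 'None'
        cleaned_row = [
            item.replace('\n', ' ') if item is not None and '\n' in item
            else 'None' if item is None
            else item
            for item in row
        ]
        table_string += '|' + '|'.join(cleaned_row) + '|' + '\n'
    # Drop the trailing line break
    return table_string[:-1]

print(table_converter([['name', 'value'], ['alpha\nbeta', None]]))
# |name|value|
# |alpha beta|None|
```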
@@ -178,224 +101,94 @@ def read_pdf(pdf_path):
                     # Omit the text that appeared in a table
                     pass

-            # Check the elements for images
-            if isinstance(element, LTFigure):
-                # Crop the image from the PDF
-                crop_image(element, pageObj)
-                # Convert the cropped pdf to an image
-                convert_to_images('cropped_image.pdf')
-                # Extract the text from the image
-                image_text = image_to_text('PDF_image.png')
-                text_from_images.append(image_text)
-                page_content.append(image_text)
-                # Add a placeholder in the text and format lists
-                page_text.append('image')
-                line_format.append('image')
-
-            # Check the elements for tables
-            if isinstance(element, LTRect):
-                # If the first rectangular element
-                if first_element == True and (table_num+1) <= len(tables):
-                    # Find the bounding box of the table
-                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
-                    upper_side = element.y1
-                    # Extract the information from the table
-                    table = extract_table(pdf_path, pagenum, table_num)
-                    # Convert the table information in structured string format
-                    table_string = table_converter(table)
-                    # Append the table string into a list
-                    text_from_tables.append(table_string)
-                    page_content.append(table_string)
-                    # Set the flag as True to avoid the content again
-                    table_extraction_flag = True
-                    # Make it another element
-                    first_element = False
-                    # Add a placeholder in the text and format lists
-                    page_text.append('table')
-                    line_format.append('table')
-
-                # Check if we already extracted the tables from the page
-                if element.y0 >= lower_side and element.y1 <= upper_side:
-                    pass
-                elif not isinstance(page_elements[i+1][1], LTRect):
-                    table_extraction_flag = False
-                    first_element = True
-                    table_num+=1
-

         # Create the key of the dictionary
         dctkey = 'Page_'+str(pagenum)
         # Add the list of list as the value of the page key
         text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

-
-
-
-    try:
-
-        # Deleting the additional files created
-        os.remove('cropped_image.pdf')
-        os.remove('PDF_image.png')
-    finally:
-        return text_per_page
-
-"""#Functions
-##Extract abstract
-"""
+    # Closing the pdf file object
+    pdfFileObj.close()
+    return text_per_page

 def upload_file(files):
+    print("here")
     file_paths = [file.name for file in files]
     return file_paths

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+with gr.Blocks() as demo:
+    file_output = gr.File()
+    upload_button = gr.UploadButton("Click to Upload a File", file_types=[".pdf"])
+    upload_button.upload(upload_file, upload_button, file_output)
+
+pdf_path = file_output
+
+demo.launch(debug=True)
+
+text_per_page = read_pdf(pdf_path)
+
+Page_0 = text_per_page['Page_0']
+
+def nested_list_to_string(nested_list):
+    result = ''
+    for element in nested_list:
+        if isinstance(element, list):  # Check if the element is a list
+            result += nested_list_to_string(element)  # Recursively process the list
+        elif isinstance(element, str):  # Check if the element is a string
+            result += element  # Append the string to the result
+    return result
+
+Page_0 = text_per_page['Page_0']
+string_result = nested_list_to_string(Page_0)
+
+def extract_abstract(page_0):
+    def nested_list_to_string(nested_list):
+        result = ''
+        for element in nested_list:
+            if isinstance(element, list):  # Check if the element is a list
+                result += nested_list_to_string(element)  # Recursively process the list
+            elif isinstance(element, str):  # Check if the element is a string
+                result += element  # Append the string to the result
+        return result
+
+    # Convert the nested list into a single string
+    full_text = nested_list_to_string(page_0)
+
+    # Find the start of the 'Abstract' section and the end of it (start of 'Introduction')
+    start_index = full_text.find('Abstract')
+    end_index = full_text.find('Introduction')
+
+    # If both 'Abstract' and 'Introduction' are found, extract the text in between
+    if start_index != -1 and end_index != -1:
+        # Extract the text and remove the word 'Abstract'
+        abstract_text = full_text[start_index + len('Abstract'):end_index]
+        return abstract_text.strip()
     else:
-
-
-def summarize_abstract(path):
-
-    abstract_article = extract_abstract(path)
-
-    INSTRUCTION = "summarize, simplify, and contextualize in one sentence: "
-    tokenizer = AutoTokenizer.from_pretrained("haining/scientific_abstract_simplification")
-    model = AutoModelForSeq2SeqLM.from_pretrained("haining/scientific_abstract_simplification")
-    input_text = abstract_article
-    encoding = tokenizer(INSTRUCTION + input_text,
-                         max_length=672,
-                         padding='max_length',
-                         truncation=True,
-                         return_tensors='pt')
-
-    decoded_ids = model.generate(input_ids=encoding['input_ids'],
-                                 attention_mask=encoding['attention_mask'],
-                                 max_length=512,
-                                 top_p=.9,
-                                 do_sample=True)
-
-    summary=tokenizer.decode(decoded_ids[0], skip_special_tokens=True)
-
-    # Extract and print only the first sentence
-    first_sentence = summary.split('.')[0] + '.'
-    print(first_sentence)
-    return first_sentence
-
-def text_to_speech(sentence):
-
-    #sentence = summarize_abstract (path)
-
-    synthesiser = pipeline("text-to-speech", "suno/bark-small")
-
-    speech = synthesiser(sentence, forward_params={"do_sample": True})
-
-    audio_float32 = speech["audio"]
-    sr = speech["sampling_rate"]
-
-    #gr.Audio only accept a tuple(int, np.array(int16))
-    audio_int16 = (audio_float32 * 32767).astype(np.int16)
-    audio_reshaped = audio_int16.reshape(audio_int16.shape[1])
-
-    return sr, audio_reshaped
-
-def sum_audio(path):
-
-    sentence = summarize_abstract (path)
-
-    synthesiser = pipeline("text-to-speech", "suno/bark-small")
-
-    speech = synthesiser(sentence, forward_params={"do_sample": True})
-
-    audio_float32 = speech["audio"]
-    sr = speech["sampling_rate"]
-
-    #gr.Audio only accept a tuple(int, np.array(int16))
-    audio_int16 = (audio_float32 * 32767).astype(np.int16)
-    audio_reshaped = audio_int16.reshape(audio_int16.shape[1])
-
-    audio_tuple = (sr, audio_reshaped)
-
-    return sentence, audio_tuple
-
-"""# Uploading PDF File"""
-
-#from google.colab import files
-#uploaded = files.upload()
-
-
-"""#Gradio interface"""
-
-interface = gr.Blocks()
-
+        return "Abstract or Introduction section not found."

-
-with interface:
-    gr.Markdown("""
-# AbstracTalk
-This app let's you upload an article (you can only upload a PDF with an abstract).
-It reads the abstract and does not only summarize it in just one sentence,
-but also makes it simpler for anybody to understand. Moreover, it also provides
-an additional layer of accessibility through spoken versions of the text.
-If you are not satisfied with the given summary you can press again the button and have a new summary.
-Have fun and master knowledge with AbstracTalk!
-""")
+# Example usage
+Page_0 = text_per_page['Page_0']
+abstract_text = extract_abstract(Page_0)

-
-    with gr.Row():
-        with gr.Column():
-            uploaded_article = gr.File()
-
-        with gr.Column():
-            summarized_abstract = gr.Textbox("One-sentence Abstract")
-            talked_abstract = gr.Audio(type="numpy")
-    with gr.Row():
-        summary_button = gr.Button(value="Summarize Abstract", size="lg")
-        tts_button = gr.Button(value="Speak Abstract", size="lg")
-
-    gr.Markdown("## PDF Examples")
-    gr.Examples(
-        examples=[[os.path.join(os.path.abspath(""), "Article 7 Efficient Estimation of Word Representations in Vector Space.pdf")],
-                  [os.path.join(os.path.abspath(""), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")]],
-        inputs=uploaded_article,
-        outputs=[summarized_abstract, talked_abstract],
-        fn=sum_audio,
-        cache_examples = True,
-    )
+wall_of_text = abstract_text

-
+result = summarizer(
+    wall_of_text,
+    min_length=1,
+    max_length=30,
+    no_repeat_ngram_size=3,
+    encoder_no_repeat_ngram_size=3,
+    repetition_penalty=3.5,
+    num_beams=4,
+    early_stopping=True,
+)

-
+# Access the first element of the list (which is the dictionary) and then the value of 'summary_text'
+summary_string = result[0]['summary_text']

+print(summary_string)

-
-    summary_button.click(summarize_abstract, inputs=uploaded_article, outputs=summarized_abstract)
-    tts_button.click(text_to_speech, inputs=summarized_abstract, outputs=talked_abstract)
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

-
-
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
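Note on the new top-level flow: `pdf_path = file_output` binds the `gr.File` component object itself, not a filesystem path, so the module-level `read_pdf(pdf_path)` would fail even after the missing `gradio` import is restored; `demo.launch(debug=True)` also blocks before those lines run, and the `print("here")` looks like a leftover debug statement. Below is a hedged sketch of one way to rewire this, as an assumption rather than what the commit does: run the extraction inside the upload handler, which receives a real temp-file path.

```python
# Hypothetical rewiring (not part of the commit): process the PDF inside the
# upload event handler. Assumes read_pdf and extract_abstract defined above.
import gradio as gr

def handle_upload(file):
    # Gradio passes a tempfile wrapper; .name is its path on disk
    text_per_page = read_pdf(file.name)
    page_0 = text_per_page['Page_0']
    return file.name, extract_abstract(page_0)

with gr.Blocks() as demo:
    file_output = gr.File()
    abstract_box = gr.Textbox(label="Extracted abstract")
    upload_button = gr.UploadButton("Click to Upload a File", file_types=[".pdf"])
    upload_button.upload(handle_upload, upload_button, [file_output, abstract_box])

demo.launch(debug=True)
```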
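The slicing logic in the new `extract_abstract` can be sanity-checked without a PDF; the nested sample page below is made up to mimic the list-of-lists shape `read_pdf` stores under `Page_0`.

```python
# Self-contained check of extract_abstract's Abstract..Introduction slicing.
sample_page = [
    ['Some Paper Title\n'],
    ['Abstract\n', 'We study PDF parsing and summarization.\n'],
    ['Introduction\n', 'PDFs are hard to parse.\n'],
]
print(extract_abstract(sample_page))
# -> We study PDF parsing and summarization.
```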
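The `summarizer(...)` call pins its decoding behavior explicitly; annotated for reference, using the same values as the commit (all standard `transformers` generation kwargs):

```python
# The decoding settings used above, annotated.
generation_kwargs = dict(
    min_length=1,                    # lower bound on generated tokens
    max_length=30,                   # tight cap to keep roughly one sentence
    no_repeat_ngram_size=3,          # never repeat a 3-gram within the output
    encoder_no_repeat_ngram_size=3,  # never copy a 3-gram verbatim from the input
    repetition_penalty=3.5,          # penalize re-using tokens
    num_beams=4,                     # beam search over four candidates
    early_stopping=True,             # stop once all beams have finished
)
summary_string = summarizer(wall_of_text, **generation_kwargs)[0]['summary_text']
```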