FlavioBF committed on
Commit
c3b64eb
1 Parent(s): a205f19

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 1812_05944.pdf filter=lfs diff=lfs merge=lfs -text
1812_05944.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c92dfcf7c47d641419c1f6355eaf21e4e9c644475452ac00c73f2298333188f
3
+ size 3077936
app.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ================================================================
2
+ # TESTING VERSION
3
+ # ALL-IN-ONE CELL VERSION
4
+ # OF THE PROGRAM
5
+ # ================================================================
6
+ #
7
+ # -------------------------
8
+ # PDF
9
+ # -------------------------
10
+
11
# NOTE: the `!pip install ...` lines below are IPython/Colab shell escapes and
# are a SyntaxError in a plain .py file, so they are kept only as comments
# (same convention as the `#!apt ...` lines that follow).
# The Python packages are declared in requirements.txt; poppler-utils,
# tesseract-ocr and libtesseract-dev are declared in packages.txt.
#!pip install PyPDF2
#!pip install pdfminer.six
#!pip install pdfplumber
#!pip install pdf2image
#!pip install Pillow
#!pip install pytesseract
#!pip install poppler-utils
#!pip install tesseract-ocr
#!pip install libtesseract-dev

#!pip install fastapi
#!pip install -q torch
#!pip install -q transformers
#!pip install -q gradio
#!pip install ffmpeg
26
+
27
+
28
+ #!apt-get install poppler-utils
29
+ #!apt install tesseract-ocr
30
+ #!apt install libtesseract-dev
31
+
32
+
33
+ # To read the PDF
34
+ import PyPDF2
35
+ # To analyze the PDF layout and extract text
36
+ from pdfminer.high_level import extract_pages, extract_text
37
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
38
+ # To extract text from tables in PDF
39
+ import pdfplumber
40
+ # To extract the images from the PDFs
41
+ from PIL import Image
42
+ from pdf2image import convert_from_path
43
+ # To perform OCR to extract text from images
44
+ import pytesseract
45
+ # To remove the additional created files
46
+ import os
47
+
48
+ # -----------------------------------------------------------------------------
49
+ # Create a function to extract text
50
+
51
def text_extraction(element):
    """Return an element's text together with the formats found in it.

    Args:
        element: Layout element that provides ``get_text()`` and can be
            iterated to obtain its lines.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``format_per_line`` is a
        list of the distinct font names and font sizes of the characters in
        the element; it is built from a set, so its order is not defined.
    """
    text = element.get_text()

    # Collect font name and size of every character, line by line.
    formats = []
    for line in element:
        if not isinstance(line, LTTextContainer):
            continue
        for glyph in line:
            if isinstance(glyph, LTChar):
                formats.extend((glyph.fontname, glyph.size))

    # Duplicates are collapsed through a set before returning.
    return (text, list(set(formats)))
72
+
73
+ # Create a function to crop the image elements from PDFs
74
def crop_image(element, pageObj):
    """Crop a page to an element's bounding box and save it as a one-page PDF.

    The mediabox of ``pageObj`` is modified in place and the page is then
    written to ``cropped_image.pdf`` in the current working directory.

    Args:
        element: Layout element exposing ``x0``, ``y0``, ``x1`` and ``y1``.
        pageObj: PyPDF2 page object to crop; it is mutated.
    """
    # NOTE(review): lower_left receives y1 and upper_right receives y0,
    # exactly as in the previous implementation. Presumably the consumer
    # normalises the rectangle -- confirm before "fixing" the pairing.
    pageObj.mediabox.lower_left = (element.x0, element.y1)
    pageObj.mediabox.upper_right = (element.x1, element.y0)

    # Write the single cropped page out.
    writer = PyPDF2.PdfWriter()
    writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as out_file:
        writer.write(out_file)
86
+
87
+ # Create a function to convert the PDF to images
88
def convert_to_images(input_file,):
    """Render the first page of a PDF and save it as ``PDF_image.png``.

    Args:
        input_file: Path of the PDF to render.
    """
    # Only the first rendered page is kept.
    first_page = convert_from_path(input_file)[0]
    first_page.save("PDF_image.png", "PNG")
93
+
94
+ # Create a function to read text from images
95
def image_to_text(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file once OCR is done, so
    # repeated calls do not accumulate open handles.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
101
+
102
+
103
+ # Extracting tables from the page
104
+
105
def extract_table(pdf_path, page_num, table_num):
    """Extract one table from one page of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        page_num: 0-based index of the page to inspect.
        table_num: 0-based index of the table within that page.

    Returns:
        The table at ``table_num`` as returned by ``extract_tables()``.

    Raises:
        IndexError: If the page or table index is out of range.
    """
    # Open with a context manager so the PDF is closed after extraction
    # (this function is called once per table).
    with pdfplumber.open(pdf_path) as pdf:
        return pdf.pages[page_num].extract_tables()[table_num]
113
+
114
+ # Convert table into the appropriate format
115
def table_converter(table):
    """Render a table as pipe-delimited text, one line per row.

    Args:
        table: Sequence of rows; each row is a sequence of cells that are
            either strings or ``None``.

    Returns:
        A string with one ``|cell|cell|...|`` line per row, joined by
        newlines and without a trailing newline. ``None`` cells become the
        text ``'None'`` and line breaks inside a cell become spaces. An
        empty table yields ``''``.
    """
    rendered_rows = []
    for row in table:
        # Wrapped cell text must not break the one-line-per-row layout.
        cells = [
            'None' if cell is None else cell.replace('\n', ' ')
            for cell in row
        ]
        rendered_rows.append('|' + '|'.join(cells) + '|')
    return '\n'.join(rendered_rows)
127
+
128
+
129
+ # Extracting tables from the page
130
+
131
def extract_table(pdf_path, page_num, table_num):
    """Extract one table from one page of a PDF.

    NOTE: a definition with the same name appears earlier in this file;
    being the later one, this is the binding in effect at call time.

    Args:
        pdf_path: Path of the PDF file.
        page_num: 0-based index of the page to inspect.
        table_num: 0-based index of the table within that page.

    Returns:
        The table at ``table_num`` as returned by ``extract_tables()``.

    Raises:
        IndexError: If the page or table index is out of range.
    """
    # Open with a context manager so the PDF is closed after extraction
    # (this function is called once per table).
    with pdfplumber.open(pdf_path) as pdf:
        return pdf.pages[page_num].extract_tables()[table_num]
139
+
140
+ # Convert table into the appropriate format
141
def table_converter(table):
    """Render a table as pipe-delimited text, one line per row.

    NOTE: a definition with the same name appears earlier in this file;
    being the later one, this is the binding in effect at call time.

    Args:
        table: Sequence of rows; each row is a sequence of cells that are
            either strings or ``None``.

    Returns:
        A string with one ``|cell|cell|...|`` line per row, joined by
        newlines and without a trailing newline. ``None`` cells become the
        text ``'None'`` and line breaks inside a cell become spaces. An
        empty table yields ``''``.
    """
    rendered_rows = []
    for row in table:
        # Wrapped cell text must not break the one-line-per-row layout.
        cells = [
            'None' if cell is None else cell.replace('\n', ' ')
            for cell in row
        ]
        rendered_rows.append('|' + '|'.join(cells) + '|')
    return '\n'.join(rendered_rows)
153
+
154
+ # ..............................................................
155
+
156
def read_pdf(pdf_path):
    """Extract text, image text and tables from every page of a PDF.

    Page elements are visited from the top of the page downwards. Text
    elements are collected unless a table is currently being consumed,
    figures are cropped and passed through OCR, and the first rectangle of
    each detected table triggers extraction of that table.

    Progress is printed per page. Processing a figure writes
    ``cropped_image.pdf`` and ``PDF_image.png`` to the working directory
    (via ``crop_image`` and ``convert_to_images``); they are not removed.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` is the 0-based page index) to
        the list ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``.
    """
    text_per_page = {}

    # Both handles are opened once and are closed even if extraction fails.
    with open(pdf_path, 'rb') as pdfFileObj, pdfplumber.open(pdf_path) as pdf:
        pdfReaded = PyPDF2.PdfReader(pdfFileObj)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" +str(pagenum))

            # Per-page accumulators.
            pageObj = pdfReaded.pages[pagenum]
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []

            # Table-tracking state for this page.
            table_num = 0
            first_element = True
            table_extraction_flag = False
            tables = pdf.pages[pagenum].find_tables()
            # Vertical extent of the table being consumed; None until the
            # first table on this page has been located.
            lower_side = None
            upper_side = None

            # Elements ordered by their top edge, highest first (stable
            # for ties).
            page_elements = sorted(
                page._objs, key=lambda obj: obj.y1, reverse=True
            )

            for i, element in enumerate(page_elements):
                # Text: kept unless it belongs to a table being consumed.
                if isinstance(element, LTTextContainer):
                    if not table_extraction_flag:
                        line_text, format_per_line = text_extraction(element)
                        page_text.append(line_text)
                        line_format.append(format_per_line)
                        page_content.append(line_text)

                # Figures: crop, rasterise, OCR.
                if isinstance(element, LTFigure):
                    crop_image(element, pageObj)
                    convert_to_images('cropped_image.pdf')
                    image_text = image_to_text('PDF_image.png')
                    text_from_images.append(image_text)
                    page_content.append(image_text)
                    # Placeholders keep page_text and line_format aligned.
                    page_text.append('image')
                    line_format.append('image')

                # Rectangles: treated as table borders.
                if isinstance(element, LTRect):
                    if first_element and (table_num + 1) <= len(tables):
                        # Bounding box of the table in layout coordinates.
                        lower_side = page.bbox[3] - tables[table_num].bbox[3]
                        upper_side = element.y1
                        table = extract_table(pdf_path, pagenum, table_num)
                        table_string = table_converter(table)
                        text_from_tables.append(table_string)
                        page_content.append(table_string)
                        # Suppress the table's own text until it ends.
                        table_extraction_flag = True
                        first_element = False
                        page_text.append('table')
                        line_format.append('table')

                    # Without a located table there is nothing to compare
                    # against (previously an unbound-variable error).
                    inside_table = (
                        lower_side is not None
                        and element.y0 >= lower_side
                        and element.y1 <= upper_side
                    )
                    if not inside_table:
                        # A rectangle that ends the page has no successor
                        # (previously an IndexError); treat it like one
                        # followed by a non-rectangle.
                        next_is_rect = (
                            i + 1 < len(page_elements)
                            and isinstance(page_elements[i + 1], LTRect)
                        )
                        if not next_is_rect:
                            table_extraction_flag = False
                            first_element = True
                            table_num += 1

            dctkey = 'Page_' + str(pagenum)
            text_per_page[dctkey] = [
                page_text,
                line_format,
                text_from_images,
                text_from_tables,
                page_content,
            ]

    # The intermediate files are intentionally left in place:
    # os.remove('cropped_image.pdf')
    # os.remove('PDF_image.png')
    return text_per_page
270
+
271
+ # mount drive location
272
+
273
+ #from google.colab import drive
274
+ #drive.mount('/content/drive')
275
+
276
+ #pdf_path = 'C:/Users/Cristina/Documents/MDS/TERM1_AppliedArtificialIntelligence/Assesment3/NIPS-2015-hidden-technical-debt-in-machine-learning-systems-Paper.pdf'
277
# The sample PDFs are uploaded next to this script, so they are referenced by
# relative name instead of an absolute path on one developer's machine.
pdf_path = "hidden-technical-debt-in-machine-learning-systems-Paper.pdf"
pdf_path2 = "1812_05944.pdf"

# Parse the first sample document into its per-page record.
text_per_page = read_pdf(pdf_path)

# Record of the first page (page keys are 0-based).
page_1 = text_per_page['Page_0']
287
+
288
+ # ============================================================================================
289
+
290
+ # picking up the abstract from the first page content
291
# Collect every text line of the first page from the "Abstract" heading up
# to (not including) the "1 Introduction" heading.
# page_1[0] is the list of text lines; page_1 itself is the per-page record,
# so the loop must run over page_1[0], not over range(len(page_1)).
flag = False
abstract_lines = []

for line_text in page_1[0]:
    heading = line_text.strip()
    if heading == "Abstract":
        flag = True
    if heading == "1 Introduction":
        flag = False
    if flag:
        abstract_lines.append(line_text)

# abstract_sect contains the Abstract section content
abstract_sect = "".join(abstract_lines)
302
+
303
+
304
+ from transformers import pipeline
305
+
306
# Build the summarisation pipeline and condense the abstract.
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
summary = summarizer(abstract_sect)
# The first result's "summary_text" entry holds the generated summary.
summary_text = summary[0].get("summary_text")
print(summary_text)
310
+
311
+
312
+
313
+ # =======================================
314
+
315
+ import gradio as gr
316
+ from transformers import pipeline, AutoProcessor, AutoModel
317
+ # =======================================
318
+ #
319
+ # =======================================
320
# Processor and model are created on first use and then reused, instead of
# being reloaded on every call.
_BARK_COMPONENTS = {}


def _load_bark():
    """Return the cached ``(processor, model)`` pair for suno/bark-small."""
    if not _BARK_COMPONENTS:
        _BARK_COMPONENTS["processor"] = AutoProcessor.from_pretrained(
            "suno/bark-small"
        )
        _BARK_COMPONENTS["model"] = AutoModel.from_pretrained(
            "suno/bark-small"
        )
    return _BARK_COMPONENTS["processor"], _BARK_COMPONENTS["model"]


def sentence_to_audio(summary_text):
    """Synthesise speech for a piece of text.

    Args:
        summary_text: Text to be spoken.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is the
        generated output moved to CPU, converted to a NumPy array and
        squeezed.
    """
    processor, model = _load_bark()
    inputs = processor(
        text=summary_text,
        return_tensors="pt",
    )
    # Sampling is enabled, so repeated calls may produce different audio.
    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate
    return sampling_rate, speech_values.cpu().numpy().squeeze()
331
+
332
+
333
# Smoke test of the text-to-speech step; the result is discarded.
summary_txt="It is dangerous to think of machine learning as a free-to-use toolkit, as it is common to incur ongoing maintenance costs in real-world ML systems"
sentence_to_audio(summary_txt)

# The sample PDFs are uploaded next to this script, so they are referenced by
# relative name instead of an absolute path on one developer's machine.
pdf_path = "hidden-technical-debt-in-machine-learning-systems-Paper.pdf"
pdf_path2 = "1812_05944.pdf"


def pdf_to_audio(pdf_file):
    """Summarise the abstract of a PDF and return the summary as speech.

    The interface below takes a file and produces audio, while
    ``sentence_to_audio`` expects text; this function bridges the two.

    Args:
        pdf_file: The uploaded file as handed over by Gradio.

    Returns:
        The ``(sampling_rate, waveform)`` tuple of ``sentence_to_audio``.

    Raises:
        ValueError: If no "Abstract" section is found on the first page.
    """
    # NOTE(review): depending on the Gradio version the file input seems to
    # arrive either as a path string or as an object whose ``name`` is the
    # path -- both are accepted here; confirm against the installed version.
    path = getattr(pdf_file, "name", pdf_file)

    # First item of a page record is the list of text lines of that page.
    first_page_lines = read_pdf(path)['Page_0'][0]

    # Keep the lines from "Abstract" up to (not including) "1 Introduction".
    in_abstract = False
    collected = []
    for line_text in first_page_lines:
        heading = line_text.strip()
        if heading == "Abstract":
            in_abstract = True
        if heading == "1 Introduction":
            in_abstract = False
        if in_abstract:
            collected.append(line_text)

    abstract = "".join(collected)
    if not abstract.strip():
        raise ValueError(
            "No 'Abstract' section found on the first page of the PDF"
        )

    summary = summarizer(abstract)[0].get("summary_text")
    return sentence_to_audio(summary)


demo = gr.Interface(fn=pdf_to_audio, inputs="file", outputs="audio",examples=[pdf_path,pdf_path2])
demo.launch(share=True)
hidden-technical-debt-in-machine-learning-systems-Paper.pdf ADDED
Binary file (166 kB). View file
 
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ poppler-utils
2
+ tesseract-ocr
3
+ libtesseract-dev
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ gradio
3
+ torch
4
+ transformers
5
+ ffmpeg
6
+ PyPDF2
7
+ pdfminer.six
8
+ pdfplumber
9
+ pdf2image
10
+ Pillow
11
+ pytesseract