Mishmosh committed on
Commit 8bbb2aa · 1 Parent(s): 31c5dc7

Update app.py

Files changed (1)
app.py +281 -2
app.py CHANGED
@@ -1,4 +1,283 @@
- #test
+ #temp
+ # https://huggingface.co/spaces/Mishmosh/MichelleAssessment3
+
+ # NOTE: the RUN/shell lines below are Dockerfile directives, not valid Python,
+ # so they are commented out here; they belong in the Space's Dockerfile.
+ # Install Rust
+ #RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+ #RUN python -m pip install --upgrade pip
+ #python -m pip install --upgrade pip
+
+ #pip install --upgrade pip
+ #RUN pip install --no-cache-dir -r requirements.txt
+ #RUN pip install --use-feature=in-tree-build tokenizers
+
+ #!pip install PyPDF2
+ #!pip install sentencepiece
+ #!pip install pdfminer.six
+ #!pip install pdfplumber
+ #!pip install pdf2image
+ #!pip install Pillow
+ #!pip install pytesseract
+ # @title
+ #!apt-get install poppler-utils
+ #!apt install tesseract-ocr
+ #!apt install libtesseract-dev
+ import PyPDF2
+ from pdfminer.high_level import extract_pages, extract_text
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+ import pdfplumber
+ from PIL import Image
+ from pdf2image import convert_from_path
+ import pytesseract
+ import os
+
+ def text_extraction(element):
+     # Extract the text from the in-line text element
+     line_text = element.get_text()
+
+     # Find the formats of the text:
+     # initialize the list with all the formats that appear in the line of text
+     line_formats = []
+     for text_line in element:
+         if isinstance(text_line, LTTextContainer):
+             # Iterate through each character in the line of text
+             for character in text_line:
+                 if isinstance(character, LTChar):
+                     # Append the font name of the character
+                     line_formats.append(character.fontname)
+                     # Append the font size of the character
+                     line_formats.append(character.size)
+     # Find the unique font sizes and names in the line
+     format_per_line = list(set(line_formats))
+
+     # Return a tuple with the text in each line along with its format
+     return (line_text, format_per_line)
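For reference, each call yields the line's text plus the set of font names and sizes seen in it. A minimal usage sketch, assuming the helper above is defined and a local test.pdf exists:

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    # Print (text, formats) for every text line on the first page.
    for page_layout in extract_pages('test.pdf'):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                print(repr(line_text), format_per_line)
        break  # first page only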
+ # @title
+ # Create a function to crop the image elements from PDFs
+ def crop_image(element, pageObj):
+     # Get the coordinates to crop the image from the PDF
+     # (in pdfminer, y0 is the bottom edge and y1 is the top edge)
+     [image_left, image_bottom, image_right, image_top] = [element.x0, element.y0, element.x1, element.y1]
+     # Crop the page using coordinates (left, bottom, right, top)
+     pageObj.mediabox.lower_left = (image_left, image_bottom)
+     pageObj.mediabox.upper_right = (image_right, image_top)
+     # Save the cropped page to a new PDF
+     cropped_pdf_writer = PyPDF2.PdfWriter()
+     cropped_pdf_writer.add_page(pageObj)
+     # Save the cropped PDF to a new file
+     with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
+         cropped_pdf_writer.write(cropped_pdf_file)
+
+ # Create a function to convert the PDF to images
+ def convert_to_images(input_file):
+     images = convert_from_path(input_file)
+     image = images[0]
+     output_file = "PDF_image.png"
+     image.save(output_file, "PNG")
+
+ # Create a function to read text from images
+ def image_to_text(image_path):
+     # Read the image
+     img = Image.open(image_path)
+     # Extract the text from the image
+     text = pytesseract.image_to_string(img)
+     return text
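These three helpers are chained by read_pdf further below: crop the figure out of the page, rasterize the cropped PDF, then OCR the image. A sketch of the flow for a single figure element (the element and page names are illustrative; the intermediate paths are the hard-coded ones above):

    # OCR one figure from an open page:
    crop_image(figure_element, page_obj)       # writes cropped_image.pdf
    convert_to_images('cropped_image.pdf')     # writes PDF_image.png
    figure_text = image_to_text('PDF_image.png')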
+ # @title
+ # Extracting tables from the page
+
+ def extract_table(pdf_path, page_num, table_num):
+     # Open the pdf file
+     pdf = pdfplumber.open(pdf_path)
+     # Find the examined page
+     table_page = pdf.pages[page_num]
+     # Extract the appropriate table
+     table = table_page.extract_tables()[table_num]
+     return table
+
+ # Convert a table into the appropriate string format
+ def table_converter(table):
+     table_string = ''
+     # Iterate through each row of the table
+     for row_num in range(len(table)):
+         row = table[row_num]
+         # Remove line breaks from wrapped cell text; render empty cells as 'None'
+         cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
+         # Append the row to the string as a pipe-delimited line
+         table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
+     # Remove the last line break
+     table_string = table_string[:-1]
+     return table_string
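A small usage sketch of the two table helpers, assuming the first page of test.pdf contains at least one table:

    # Pull the first table from page 0 and render it as pipe-delimited text.
    table = extract_table('test.pdf', page_num=0, table_num=0)
    print(table_converter(table))
    # e.g. a 2x2 table [['a', 'b'], ['1', None]] prints as:
    # |a|b|
    # |1|None|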
+ # @title
+ def read_pdf(pdf_path):
+     # Create a PDF file object
+     pdfFileObj = open(pdf_path, 'rb')
+     # Create a PDF reader object (PyPDF2 3.x API; PdfFileReader is the removed
+     # legacy name, and the rest of this file already uses the 3.x API)
+     pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+
+     # Create the dictionary to extract text from each page
+     text_per_page = {}
+     # Extract the pages from the PDF
+     for pagenum, page in enumerate(extract_pages(pdf_path)):
+         print("Processing Page_" + str(pagenum))
+         # Initialize the variables needed for the text extraction from the page
+         pageObj = pdfReaded.pages[pagenum]
+         page_text = []
+         line_format = []
+         text_from_images = []
+         text_from_tables = []
+         page_content = []
+         # Initialize the number of the examined tables
+         table_num = 0
+         first_element = True
+         table_extraction_flag = False
+         # Open the pdf file
+         pdf = pdfplumber.open(pdf_path)
+         # Find the examined page
+         page_tables = pdf.pages[pagenum]
+         # Find the number of tables on the page
+         tables = page_tables.find_tables()
+
+         # Find all the elements
+         page_elements = [(element.y1, element) for element in page._objs]
+         # Sort all the elements as they appear in the page (top to bottom)
+         page_elements.sort(key=lambda a: a[0], reverse=True)
+
+         # Walk through the elements that compose the page
+         for i, component in enumerate(page_elements):
+             # Extract the position of the top side of the element in the PDF
+             pos = component[0]
+             # Extract the element of the page layout
+             element = component[1]
+
+             # Check if the element is a text element
+             if isinstance(element, LTTextContainer):
+                 # Check if the text appeared in a table
+                 if table_extraction_flag == False:
+                     # Use the function to extract the text and format for each text element
+                     (line_text, format_per_line) = text_extraction(element)
+                     # Append the text of each line to the page text
+                     page_text.append(line_text)
+                     # Append the format of each line that contains text
+                     line_format.append(format_per_line)
+                     page_content.append(line_text)
+                 else:
+                     # Omit text that appeared in a table
+                     pass
+
+             # Check the elements for images
+             if isinstance(element, LTFigure):
+                 # Crop the image from the PDF
+                 crop_image(element, pageObj)
+                 # Convert the cropped PDF to an image
+                 convert_to_images('cropped_image.pdf')
+                 # Extract the text from the image
+                 image_text = image_to_text('PDF_image.png')
+                 text_from_images.append(image_text)
+                 page_content.append(image_text)
+                 # Add a placeholder in the text and format lists
+                 page_text.append('image')
+                 line_format.append('image')
+
+             # Check the elements for tables
+             if isinstance(element, LTRect):
+                 # If this is the first rectangular element
+                 if first_element == True and (table_num + 1) <= len(tables):
+                     # Find the bounding box of the table
+                     lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                     upper_side = element.y1
+                     # Extract the information from the table
+                     table = extract_table(pdf_path, pagenum, table_num)
+                     # Convert the table information into a structured string format
+                     table_string = table_converter(table)
+                     # Append the table string to a list
+                     text_from_tables.append(table_string)
+                     page_content.append(table_string)
+                     # Set the flag to True to avoid capturing the same content again
+                     table_extraction_flag = True
+                     # Make it another element
+                     first_element = False
+                     # Add a placeholder in the text and format lists
+                     page_text.append('table')
+                     line_format.append('table')
+
+                 # Check if we already extracted the tables from the page
+                 if element.y0 >= lower_side and element.y1 <= upper_side:
+                     pass
+                 elif not isinstance(page_elements[i + 1][1], LTRect):
+                     table_extraction_flag = False
+                     first_element = True
+                     table_num += 1
+
+         # Create the key of the dictionary
+         dctkey = 'Page_' + str(pagenum)
+         # Add the list of lists as the value of the page key
+         text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
+
+     # Close the pdf file object
+     pdfFileObj.close()
+
+     # Delete the temporary files created during extraction
+     #os.remove('cropped_image.pdf')
+     #os.remove('PDF_image.png')
+     return text_per_page
+
+ # Google Drive (only needed when running in Colab)
+ #from google.colab import drive
+ #drive.mount('/content/drive')
+
+ # Read the PDF
+ pdf_path = 'test.pdf' #article 11
+ #pdf_path = 'https://huggingface.co/spaces/Mishmosh/MichelleAssessment3/blob/main/test.pdf' #article 11
+
+ text_per_page = read_pdf(pdf_path)
+
+ # This section finds the abstract. The plan was to detect the end of the abstract by
+ # matching the font size of the 'Abstract' heading, but by this point the formatting
+ # information has already been discarded, so instead only the single paragraph that
+ # follows the heading is extracted. If an abstract spans more than one paragraph,
+ # this will not capture all of it.
+ abstract_from_pdf = ''   # holds the text of the abstract
+ found_abstract = False   # has the abstract been found yet
+ for key in text_per_page.keys():          # go through each page in the dictionary
+     current_item = text_per_page[key]     # the lists stored for this page
+     for paragraphs in current_item:       # go through each list
+         for index, paragraph in enumerate(paragraphs):   # go through each line
+             if 'Abstract\n' == paragraph:                # is this line the 'Abstract' heading
+                 found_abstract = True
+                 abstract_from_pdf = paragraphs[index + 1]  # take the next paragraph
+         if found_abstract:
+             break
+ print(abstract_from_pdf)
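The font-size idea mentioned in the comment above could still be implemented, since read_pdf keeps each line's font names and sizes in line_format. A rough sketch (a hypothetical helper, assuming the 'Abstract' heading's exact format marks the next section heading too):

    def abstract_by_font(text_per_page):
        for page_text, line_format, *_ in text_per_page.values():
            for i, line in enumerate(page_text):
                if line == 'Abstract\n':
                    heading_format = set(line_format[i])
                    body = []
                    # Collect lines until one reappears with the heading's format,
                    # taken here to be the start of the next section.
                    for text, fmt in zip(page_text[i + 1:], line_format[i + 1:]):
                        if isinstance(fmt, list) and set(fmt) == heading_format:
                            return ''.join(body)
                        body.append(text)
                    return ''.join(body)
        return ''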
+
+ from transformers import pipeline
+ summarizer = pipeline("summarization", model="ainize/bart-base-cnn")
+ #summarizer = pipeline("summarization", model="linydub/bart-large-samsum") # several models were tried and the best one was selected
+ #summarizer = pipeline("summarization", model="slauw87/bart_summarisation")
+ #summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ #summarizer = pipeline("summarization", model="google/pegasus-cnn_dailymail")
+ #print(summarizer(abstract_from_pdf, max_length=50, min_length=5, do_sample=False))
+ summarized_text = summarizer(abstract_from_pdf)
+ print(summarized_text)
+ #summary_of_abstract = str(summarizer)
+ #type(summary_of_abstract)
+ #print(summary_of_abstract)
+
+ # The aim of this section is to reduce the summary to a single sentence by repeatedly
+ # summarizing the summary while it is still longer than one sentence. Many models were
+ # tried, and none of them reliably compresses the text down to one sentence on its own;
+ # no way was found to tell the summarization model to emit exactly one sentence.
+ from transformers import pipeline
+ # The pipeline returns a list with one dict per input, so take its 'summary_text' field
+ summarized_text_list_list = summarized_text[0]['summary_text']
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ #print(summarizer)
+ number_of_sentences = summarized_text_list_list.count('.')
+ print(number_of_sentences)
+ while number_of_sentences > 1:
+     print(number_of_sentences)
+     summarized_text_list_list = summarizer(summarized_text_list_list)[0]['summary_text']
+     # Recount rather than decrement: one pass may remove more than one sentence
+     number_of_sentences = summarized_text_list_list.count('.')
+     print(summarized_text_list_list)
+ print(number_of_sentences)
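If truncation is acceptable, a simpler way to guarantee a single sentence is to keep only the first sentence of the summary instead of looping; a minimal sketch:

    # Keep everything up to and including the first full stop.
    one_sentence_summary = summarized_text_list_list.split('.')[0].strip() + '.'
    print(one_sentence_summary)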
+
+
  #text to speech
  #!pip install git+https://github.com/huggingface/transformers.git
  #!pip install datasets sentencepiece
@@ -10,7 +289,7 @@ from transformers import pipeline
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- text = "The future belongs to those who believe in the beauty of their dreams."
+ #text = "The future belongs to those who believe in the beauty of their dreams."
  #text = (summarized_text_list_list)

  inputs = processor(text=summarized_text_list_list, return_tensors="pt")
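As committed, the snippet stops after tokenization. SpeechT5 also needs a speaker embedding and a vocoder to produce audio; a sketch following the pattern from the SpeechT5 model card (the x-vector index 7306 is the illustrative one used there):

    import torch
    import soundfile as sf
    from datasets import load_dataset
    from transformers import SpeechT5HifiGan

    # Load a speaker embedding (x-vector) and the matching HiFi-GAN vocoder
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Generate the waveform and save it at SpeechT5's 16 kHz sample rate
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)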