# NOTE: the lines that preceded the code here were web-viewer residue (a
# file-size banner, commit hashes and a line-number gutter), not Python source.
# To read the PDF
import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import os
import torch
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Checkpoint id handed to the summarization pipeline below.
hf_name = 'pszemraj/led-large-book-summary'

# Build the pipeline once, when the module is executed. Device index 0 is
# used when torch reports CUDA as available, -1 otherwise.
summarizer = pipeline(
    task="summarization",
    model=hf_name,
    device=-1 if not torch.cuda.is_available() else 0,
)

def text_extraction(element):
    """Return the text of a layout element together with the formats it uses.

    Args:
        element: A text element of the page layout. It must provide
            ``get_text()`` and be iterable; children that are
            ``LTTextContainer`` instances are scanned for ``LTChar``
            characters.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``line_text`` is whatever
        ``element.get_text()`` returns. ``format_per_line`` is a flat list
        holding each distinct font name and font size found on the element's
        characters, in order of first appearance.
    """
    line_text = element.get_text()

    # Font names and font sizes share one flat list, so both attributes of
    # every character are collected into it.
    line_formats = []
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                line_formats.append(character.fontname)
                line_formats.append(character.size)

    # De-duplicate while keeping first-seen order. list(set(...)) yields the
    # same values but in hash order, which for strings can change from one
    # interpreter run to the next.
    format_per_line = list(dict.fromkeys(line_formats))

    return (line_text, format_per_line)

def read_pdf(pdf_path):
    """Extract the text elements of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict keyed ``'Page_<n>'`` (zero-based page number). Each value is
        the list ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``. ``page_text`` and ``page_content``
        hold the text of each text element, ``line_format`` the formats
        returned by ``text_extraction`` for the same elements. The image and
        table lists are always empty: nothing in this function fills them.
    """
    text_per_page = {}

    # Both handles are opened once and released by the ``with`` block, even
    # when extraction fails. Previously the PyPDF2 file object was closed
    # inside the page loop (after the first page, while the reader was still
    # indexed for later pages) and a new pdfplumber document was opened for
    # every page and never closed.
    # NOTE(review): neither handle feeds the text-only extraction below; they
    # are kept so that files these libraries cannot open still fail here, as
    # before. Drop them if image/table extraction is not going to be added.
    with open(pdf_path, 'rb') as pdf_file, pdfplumber.open(pdf_path):
        PyPDF2.PdfReader(pdf_file)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" + str(pagenum))

            # Order the layout elements by their top edge (y1), highest
            # first. sorted() is stable, so elements with equal y1 keep the
            # order in which the page yields them.
            elements = sorted(page, key=lambda item: item.y1, reverse=True)

            page_text = []
            line_format = []
            for element in elements:
                # Only text containers are extracted; every other kind of
                # layout element is skipped.
                if isinstance(element, LTTextContainer):
                    line_text, format_per_line = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)

            # The two empty lists keep the five-slot layout of the per-page
            # record; page_content is a separate list with the same items as
            # page_text.
            text_per_page['Page_' + str(pagenum)] = [
                page_text, line_format, [], [], list(page_text),
            ]

    return text_per_page

def upload_file(files):
    """Return the ``name`` attribute of every uploaded file object.

    Args:
        files: Iterable of objects exposing a ``name`` attribute
            (presumably the file wrappers Gradio passes to the upload
            callback - confirm against the caller).

    Returns:
        A list with one ``name`` per item, in input order.
    """
    print("here")
    collected = []
    for uploaded in files:
        collected.append(uploaded.name)
    return collected

# Minimal Gradio UI: an upload button restricted to ".pdf" files whose uploads
# are passed through upload_file and shown in a File component.
# NOTE(review): `gr` is not imported in the visible part of this file
# (expected `import gradio as gr`) - unless it is bound elsewhere, this line
# raises NameError.
with gr.Blocks() as demo:
    file_output = gr.File()
    upload_button = gr.UploadButton("Click to Upload a File", file_types=[".pdf"])
    # On upload, call upload_file with the button's value and send the
    # returned list to file_output.
    upload_button.upload(upload_file, upload_button, file_output)
    
    # NOTE(review): this binds the gr.File component object itself, not the
    # path of an uploaded file.
    pdf_path = file_output

# NOTE(review): presumably this call blocks while the server runs, so the
# statements below would only execute after it stops - confirm.
demo.launch(debug=True)

# NOTE(review): pdf_path is still the gr.File component here, while read_pdf
# passes its argument to open(); this call looks like it cannot succeed as
# written and probably belongs inside the upload callback - confirm.
text_per_page = read_pdf(pdf_path)

# Per-page record of the first page (pages are keyed 'Page_<n>' from 0).
Page_0 = text_per_page['Page_0']

def nested_list_to_string(nested_list):
    """Concatenate every string found in an arbitrarily nested list.

    Strings are joined in depth-first order without a separator. Nested
    lists are processed recursively; items that are neither ``list`` nor
    ``str`` (numbers, tuples, ...) are ignored.
    """
    pieces = []
    for item in nested_list:
        if isinstance(item, str):
            pieces.append(item)
        elif isinstance(item, list):
            # Descend into the sub-list and keep its flattened text.
            pieces.append(nested_list_to_string(item))
    return ''.join(pieces)

# Re-bind the first page's record and flatten every string it contains into
# one piece of text.
Page_0 = text_per_page['Page_0']
string_result = nested_list_to_string(text_per_page['Page_0'])

def extract_abstract(page_0):
    """Return the text between the 'Abstract' and 'Introduction' markers.

    Args:
        page_0: Arbitrarily nested list. Every string found in it is
            concatenated in depth-first order; items that are neither a
            list nor a string are ignored.

    Returns:
        The stripped text between the first 'Abstract' and the first
        'Introduction' that follows it, or the message
        "Abstract or Introduction section not found." when either marker is
        missing.
    """
    not_found = "Abstract or Introduction section not found."

    def _flatten(nested_list):
        # Join all strings of the nested list, recursing into sub-lists.
        pieces = []
        for element in nested_list:
            if isinstance(element, list):
                pieces.append(_flatten(element))
            elif isinstance(element, str):
                pieces.append(element)
        return ''.join(pieces)

    full_text = _flatten(page_0)

    start_index = full_text.find('Abstract')
    if start_index == -1:
        return not_found
    abstract_start = start_index + len('Abstract')

    # Search for 'Introduction' only after the abstract heading. Searching
    # from the start of the text picked up earlier occurrences (e.g. in the
    # title) and produced an empty slice.
    end_index = full_text.find('Introduction', abstract_start)
    if end_index == -1:
        return not_found

    return full_text[abstract_start:end_index].strip()

# Example usage
# Look up the first page's record again and pull the abstract out of it.
Page_0 = text_per_page['Page_0']
abstract_text = extract_abstract(Page_0)

# NOTE(review): when a marker is missing, extract_abstract returns its
# "not found" message, and that message is what gets summarized below -
# confirm whether that case should be skipped instead.
wall_of_text = abstract_text

# Run the summarization pipeline on the abstract.
# NOTE(review): the keyword arguments are presumably forwarded to the model's
# text-generation call; max_length=30 would then cap the summary at 30 tokens -
# confirm against the transformers documentation.
result = summarizer(
    wall_of_text,
    min_length=1,
    max_length=30,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
)

# Access the first element of the list (which is the dictionary) and then the value of 'summary_text'
summary_string = result[0]['summary_text']

print(summary_string)

# NOTE(review): this repeats an import already present in the import block at
# the top of the file; it is redundant but harmless.
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Load the processor and the text-to-speech model from the
# "microsoft/speecht5_tts" checkpoint.
# NOTE(review): neither object is used in the visible part of the file -
# presumably the speech-synthesis step follows or is unfinished; confirm.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")