# https://huggingface.co/spaces/rashisinghal/ai_speech_application

# Here are the dependencies (install once) and the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""
import gradio as gr
import fitz  # PyMuPDF
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from datasets import load_dataset
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


# Here is the code: read the PDF, locate the Abstract, summarise it, and synthesise speech


def pdf_to_speech(pdf_path):
    # Depending on the Gradio version, a "file" input arrives either as a filepath string or as a
    # temporary-file object with a .name attribute; handle both before opening the document.
    pdf_path = getattr(pdf_path, "name", pdf_path)

    # "doc" is a PyMuPDF Document object representing the whole file. Everything we need,
    # including the text, will be pulled from it.
    doc = fitz.open(pdf_path)

# We need to isolate the various sections of each page in order to find the Abstract paragraph.
# This can be done by passing the parameter "blocks" to the get_text() method.
# The output is a list of tuples, one per block, each of the form:
# (x0, y0, x1, y1, "lines in the block", block_no, block_type)
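# For example, a single block tuple might look like this (hypothetical values):
#     (56.7, 72.3, 538.9, 132.1, "ABSTRACT\nWe propose a method for ...\n", 2, 0)
# where the final 0 marks a text block (an image block has block_type 1).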

    
    # Since our PDF is a multi-page document, we loop over the pages, accumulating the plain text
    # and the block tuples of every page (the span-level extraction below is what the rest of the
    # pipeline actually uses).
    text = ""
    blocks = []
    for page in doc:
        text += page.get_text()
        blocks.extend(page.get_text("blocks"))

# ANALYZING THE TEXT TO EXTRACT THE ABSTRACT

# A span is an inline container that marks up part of a text or document; in short, a span is a
# small chunk of text with uniform styling. To get the spans from the PDF, we pass the parameter
# 'dict' to the get_text() method of each page. The resulting "block_dict" holds detailed
# information about every span in the document.
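# An individual span dict looks roughly like this (illustrative values, keys abridged):
#     {'size': 9.96, 'flags': 0, 'font': 'Times-Roman',
#      'bbox': (56.7, 96.2, 292.9, 108.1), 'text': 'We propose a method for ...'}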


    block_dict = {}
    for page_num, page in enumerate(doc, start=1):  # Iterate over all pages in the document
        file_dict = page.get_text('dict')           # Get the page dictionary
        block_dict[page_num] = file_dict['blocks']  # Store the block information per page
    

# Next we retrieve the spans and store them in a DataFrame. The code loops over the pages, blocks,
# and lines of the document, and then over every span in each line. Of the span properties we only
# care about the bbox (bounding box), size, font, and text.


    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:  # type 0 is a text block (type 1 is an image block)
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_bold = "bold" in span_font.lower()
                        is_upper = re.sub(r"[\(\[].*?[\)\]]", "", text).isupper()
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])

    span_scores = []
    special = r'[(_:/,#%\=@)]'  # spans containing these special characters get no style bonus
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)


# From this, we want to know the number of unique text styles in the document and how often each occurs.

    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = dict(zip(values, counts))  # score -> number of spans with that score


# Now we can add a 'tag' column to the span DataFrame. The score that occurs most often is taken
# to be the paragraph style ('p'); larger scores become headings ('h1', 'h2', ...) and smaller
# ones sub-text ('s1', 's2', ...).
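# For example (hypothetical sizes): if the rounded scores present are 14, 11 and 9, and 9 is by far
# the most frequent, the loop below produces the mapping {14: 'h1', 11: 'h2', 9: 'p'}.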

    p_size = max(style_dict, key=style_dict.get)  # the most common score = paragraph font size
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        elif size > p_size:
            tag[size] = 'h{0}'.format(idx)
        else:
            tag[size] = 's{0}'.format(idx)
        

    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags

# We now know which spans are headings and which are body content. This is very useful for
# extraction, since we want all paragraphs below a heading to be grouped together. We build a new
# DataFrame that stores the text per heading, so information can be pulled out by heading name.

    headings_list = []
    text_list = []
    tmp = []

    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            # A new heading: close off the text collected so far and start a new group
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]  # drop the text that appears before the first heading
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])

    # Extract the 'content' of the row whose 'heading' is 'Abstract', i.e. the abstract paragraph.
    # Note: .item() assumes the document has exactly one heading literally named 'Abstract'.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()

    # Use the summarisation model pszemraj/long-t5-tglobal-base-sci-simplify in a pipeline
    # to generate a summary of the abstract.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)

    # The pipeline returns a list of dicts; join their 'summary_text' fields into a single string.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])

    # Tokenize the input with the processor. The input is the summary string generated above.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
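    # Note: every call to pdf_to_speech reloads the summariser, the SpeechT5 processor and model,
    # and the vocoder below. For a deployed app it would usually be preferable to load them once
    # at module level and reuse them across requests.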

    inputs = processor(text=str_summary, return_tensors="pt")

    # Speaker embeddings (x-vectors) control the characteristics of the synthesised voice;
    # index 7306 of the CMU ARCTIC x-vector set is the voice used in the official SpeechT5 examples.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # The HiFi-GAN vocoder converts the model's spectrogram output into an audible waveform.
    # Passing it to generate_speech() produces the speech for the summarised abstract in one step.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # SpeechT5 generates 16 kHz audio; Gradio's audio output accepts a (sample_rate, waveform) tuple.
    sr = 16000
    return (sr, speech.numpy())
    # In a notebook the result could be played with IPython.display.Audio(speech, rate=16000)
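# A quick local sanity check (hypothetical file name, assuming "paper.pdf" contains a heading
# literally named "Abstract"):
#
#     sr, audio = pdf_to_speech("paper.pdf")
#     sf.write("abstract_summary.wav", audio, sr)  # write the waveform to disk with soundfile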


# The Gradio Interface wires the function to a file upload input and an audio output.
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that contains an Abstract section, summarises it, and converts the summary to speech. Click to upload a PDF with an abstract.",
                   theme="soft")

app.launch()
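# When running locally, app.launch(share=True) would additionally create a temporary public link.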