# https://huggingface.co/spaces/rashisinghal/ai_speech_application

# Here are the dependencies (install once) and the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""
import gradio as gr
import fitz  # PyMuPDF
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from datasets import load_dataset
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


# Here is the code: read the PDF, locate the Abstract, summarise it, and synthesise speech


def pdf_to_speech(pdf_path):
    # Depending on the Gradio version, a "file" input arrives either as a filepath string or as a
    # temporary-file object with a .name attribute; handle both before opening the document.
    pdf_path = getattr(pdf_path, "name", pdf_path)

    # "doc" is a PyMuPDF Document object representing the whole file. Everything we need,
    # including the text, will be pulled from it.
    doc = fitz.open(pdf_path)

# We need to isolate the various sections of each page in order to find the Abstract paragraph.
# This can be done by passing the parameter "blocks" to the get_text() method.
# The output is a list of tuples, one per block, each of the form:
# (x0, y0, x1, y1, "lines in the block", block_no, block_type)
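# For example, a single block tuple might look like this (hypothetical values):
#     (56.7, 72.3, 538.9, 132.1, "ABSTRACT\nWe propose a method for ...\n", 2, 0)
# where the final 0 marks a text block (an image block has block_type 1).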

    
    # Since our PDF is a multi-page document, we loop over the pages, accumulating the plain text
    # and the block tuples of every page (the span-level extraction below is what the rest of the
    # pipeline actually uses).
    text = ""
    blocks = []
    for page in doc:
        text += page.get_text()
        blocks.extend(page.get_text("blocks"))

# ANALYZING THE TEXT TO EXTRACT THE ABSTRACT

# A span is an inline container that marks up part of a text or document; in short, a span is a
# small chunk of text with uniform styling. To get the spans from the PDF, we pass the parameter
# 'dict' to the get_text() method of each page. The resulting "block_dict" holds detailed
# information about every span in the document.
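# An individual span dict looks roughly like this (illustrative values, keys abridged):
#     {'size': 9.96, 'flags': 0, 'font': 'Times-Roman',
#      'bbox': (56.7, 96.2, 292.9, 108.1), 'text': 'We propose a method for ...'}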


    block_dict = {}
    for page_num, page in enumerate(doc, start=1):  # Iterate over all pages in the document
        file_dict = page.get_text('dict')           # Get the page dictionary
        block_dict[page_num] = file_dict['blocks']  # Store the block information per page
    

# Next we retrieve the spans and store them in a DataFrame. The code loops over the pages, blocks,
# and lines of the document, and then over every span in each line. Of the span properties we only
# care about the bbox (bounding box), size, font, and text.


    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:  # type 0 is a text block (type 1 is an image block)
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_bold = "bold" in span_font.lower()
                        is_upper = re.sub(r"[\(\[].*?[\)\]]", "", text).isupper()
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])

    span_scores = []
    special = r'[(_:/,#%\=@)]'  # spans containing these special characters get no style bonus
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)


# From this, we want to know the number of unique text styles in the document and how often each occurs.

    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = dict(zip(values, counts))  # score -> number of spans with that score


# Now we can add a 'tag' column to the span DataFrame. The score that occurs most often is taken
# to be the paragraph style ('p'); larger scores become headings ('h1', 'h2', ...) and smaller
# ones sub-text ('s1', 's2', ...).
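# For example (hypothetical sizes): if the rounded scores present are 14, 11 and 9, and 9 is by far
# the most frequent, the loop below produces the mapping {14: 'h1', 11: 'h2', 9: 'p'}.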

    p_size = max(style_dict, key=style_dict.get)  # the most common score = paragraph font size
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        elif size > p_size:
            tag[size] = 'h{0}'.format(idx)
        else:
            tag[size] = 's{0}'.format(idx)
        

    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags

# We now know which spans are headings and which are body content. This is very useful for
# extraction, since we want all paragraphs below a heading to be grouped together. We build a new
# DataFrame that stores the text per heading, so information can be pulled out by heading name.

    headings_list = []
    text_list = []
    tmp = []

    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            # A new heading: close off the text collected so far and start a new group
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]  # drop the text that appears before the first heading
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])

    # Extract the 'content' of the row whose 'heading' is 'Abstract', i.e. the abstract paragraph.
    # Note: .item() assumes the document has exactly one heading literally named 'Abstract'.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()

    # Use the summarisation model pszemraj/long-t5-tglobal-base-sci-simplify in a pipeline
    # to generate a summary of the abstract.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)

    # The pipeline returns a list of dicts; join their 'summary_text' fields into a single string.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])

    # Tokenize the input with the processor. The input is the summary string generated above.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
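    # Note: every call to pdf_to_speech reloads the summariser, the SpeechT5 processor and model,
    # and the vocoder below. For a deployed app it would usually be preferable to load them once
    # at module level and reuse them across requests.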

    inputs = processor(text=str_summary, return_tensors="pt")

    # Speaker embeddings (x-vectors) control the characteristics of the synthesised voice;
    # index 7306 of the CMU ARCTIC x-vector set is the voice used in the official SpeechT5 examples.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # The HiFi-GAN vocoder converts the model's spectrogram output into an audible waveform.
    # Passing it to generate_speech() produces the speech for the summarised abstract in one step.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # SpeechT5 generates 16 kHz audio; Gradio's audio output accepts a (sample_rate, waveform) tuple.
    sr = 16000
    return (sr, speech.numpy())
    # In a notebook the result could be played with IPython.display.Audio(speech, rate=16000)
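# A quick local sanity check (hypothetical file name, assuming "paper.pdf" contains a heading
# literally named "Abstract"):
#
#     sr, audio = pdf_to_speech("paper.pdf")
#     sf.write("abstract_summary.wav", audio, sr)  # write the waveform to disk with soundfile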


# The Gradio Interface wires the function to a file upload input and an audio output.
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that contains an Abstract section, summarises it, and converts the summary to speech. Click to upload a PDF with an abstract.",
                   theme="soft")

app.launch()
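# When running locally, app.launch(share=True) would additionally create a temporary public link.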