# Hugging Face Space: Alioth86 / SpeechAbstractor
# Space status when this file was captured: Runtime error
# File size: 5,398 Bytes

import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import re
import torch
import transformers
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
import sentencepiece as spm
import os
import tempfile
import gradio as gr

# Text shown on the app page; handed to gr.Interface as its description.
description = (
    "**SpeechAbstractor**\n\n"
    "This app enables users to upload academic articles in PDF format, specifically focusing on abstracts. \n"
    "It efficiently summarizes the abstract and provides an audio playback of the summarized content. \n"
    "Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!"
)

# Sample inputs offered by the interface: one single-item row per PDF file.
examples = [
    ["Article_7.pdf"],
    ["Article_11.pdf"],
]

# Reusing the functions created for Part 1
def text_extraction(element):
    """Return an element's text together with the font attributes it uses.

    Args:
        element: Object exposing ``get_text()`` and iteration over its
            children (``read_pdf`` passes pdfminer text containers).

    Returns:
        A tuple ``(line_text, format_per_line)``. ``format_per_line`` is a
        de-duplicated list mixing the font names and font sizes of every
        ``LTChar`` found inside the element's ``LTTextContainer`` children;
        it comes from a set, so its order is not meaningful.
    """
    line_text = element.get_text()

    # Flatten the element -> line -> character nesting into one stream.
    glyphs = (
        glyph
        for row in element
        if isinstance(row, LTTextContainer)
        for glyph in row
        if isinstance(glyph, LTChar)
    )

    seen_formats = set()
    for glyph in glyphs:
        seen_formats.add(glyph.fontname)
        seen_formats.add(glyph.size)

    return (line_text, list(seen_formats))

def read_pdf(pdf_pathy):
    """Extract the text and font information of every page of a PDF.

    Pages come from pdfminer's ``extract_pages``. On each page the layout
    elements are visited from top to bottom (descending ``y1``) and every
    ``LTTextContainer`` is handed to ``text_extraction``.

    Only pdfminer is used to read the file: no extra PyPDF2 or pdfplumber
    handle is opened, so the function leaves no file handle open behind it.

    Args:
        pdf_pathy: Path of the PDF file to read.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` starts at 0) to a three-item
        list ``[page_text, line_format, page_content]``:
        ``page_text`` holds the text of each text element, ``line_format``
        the matching font attributes, and ``page_content`` is a separate
        list with the same text items as ``page_text``.
    """
    text_per_pagy = {}

    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []

        # sorted() is stable, so elements sharing the same y1 keep the order
        # in which pdfminer produced them.
        for element in sorted(page, key=lambda item: item.y1, reverse=True):
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)

        # page_content stays an independent list so callers mutating one
        # entry do not affect the other.
        text_per_pagy['Page_' + str(pagenum)] = [page_text, line_format, list(page_text)]

    return text_per_pagy


def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed from the result.
    """
    # Splitting on whitespace runs and re-joining with one space is the same
    # as substituting each run by a space.
    collapsed = " ".join(re.split(r'\s+', text))
    return collapsed.strip()


def extract_abstract(text_per_pagy):
    """Return the abstract found on the first page that contains "Abstract".

    Args:
        text_per_pagy: Dict mapping a page key to that page's text.

    Returns:
        A single space followed by the stripped abstract text, or ``""`` when
        no page contains the word "Abstract". The abstract starts one
        character after the heading and stops at the first end marker that is
        present, tried in the listed order (not by position in the text); if
        none is present it runs to the end of the page.
    """
    heading = "Abstract"
    stop_words = ("Introduction", "Summary", "Overview", "Background", "Contents")

    for raw in text_per_pagy.values():
        if not raw:
            continue

        # Drop "- " sequences, which re-joins words hyphenated across lines.
        body = raw.replace("- ", "")

        heading_at = body.find(heading)
        if heading_at == -1:
            continue

        begin = heading_at + len(heading) + 1
        hits = (body.find(word, begin) for word in stop_words)
        finish = next((pos for pos in hits if pos != -1), len(body))

        # Only the first page with an abstract is used.
        return " " + body[begin:finish].strip()

    return ""

#let's define a main function that gets the uploaded file (pdf) to do the job
# Heavy objects (pipelines, speaker embedding) kept alive between requests so
# they are built once instead of on every upload.
_MODEL_CACHE = {}


def _cached(key, factory):
    """Return the object cached under *key*, building it with *factory* on first use."""
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = factory()
    return _MODEL_CACHE[key]


def _load_speaker_embedding():
    """Load the x-vector passed to the text-to-speech pipeline as speaker embedding."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def main_function(uploaded_filepath):
    """Summarize the abstract of an uploaded PDF and synthesize it as speech.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A tuple ``(summary, audio_file_path)``. When no file was uploaded or
        no abstract could be located, the first item is an explanatory
        message and the second is ``None``.
    """
    # Nothing to do without an upload.
    if uploaded_filepath is None:
        return "No file loaded", None

    # Read the PDF, then reduce every page to one cleaned string. A new dict
    # is built rather than rewriting the one being iterated.
    text_per_pagy = read_pdf(uploaded_filepath)
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in text_per_pagy.items()
    }
    abstract_text = extract_abstract(cleaned_pages)

    # Without an abstract there is nothing meaningful to summarize or read out.
    if not abstract_text.strip():
        return "No abstract found in the uploaded file", None

    # Summarize the abstract; max_length caps the length of the summary.
    summarizer = _cached(
        "summarizer",
        lambda: pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify"),
    )
    summary = summarizer(abstract_text, max_length=65, do_sample=False)[0]['summary_text']

    # Turn the summary into audio.
    synthesiser = _cached(
        "synthesiser",
        lambda: pipeline("text-to-speech", model="microsoft/speecht5_tts"),
    )
    speaker_embedding = _cached("speaker_embedding", _load_speaker_embedding)
    speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})

    # Write to a unique temporary file so simultaneous requests cannot
    # overwrite each other's audio. delete=False because the path must
    # outlive this function; the handle is closed before soundfile writes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
        audio_file_path = audio_file.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    return summary, audio_file_path

# Gradio wiring: the uploaded file is passed to main_function, and its two
# return values fill the "Summary Text" textbox and the "Summary Audio" player.
# The page description and the example PDFs come from the module-level
# `description` and `examples` values.
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),  
    outputs=[gr.Textbox(label="Summary Text"), gr.Audio(label="Summary Audio", type="filepath")],
    description=description, 
    examples=examples
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()