# https://huggingface.co/spaces/yilmazmusa_ml/abstract_summarizer
# Here are the imports
import warnings
import pdfplumber
import torch
from transformers import pipeline, AutoProcessor, AutoModel
import gradio as gr

warnings.filterwarnings("ignore")
# Here is the code
def extract_abstract(uploaded_file):
    abstract = ""
    with pdfplumber.open(uploaded_file) as pdf:
        # Iterate through each page
        for page in pdf.pages:
            # Tolerances of 1 keep the spaces between words and lines in the extracted text
            text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if text and "abstract" in text.lower():
                # Slice from the "Abstract" heading to the "Introduction" heading
                start_index = text.lower().find("abstract")
                end_index = text.lower().find("introduction")
                if end_index == -1:
                    # No "Introduction" heading on this page; take the rest of the page
                    end_index = len(text)
                abstract = text[start_index:end_index]
                break
    print(abstract)
    return abstract
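# A quick sanity check, assuming a hypothetical local file named "paper.pdf":
#   extract_abstract("paper.pdf")
# prints and returns the text from the "Abstract" heading up to the
# "Introduction" heading, or an empty string if no abstract heading is found.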
def process_summary(summary):
    # Keep only the first sentence of the generated summary
    summary = summary[0]["summary_text"]
    first_sentence = summary.split('.', 1)[0]
    processed_summary = first_sentence.strip() + "."
    # Drop hyphens left over from words broken across lines in the PDF
    processed_summary = processed_summary.replace("-", "")
    return processed_summary
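# For example, process_summary([{"summary_text": "We propose X. It beats Y."}])
# returns "We propose X." (the first sentence only, with any hyphens removed).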
# Function for summarization and audio conversion
def summarize_and_convert_to_audio(pdf_file):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Load the Bark text-to-speech model and move it to the CUDA device if available
    processor = AutoProcessor.from_pretrained("suno/bark")
    model = AutoModel.from_pretrained("suno/bark").to(device)

    # Extract the abstract
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        raise gr.Error("No 'abstract' section found in the uploaded PDF. Please upload a different PDF.")

    # Summarize the abstract
    summarization_pipeline = pipeline(task='summarization', model='knkarthick/MEETING_SUMMARY', min_length=15, max_length=30)
    summarized_text = summarization_pipeline(abstract_text)
    one_sentence_summary = process_summary(summarized_text)
    print(one_sentence_summary)

    # Text-to-audio conversion
    inputs = processor(
        text=[one_sentence_summary],
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    # gr.Audio accepts a (sample rate, waveform) tuple, so return the generated
    # speech as a NumPy array together with its sampling rate
    audio_data = speech_values.cpu().numpy().squeeze()
    return sampling_rate, audio_data
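# Note: the Bark model and the summarization pipeline above are reloaded on
# every call; hoisting the from_pretrained() and pipeline() calls to module
# level would avoid that cost on repeated requests.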
# Create a Gradio interface
iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"]),  # accept only PDF files
    outputs=gr.Audio(label="Audio"),
    title="PDF Abstract Summarizer",
    description="Upload a PDF with an abstract to generate a one-sentence audio summary.",
)

iface.launch()