Spaces:

noahnsimbe
/

text-summarizer

Runtime error

Noah Nsimbe

update

b5498e2 5 months ago

3.46 kB

	import gradio as gr
	from transformers import pipeline
	from pypdf import PdfReader
	import docx
	import os
	from langdetect import detect

	# The Hugging Face pipelines are created once, when the module is imported,
	# and are then shared by every call to the summarisation function.
	summarizer = pipeline(model="facebook/bart-large-cnn", task="summarization")

	# English -> French translation of the generated summary.
	translator_to_french = pipeline(
	    model="Helsinki-NLP/opus-mt-en-fr",
	    task="translation_en_to_fr",
	)

	# French -> English translation of the generated summary.
	translator_to_english = pipeline(
	    model="Helsinki-NLP/opus-mt-fr-en",
	    task="translation_fr_to_en",
	)

	# Largest extracted text, counted in characters, that will be accepted.
	MAX_FILE_SIZE = 10_000_000

	class TextExtractor:
	    """Extract plain text from a PDF, Word, or plain-text document.

	    Args:
	        doc_location: Path of the document on disk.
	        max_size: Optional maximum number of extracted characters accepted
	            by ``get_text``. ``None`` (the default) falls back to the
	            module-level ``MAX_FILE_SIZE``.

	    Raises:
	        gr.Error: If ``doc_location`` is ``None``.
	    """

	    def __init__(self, doc_location: str, max_size=None):
	        if doc_location is None:
	            # gr.Error (an Exception subclass) keeps error handling
	            # consistent with get_text(); the message no longer claims
	            # that only PDFs are accepted.
	            raise gr.Error("Please select a document to summarize")
	        self.doc_location = doc_location
	        self.max_size = max_size
	        # Filled in by get_text(); None means "not extracted yet".
	        self.text = None

	    def extract_text_from_pdf(self) -> str:
	        """Return the concatenated text of every page of the PDF."""
	        reader = PdfReader(self.doc_location)
	        # `or ""` guards against a page yielding no text, so joining can
	        # never fail on a non-string value.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	    def extract_text_from_doc(self) -> str:
	        """Return the text of every paragraph, one paragraph per line."""
	        # NOTE(review): python-docx is documented for .docx files; legacy
	        # binary .doc files will probably fail to open here - confirm.
	        document = docx.Document(self.doc_location)
	        return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

	    def extract_text_from_txt(self) -> str:
	        """Return the content of a UTF-8 encoded text file."""
	        with open(self.doc_location, "r", encoding="utf-8") as file:
	            return file.read()

	    def text_length(self) -> int:
	        """Return the number of whitespace-separated words in the document.

	        The text is extracted on demand when ``get_text`` has not been
	        called yet, so the call order no longer matters.
	        """
	        if self.text is None:
	            self.get_text()
	        return len(self.text.split())

	    def get_text(self) -> str:
	        """Extract, store and return the text of the document.

	        The extractor is chosen from the file extension, compared
	        case-insensitively.

	        Returns:
	            The extracted text, also stored in ``self.text``.

	        Raises:
	            gr.Error: If the extension is unsupported or the extracted text
	                is longer than the configured size limit.
	        """
	        file_extension = os.path.splitext(self.doc_location)[1].lower()
	        if file_extension == ".pdf":
	            text = self.extract_text_from_pdf()
	        elif file_extension == ".txt":
	            text = self.extract_text_from_txt()
	        elif file_extension in (".docx", ".doc"):
	            text = self.extract_text_from_doc()
	        else:
	            raise gr.Error("We only support .pdf, .txt, .doc and .docx files")

	        limit = MAX_FILE_SIZE if self.max_size is None else self.max_size
	        if len(text) > limit:
	            raise gr.Error(
	                f"Document exceeds the maximum supported size of {limit} characters."
	            )

	        self.text = text
	        return self.text




	def summarize(doc: str, target_language: str) -> str:
	    """Summarize a document and optionally translate the summary.

	    Args:
	        doc: Path of the document to summarize (.pdf, .txt, .doc or .docx).
	        target_language: "English" or "French", compared
	            case-insensitively. ``None`` returns the summary untranslated.

	    Returns:
	        The summary, translated when its detected language is the opposite
	        of ``target_language``.

	    Raises:
	        gr.Error: If the document contains no text. Errors raised by
	            ``TextExtractor`` propagate unchanged.
	    """
	    text_extractor = TextExtractor(doc)
	    text = text_extractor.get_text()

	    word_count = text_extractor.text_length()
	    if word_count == 0:
	        # Nothing to feed the model; report it to the user explicitly.
	        raise gr.Error("The document does not contain any text to summarize.")

	    # Target about half of the word count, but never request a length of 0
	    # (which is what a one-word document used to produce).
	    summary_length = max(word_count // 2, 1)

	    # truncation=True makes the pipeline cut the input down to the model's
	    # maximum input length instead of passing an over-long sequence through.
	    summary = summarizer(
	        text,
	        max_length=summary_length,
	        do_sample=False,
	        truncation=True,
	    )[0]["summary_text"]

	    if target_language is None:
	        # No translation requested, so language detection is not needed.
	        return summary

	    wanted = str(target_language).lower()
	    detected_lang = detect(summary)

	    # (language of the summary, requested language) -> translator to apply.
	    translators = {
	        ("fr", "english"): translator_to_english,
	        ("en", "french"): translator_to_french,
	    }
	    translator = translators.get((detected_lang, wanted))
	    if translator is not None:
	        summary = translator(summary)[0]["translation_text"]

	    return summary


	# Gradio UI: one file upload plus a language choice, wired to summarize().
	app = gr.Interface(
	    fn=summarize,
	    inputs=[
	        gr.File(
	            label="Document to summarize",
	            # Extensions are written with a leading dot, and only the
	            # formats that TextExtractor.get_text accepts are offered.
	            file_types=[".pdf", ".docx", ".doc", ".txt"],
	        ),
	        gr.Radio(
	            label="Translate summary to",
	            choices=["English", "French"],
	            value="English",
	        ),
	    ],
	    outputs=gr.Textbox(label="Summary"),
	    # Each example row supplies one value per input component.
	    examples=[
	        ["data/pd-file-example.pdf", "English"],
	        ["data/doc-file-example.docx", "English"],
	        ["data/text-file-example.txt", "English"],
	    ],
	)

	if __name__ == "__main__":
	    app.launch()