# Tiju1996's picture
# Update app.py
# 3105091
import gradio as gr
import re
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline
# Checkpoint name shared by the translation model and its tokenizer.
_MBART_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"

# mBART-50 model and its fast tokenizer; the tokenizer is configured with
# English ("en_XX") as the source language.
model = MBartForConditionalGeneration.from_pretrained(_MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(
    _MBART_CHECKPOINT,
    src_lang="en_XX",
)

# Summarization pipeline used by summarize() before translation.
pipe2 = pipeline(task="summarization", model="Tiju1996/conversation-summ")
def process_text(text):
    """Strip markup and noise from raw text before it is summarized.

    The cleaning steps run in this order:

    1. numeric citation / footnote markers such as ``[12]``;
    2. Markdown images ``![alt](url)`` (dropped entirely);
    3. Markdown links ``[label](url)`` (replaced by ``label``);
    4. any remaining bracketed span, including ``[alt][ref]`` pairs;
    5. non-ASCII characters (this also removes emojis);
    6. HTML tags;
    7. bare URLs starting with ``http``;
    8. leading and trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # Numeric citations / footnotes go first so that "[1](see note)" is not
    # mistaken for a Markdown link further down.
    text = re.sub(r'\[\d*\]', '', text)

    # Markdown images and links must be handled BEFORE the generic bracket
    # removal below; otherwise "[label](url)" loses its label and leaves a
    # dangling "(" once the URL is stripped.
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)

    # Whatever bracketed text is left (e.g. reference-style "[alt][ref]").
    text = re.sub(r'(\[[^\]]*\])?\[[^\]]*\]', '', text)

    # Dropping every non-ASCII character also removes all emojis, so no
    # separate emoji pattern is needed.
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # HTML tags (non-greedy, single line).
    text = re.sub(r'<.*?>', '', text)

    # Bare URLs.
    text = re.sub(r'http\S+', '', text)

    # Trim all surrounding whitespace, including newlines and tabs.
    return text.strip()
def summarize(article_en_raw):
    """Summarize a text document and translate the summary.

    The raw text is cleaned with ``process_text``, summarized with the
    ``pipe2`` summarization pipeline, and the summary is then passed through
    the mBART model with ``hi_IN`` forced as the first generated token, so
    the returned string is the translated summary rather than the English
    one.

    Args:
        article_en_raw: The raw document text entered by the user.

    Returns:
        The translated summary, or an empty string when there is nothing to
        summarize (empty input, or input that is empty after cleaning).
    """
    if not article_en_raw:
        return ""

    article_en = process_text(article_en_raw)
    # Skip both models when cleaning leaves no content to summarize.
    if not article_en.strip():
        return ""

    # truncation=True asks the pipeline to cut over-long documents down to
    # the summarizer's maximum input length instead of passing them through
    # untruncated.
    summary_en = pipe2(article_en, truncation=True)[0]['summary_text']

    model_inputs = tokenizer(summary_en, return_tensors="pt")
    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"],
    )
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translation[0]
# Build the input/output text boxes. Newer Gradio releases expose the
# component as gr.Textbox; the gr.inputs / gr.outputs namespaces are the
# legacy spelling, kept here as a fallback for installs without gr.Textbox.
if hasattr(gr, "Textbox"):
    input_text = gr.Textbox(lines=20, label="Enter text document to be summarized")
    output_text = gr.Textbox(label="Summarized Text")
else:
    input_text = gr.inputs.Textbox(lines=20, label="Enter text document to be summarized")
    output_text = gr.outputs.Textbox(label="Summarized Text")

# NOTE(review): summarize() returns the summary after translation with the
# "hi_IN" language code, which the title/description below do not mention --
# confirm whether the user-facing text should say so.
gradio_interface = gr.Interface(
    fn=summarize,
    inputs=input_text,
    outputs=output_text,
    title="DistilBART Text Summarization App",
    description="Enter a text document and get its summarized version.",
)

if __name__ == "__main__":
    gradio_interface.launch()