import gradio as gr
import re
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline

# mBART-50 one-to-many model and tokenizer for English -> Hindi translation
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
# Summarization pipeline for producing the English summary
pipe2 = pipeline('summarization', model="Tiju1996/conversation-summ")
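# Both models default to CPU here. If a GPU is available they can be moved
# over (an optional tweak, not part of the original app), e.g.:
#   model.to("cuda")
#   pipe2 = pipeline('summarization', model="Tiju1996/conversation-summ", device=0)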



def process_text(text):
    # Remove numeric reference citations and footnotes such as [1] or [42]
    text = re.sub(r'\[[0-9]*\]', '', text)

    # Remove Markdown images, both inline ![alt](url) and reference-style ![alt][ref]
    text = re.sub(r'!\[[^\]]*\](\([^)]*\)|\[[^\]]*\])', '', text)

    # Replace Markdown hyperlinks [text](url) with just the link text;
    # this must run before the generic bracket stripping below, or the
    # brackets get removed first and the link text is lost
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)

    # Remove any remaining bracketed fragments, e.g. reference-style links [text][ref]
    text = re.sub(r'(\[[^\]]*\])?\[[^\]]*\]', '', text)

    # Remove bare URLs
    text = re.sub(r'http\S+', '', text)

    # Remove all non-ASCII characters (this also drops most emoji)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove common emoji ranges explicitly (largely redundant after the
    # ASCII filter above, but kept as a safety net)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Strip leading and trailing whitespace (not just spaces)
    text = text.strip()

    return text
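# Illustrative example (made-up input) of what process_text produces:
#
#   process_text("Transformers [1] power NLP ![logo](logo.png) - "
#                "see [the docs](https://example.com) <b>now</b>!")
#   # -> "Transformers  power NLP  - see the docs now!"
#
# Extra interior spaces left behind by the removals are harmless to the
# downstream tokenizers, so they are not collapsed here.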


def summarize(article_en_raw):
    # Clean the raw input, summarize it in English, then translate the
    # English summary into Hindi with mBART-50
    article_en = process_text(article_en_raw)
    summary_en = pipe2(article_en)
    model_inputs = tokenizer(summary_en[0]['summary_text'], return_tensors="pt")
    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
    )
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translation[0]
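# BART-family encoders accept at most 1024 input tokens, so very long
# documents can overflow the summarizer. A minimal sketch of a guard,
# assuming the pipeline exposes its tokenizer as pipe2.tokenizer (true for
# transformers pipelines); summarize_long is a hypothetical helper, not
# part of the original app:

def summarize_long(article_en_raw, max_tokens=1024):
    # Truncate the cleaned text to the encoder window, then reuse summarize
    article_en = process_text(article_en_raw)
    ids = pipe2.tokenizer(article_en, truncation=True,
                          max_length=max_tokens)["input_ids"]
    article_en = pipe2.tokenizer.decode(ids, skip_special_tokens=True)
    return summarize(article_en)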

input_text = gr.Textbox(lines=20, label="Enter an English text document to be summarized")
output_text = gr.Textbox(label="Summarized Text (Hindi)")



gradio_interface = gr.Interface(fn=summarize, inputs=input_text, outputs=output_text,
                                title="DistilBART Text Summarization App",
                                description="Enter an English text document and get its summary translated into Hindi.")

if __name__ == "__main__":
    gradio_interface.launch()
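# When running in a hosted notebook or behind a firewall, a temporary public
# URL can be requested instead: gradio_interface.launch(share=True)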