File size: 9,198 Bytes
e623457
 
d03bc9b
f2f5171
957c035
3207602
e623457
3207602
d03bc9b
2a3c4ce
e623457
 
2a3c4ce
e623457
 
 
 
 
 
957c035
e623457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d03bc9b
957c035
28a5857
f2f5171
 
 
 
 
 
 
957c035
f2f5171
957c035
 
f2f5171
 
 
 
957c035
 
2a3c4ce
 
e623457
 
 
 
 
957c035
2a3c4ce
957c035
 
 
e623457
957c035
 
 
e623457
957c035
 
 
e623457
d95cbea
957c035
 
 
 
e623457
 
957c035
e623457
 
 
957c035
e623457
 
 
957c035
e623457
 
 
957c035
e623457
 
 
 
 
957c035
d0f5098
 
 
 
 
f2f5171
28a5857
f2f5171
cdd5b2f
3207602
cdd5b2f
3207602
957c035
f2f5171
 
 
 
 
957c035
d0f5098
957c035
 
2a3c4ce
d03bc9b
e623457
 
 
 
 
957c035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e623457
 
957c035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e623457
957c035
e623457
d0f5098
e623457
 
 
 
d0f5098
e623457
 
 
957c035
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import re
from typing import Union

import gradio as gr
from loguru import logger
from pydantic import BaseModel
from transformers import pipeline

# from pydantic import BaseModel

# RU_SUMMARY_MODEL = "IlyaGusev/rubart-large-sum"
# RU_SUMMARY_MODEL = "IlyaGusev/mbart_ru_sum_gazeta"
RU_SUMMARY_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
# RU_SENTIMENT_MODEL = "IlyaGusev/rubart-large-sentiment"
RU_SENTIMENT_MODEL = "blanchefort/rubert-base-cased-sentiment"

EN_SUMMARY_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
EN_SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"


DEFAULT_EN_TEXT = """Flags on official buildings are being flown at half-mast and a minute's silence will be observed at midday.
Fourteen people were shot dead at the Faculty of Arts building of Charles University in the capital by a student who then killed himself.
Police are working to uncover the motive behind the attack.
It is one of the deadliest assaults by a lone gunman in Europe this century.
Those killed in Thursday's attack included Lenka Hlavkova, head of the Institute of Musicology at the university.
Other victims were named as translator and Finnish literature expert Jan Dlask and student Lucie Spindlerova.
The shooting began at around 15:00 local time (14:00 GMT) at the Faculty of Arts building off Jan Palach Square in the centre of the Czech capital.
The gunman opened fire in the corridors and classrooms of the building, before shooting himself as security forces closed in on him, police say.
US tourist Hannah Mallicoat told the BBC that she and her family had been on Jan Palach Square during the attack.
"A crowd of people were crossing the street when the first shot hit. I thought it was something like a firecracker or a car backfire until I heard the second shot and people started running," she said.
"I saw a bullet hit the ground on the other side of the square about 30ft [9m] away before ducking into a store. The whole area was blocked off and dozens of police cars and ambulances were going towards the university."
In a statement, Czech Prime Minister Petr Fiala said the country had been shocked by this "horrendous act".
"It is hard to find the words to express condemnation on the one hand and, on the other, the pain and sorrow that our entire society is feeling in these days before Christmas."
The gunman is thought to have killed his father at a separate location. He is also suspected in the killing of a young man and his two-month-old daughter who were found dead in a forest on the outskirts of Prague on 15 December.
"""

DEFAULT_RU_TEXT = """В результате взрыва на заправке, который произошел накануне вечером, 
пострадали 56 человек, 13 из них — дети, сообщил минздрав Дагестана. 
Погибли 12 человек, в том числе двое несовершеннолетних. На место происшествия 
приехала глава минздрава республики Татьяна Беляева, она держит под личным контролем 
оказание помощи пострадавшим. В Махачкалу вылетел первый заместитель министра здравоохранения России Виктор Фисенко.
Врачам и пострадавшим помогают волонтеры Всероссийского студенческого корпуса спасателей 
и сотрудники некоммерческой организации «Добровольцы Дагестана», сообщило министерство молодежи Дагестана. 
Жители республики массово пришли сдавать кровь, заявил региональный минздрав. 
«Просим отложить визит на станцию переливания на завтра. Запасы крови есть, 
доноров для их пополнения на данный час тоже уже немало», — написало ведомство.
"""


class TextRequest(BaseModel):
    """Request payload carrying a single piece of text."""

    # Raw text submitted for processing.
    text: str


class Result(BaseModel):
    """Summary and sentiment produced for one input text."""

    # Score reported together with the sentiment label.
    sentiment_score: float
    # Label reported by the sentiment classifier.
    sentiment_label: str
    # Generated summary text.
    summary: str

    def to_str(self) -> str:
        """Format the result as two lines: the summary, then the sentiment.

        The sentiment score is rendered with three decimal places.
        """
        summary_line = f"Summary:  {self.summary}"
        sentiment_line = (
            f"Sentiment:  {self.sentiment_label} ({self.sentiment_score:.3f})"
        )
        return "\n".join((summary_line, sentiment_line))


# class Response(BaseModel):
#     results: List[Result] # list of Result objects


class Summarizer:
    """Bundle of summarization and sentiment pipelines for English and Russian.

    The pipelines are created once in ``__init__`` and selected per request
    by language code (``"en"`` or ``"ru"``).
    """

    # NOTE: ``pipeline`` is the transformers factory function; it is used
    # here only as a loose annotation for "a pipeline object".
    ru_summary_pipe: pipeline
    ru_sentiment_pipe: pipeline
    en_summary_pipe: pipeline
    en_sentiment_pipe: pipeline

    def __init__(self) -> None:
        """Load all pipelines; an identical summary model is loaded only once."""
        self.ru_summary_pipe = pipeline(
            "summarization", model=RU_SUMMARY_MODEL, max_length=100, truncation=True
        )
        if EN_SUMMARY_MODEL == RU_SUMMARY_MODEL:
            # Same checkpoint for both languages: share the loaded pipeline.
            self.en_summary_pipe = self.ru_summary_pipe
        else:
            self.en_summary_pipe = pipeline(
                "summarization",
                model=EN_SUMMARY_MODEL,
                max_length=100,
                truncation=True,
            )

        # Tokenizer/model used by ``mT5_summarize``; taken from the already
        # loaded summarization pipeline so nothing is loaded twice.
        self.sum_tokenizer = self.ru_summary_pipe.tokenizer
        self.sum_model = self.ru_summary_pipe.model

        # Truncate over-long inputs, as the summarization pipeline does,
        # instead of failing on texts longer than the model's maximum length.
        self.ru_sentiment_pipe = pipeline(
            "sentiment-analysis", model=RU_SENTIMENT_MODEL, truncation=True
        )
        self.en_sentiment_pipe = pipeline(
            "sentiment-analysis", model=EN_SENTIMENT_MODEL, truncation=True
        )

    def mT5_summarize(self, text: str) -> str:
        """Summarize ``text`` by calling the tokenizer and model directly.

        Whitespace runs (including newlines) are collapsed to single spaces
        before tokenization. The input is truncated/padded to 512 tokens and
        the summary is generated with beam search.

        Args:
            text: Text to summarize.

        Returns:
            The decoded summary with special tokens removed.
        """
        normalized = re.sub(r"\s+", " ", text.strip())

        input_ids = self.sum_tokenizer(
            [normalized],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=512,
        )["input_ids"]

        # Only one text is passed in, so take the first generated sequence.
        output_ids = self.sum_model.generate(
            input_ids=input_ids, max_length=84, no_repeat_ngram_size=2, num_beams=4
        )[0]

        return self.sum_tokenizer.decode(
            output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    def get_pipe(self, lang: str):
        """Return the ``(summary_pipe, sentiment_pipe)`` pair for ``lang``.

        Args:
            lang: Language code, ``"en"`` or ``"ru"``.

        Raises:
            ValueError: If ``lang`` is not one of the supported codes.
        """
        logger.info(f"Pipe language: {lang}")
        if lang == "en":
            return self.en_summary_pipe, self.en_sentiment_pipe
        if lang == "ru":
            return self.ru_summary_pipe, self.ru_sentiment_pipe
        raise ValueError(f"Language {lang} is not supported")

    def summarize(self, req: Union[TextRequest, str], lang: str = "en") -> Result:
        """Summarize a text and classify its sentiment.

        Args:
            req: Either a ``TextRequest`` or the raw text itself (the Gradio
                UI passes the textbox content as a plain string).
            lang: Language code selecting the pipelines, ``"en"`` or ``"ru"``.

        Returns:
            A ``Result`` built from the first summary and the first sentiment
            prediction returned by the pipelines.

        Raises:
            ValueError: If ``lang`` is not supported.
        """
        # Unwrap the request model so the pipelines always receive a string.
        text = req.text if isinstance(req, TextRequest) else req

        sum_pipe, sent_pipe = self.get_pipe(lang)
        response_summary = sum_pipe(text)
        logger.info(response_summary)
        response_sentiment = sent_pipe(text)
        logger.info(response_sentiment)
        return Result(
            summary=response_summary[0]["summary_text"],
            sentiment_label=response_sentiment[0]["label"],
            sentiment_score=response_sentiment[0]["score"],
        )

    def get_summary(self, req: Union[TextRequest, str], lang: str = "en") -> str:
        """Run ``summarize`` and return its result formatted as display text."""
        return self.summarize(req, lang).to_str()


if __name__ == "__main__":
    pipe = Summarizer()

    def build_language_column(
        lang, summary_model, sentiment_model, sample_text, output_hint, button_caption
    ):
        """Create one UI column and return (input, lang, output, button).

        Must be called inside an active ``gr.Row`` / ``gr.Blocks`` context so
        the created components attach to the layout being built.
        """
        with gr.Column(scale=2, min_width=600):
            gr.Markdown(value=f"Model for Summary: {summary_model}")
            gr.Markdown(value=f"Model for Sentiment: {sentiment_model}")
            text_in = gr.Textbox(
                label=f"{lang}_input",
                lines=5,
                value=sample_text,
                placeholder=sample_text,
            )
            # Hidden textbox that feeds the language code to the handler.
            lang_box = gr.Textbox(value=lang, visible=False)
            text_out = gr.Textbox(
                label=f"{lang}_output",
                lines=5,
                placeholder=output_hint,
            )
            run_btn = gr.Button(button_caption)
        return text_in, lang_box, text_out, run_btn

    with gr.Blocks() as demo:
        with gr.Row():
            en_widgets = build_language_column(
                "en",
                EN_SUMMARY_MODEL,
                EN_SENTIMENT_MODEL,
                DEFAULT_EN_TEXT,
                "Summary and Sentiment would be here...",
                "Proceed",
            )
            ru_widgets = build_language_column(
                "ru",
                RU_SUMMARY_MODEL,
                RU_SENTIMENT_MODEL,
                DEFAULT_RU_TEXT,
                "Здесь будет обобщение и эмоциональный окрас текста...",
                "Запустить",
            )

        # Wire each button to the shared handler: (text, lang) -> output box.
        for text_in, lang_box, text_out, run_btn in (en_widgets, ru_widgets):
            run_btn.click(
                pipe.get_summary,
                [text_in, lang_box],
                [text_out],
            )
    demo.launch(show_api=False)