File size: 9,279 Bytes
2f5e722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bce7f77
2f5e722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb160df
2f5e722
 
 
bd626a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5e722
bd626a2
 
 
 
 
 
 
8718841
 
 
bd626a2
 
8718841
bd626a2
8718841
2f5e722
bd626a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8718841
bd626a2
 
8718841
bd626a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5e722
bd626a2
 
 
 
2f5e722
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import os

import gdown as gdown
import nltk
import streamlit as st
import torch
from transformers import AutoTokenizer

from mt5 import MT5


def download_models(ids):
    """
    Download all models.

    :param ids: name and links of models
    :return:
    """

    # Download sentence tokenizer
    nltk.download('punkt')

    # Download model from drive if not stored locally
    for key in ids:
        if not os.path.isfile(f"model/{key}.ckpt"):
            url = f"https://drive.google.com/u/0/uc?id={ids[key]}"
            gdown.download(url=url, output=f"model/{key}.ckpt")


@st.cache(allow_output_mutation=True)
def load_model(model_path):
    """
    Load model and cache it.

    :param model_path: path to model
    :return:
    """

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Loading model and tokenizer
    model = MT5.load_from_checkpoint(model_path).eval().to(device)
    model.tokenizer = AutoTokenizer.from_pretrained('tokenizer')

    return model


# Page config
st.set_page_config(layout="centered")
st.title("Questions/Answers Pairs Gen.")
st.write("Question Generation, Question Answering and Questions/Answers Generation using Google MT5. ")

# Variables
ids = {'mt5-small': st.secrets['small'],
       'mt5-base': st.secrets['base']}

maintenance = True

if maintenance:
    st.write("Unavailable for now (maintenance). ")
else:
    # Download all models from drive
    download_models(ids)

    # Task selection

    left, right = st.columns([4, 2])
    task = left.selectbox('Choose the task: ',
                          options=['Questions/Answers Pairs Generation', 'Question Answering', 'Question Generation'],
                          help='Choose the task you want to try out')

    # Model selection
    model_path = right.selectbox('', options=[k for k in ids], index=1, help='Model to use. ')
    model = load_model(model_path=f"model/{model_path}.ckpt")
    right.write(model.device)

    if task == 'Questions/Answers Pairs Generation':
        # Input area
        inputs = st.text_area('Context:', value="A few years after the First Crusade, in 1107, the Normans under "
                                                "the command of Bohemond, Robert\'s son, landed in Valona and "
                                                "besieged Dyrrachium using the most sophisticated military "
                                                "equipment of the time, but to no avail. Meanwhile, they occupied "
                                                "Petrela, the citadel of Mili at the banks of the river Deabolis, "
                                                "Gllavenica (Ballsh), Kanina and Jericho. This time, "
                                                "the Albanians sided with the Normans, dissatisfied by the heavy "
                                                "taxes the Byzantines had imposed upon them. With their help, "
                                                "the Normans secured the Arbanon passes and opened their way to "
                                                "Dibra. The lack of supplies, disease and Byzantine resistance "
                                                "forced Bohemond to retreat from his campaign and sign a peace "
                                                "treaty with the Byzantines in the city of Deabolis. ", max_chars=2048,
                              height=250)
        split = st.checkbox('Split into sentences', value=True)

        if split:
            # Split into sentences
            sent_tokenized = nltk.sent_tokenize(inputs)
            res = {}

            with st.spinner('Please wait while the inputs are being processed...'):
                # Iterate over sentences
                for sentence in sent_tokenized:
                    predictions = model.multitask([sentence], max_length=512)
                    questions, answers, answers_bis = predictions['questions'], predictions['answers'], predictions[
                        'answers_bis']

                    # Build answer dict
                    content = {}
                    for question, answer, answer_bis in zip(questions[0], answers[0], answers_bis[0]):
                        content[question] = {'answer (extracted)': answer, 'answer (generated)': answer_bis}
                    res[sentence] = content

            # Answer area
            st.write(res)

        else:
            with st.spinner('Please wait while the inputs are being processed...'):
                # Prediction
                predictions = model.multitask([inputs], max_length=512)
                questions, answers, answers_bis = predictions['questions'], predictions['answers'], predictions[
                    'answers_bis']

                # Answer area
                zip = zip(questions[0], answers[0], answers_bis[0])
                content = {}
                for question, answer, answer_bis in zip:
                    content[question] = {'answer (extracted)': answer, 'answer (generated)': answer_bis}

            st.write(content)

    elif task == 'Question Answering':

        # Input area
        inputs = st.text_area('Context:', value="A few years after the First Crusade, in 1107, the Normans under "
                                                "the command of Bohemond, Robert\'s son, landed in Valona and "
                                                "besieged Dyrrachium using the most sophisticated military "
                                                "equipment of the time, but to no avail. Meanwhile, they occupied "
                                                "Petrela, the citadel of Mili at the banks of the river Deabolis, "
                                                "Gllavenica (Ballsh), Kanina and Jericho. This time, "
                                                "the Albanians sided with the Normans, dissatisfied by the heavy "
                                                "taxes the Byzantines had imposed upon them. With their help, "
                                                "the Normans secured the Arbanon passes and opened their way to "
                                                "Dibra. The lack of supplies, disease and Byzantine resistance "
                                                "forced Bohemond to retreat from his campaign and sign a peace "
                                                "treaty with the Byzantines in the city of Deabolis. ", max_chars=2048,
                              height=250)
        question = st.text_input('Question:', value="What forced Bohemond to retreat from his campaign? ")

        # Prediction
        with st.spinner('Please wait while the inputs are being processed...'):
            predictions = model.qa([{'question': question, 'context': inputs}], max_length=512)
            answer = {question: predictions[0]}

        # Answer area
        st.write(answer)

    elif task == 'Question Generation':

        # Input area
        inputs = st.text_area('Context (highlight answers with <hl> tokens): ',
                              value="A few years after the First Crusade, in <hl> 1107 <hl>, the <hl> Normans <hl> under "
                                    "the command of <hl> Bohemond <hl>, Robert\'s son, landed in Valona and "
                                    "besieged Dyrrachium using the most sophisticated military "
                                    "equipment of the time, but to no avail. Meanwhile, they occupied "
                                    "Petrela, <hl> the citadel of Mili <hl> at the banks of the river Deabolis, "
                                    "Gllavenica (Ballsh), Kanina and Jericho. This time, "
                                    "the Albanians sided with the Normans, dissatisfied by the heavy "
                                    "taxes the Byzantines had imposed upon them. With their help, "
                                    "the Normans secured the Arbanon passes and opened their way to "
                                    "Dibra. The <hl> lack of supplies, disease and Byzantine resistance <hl> "
                                    "forced Bohemond to retreat from his campaign and sign a peace "
                                    "treaty with the Byzantines in the city of Deabolis. ", max_chars=2048,
                              height=250)

        # Split by highlights
        hl_index = [i for i in range(len(inputs)) if inputs.startswith('<hl>', i)]
        contexts = []
        answers = []

        # Build a context for each highlight pair
        for i in range(0, len(hl_index), 2):
            contexts.append(inputs[:hl_index[i]].replace('<hl>', '') +
                            inputs[hl_index[i]: hl_index[i + 1] + 4] +
                            inputs[hl_index[i + 1] + 4:].replace('<hl>', ''))
            answers.append(inputs[hl_index[i]: hl_index[i + 1] + 4].replace('<hl>', '').strip())

        # Prediction
        with st.spinner('Please wait while the inputs are being processed...'):
            predictions = model.qg(contexts, max_length=512)

        # Answer area
        content = {}
        for pred, ans in zip(predictions, answers):
            content[pred] = ans
        st.write(content)