Christian Koch committed
Commit: 0df07e9
Parent: fe38db6

question generator
.gitignore ADDED
@@ -0,0 +1,3 @@
+.idea/
+model/*.ckpt
+venv/
app.py CHANGED
@@ -1,12 +1,18 @@
 import streamlit as st
 from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer
+import nltk
+
 from fill_in_summary import FillInSummary
 from paraphrase import PegasusParaphraser
+import question_generator as q
+
 
-def paraphrase(text):
-    return text
+# Question Generator Variables
+ids = {'mt5-small': st.secrets['small'],
+       'mt5-base': st.secrets['base']}
 
 
+st.set_page_config(layout="centered")
 st.title('Question Generator by Eddevs')
 
 select = st.selectbox('Type', ['Question Generator', 'Paraphrasing', 'Summarization', 'Fill in the blank'])
@@ -18,17 +24,61 @@ if select == "Question Generator":
         # left_column.selectbox('Type', ['Question Generator', 'Paraphrasing'])
         #st.selectbox('Model', ['T5', 'GPT Neo-X'])
 
-        text_input = st.text_area("Input Text")
+        # Download all models from drive
+        q.download_models(ids)
 
-        submitted = st.form_submit_button("Generate")
+        # Model selection
+        model_path = st.selectbox('', options=[k for k in ids], index=1, help='Model to use. ')
+        model = q.load_model(model_path=f"model/{model_path}.ckpt")
 
-        if submitted:
-            with st.spinner('Wait for it...'):
-                result = FillInSummary().summarize(text_input)
-                st.write(text_input)
+        text_input = st.text_area("Input Text")
 
+        submitted = st.form_submit_button("Generate")
 
-if select == "Summarization":
+        split = st.checkbox('Split into sentences', value=True)
+
+        if split:
+            # Split into sentences
+            sent_tokenized = nltk.sent_tokenize(text_input)
+            res = {}
+
+            with st.spinner('Please wait while the inputs are being processed...'):
+                # Iterate over sentences
+                for sentence in sent_tokenized:
+                    predictions = model.multitask([sentence], max_length=512)
+                    questions, answers, answers_bis = (predictions['questions'], predictions['answers'],
+                                                       predictions['answers_bis'])
+
+                    # Build answer dict
+                    content = {}
+                    for question, answer, answer_bis in zip(questions[0], answers[0], answers_bis[0]):
+                        content[question] = {'answer (extracted)': answer, 'answer (generated)': answer_bis}
+                    res[sentence] = content
+
+            # Answer area
+            st.write(res)
+
+        else:
+            with st.spinner('Please wait while the inputs are being processed...'):
+                # Prediction
+                predictions = model.multitask([text_input], max_length=512)
+                questions, answers, answers_bis = (predictions['questions'], predictions['answers'],
+                                                   predictions['answers_bis'])
+
+                # Answer area
+                qa_pairs = zip(questions[0], answers[0], answers_bis[0])
+                content = {}
+                for question, answer, answer_bis in qa_pairs:
+                    content[question] = {'answer (extracted)': answer, 'answer (generated)': answer_bis}
+
+                st.write(content)
+        if submitted:
+            with st.spinner('Wait for it...'):
+                result = FillInSummary().summarize(text_input)
+                st.write(text_input)
+
+
+elif select == "Summarization":
     with st.form("summarization"):
         # left_column, right_column = st.columns(2)
         # left_column.selectbox('Type', ['Question Generator', 'Paraphrasing'])
@@ -44,7 +94,7 @@ if select == "Summarization":
         st.write(text_input)
 
 
-if select == "Fill in the blank":
+elif select == "Fill in the blank":
     with st.form("fill_in_the_blank"):
         text_input = st.text_area("Input Text")
 
@@ -58,7 +108,7 @@ if select == "Fill in the blank":
         st.write(result)
 
 
-if select == "Paraphrasing":
+elif select == "Paraphrasing":
     with st.form("paraphrasing"):
         # st.selectbox('Model', ['T5', 'GPT Neo-X'])
         left_column, right_column = st.columns(2)
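
Note (not part of the commit): a minimal sketch of how the new question-generator pieces in app.py fit together outside the Streamlit form, assuming the checkpoints are reachable. `drive_ids` and `sample` below are hypothetical stand-ins for the `st.secrets` values and the user's input text.

# Sketch only -- exercises download_models / load_model / multitask without the UI.
import nltk

import question_generator as q

# Hypothetical Drive file id; the real ones come from st.secrets['small'] / ['base'].
drive_ids = {'mt5-small': '<drive-file-id>'}

q.download_models(drive_ids)  # fetches model/mt5-small.ckpt if missing, plus nltk 'punkt'
model = q.load_model(model_path="model/mt5-small.ckpt")

sample = "The Normans landed in Valona in 1107."
for sentence in nltk.sent_tokenize(sample):
    predictions = model.multitask([sentence], max_length=512)
    for question, answer, answer_bis in zip(predictions['questions'][0],
                                            predictions['answers'][0],
                                            predictions['answers_bis'][0]):
        print(question, '->', answer, '|', answer_bis)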
models/.gitkeep ADDED
File without changes
mt5.py ADDED
@@ -0,0 +1,133 @@
+# coding:utf-8
+"""
+Filename: mt5.py
+Author: @DvdNss
+Created on 12/30/2021
+"""
+
+from typing import List
+
+from pytorch_lightning import LightningModule
+from transformers import MT5ForConditionalGeneration, AutoTokenizer
+
+
+class MT5(LightningModule):
+    """
+    Google MT5 transformer class.
+    """
+
+    def __init__(self, model_name_or_path: str = None):
+        """
+        Initialize module.
+        :param model_name_or_path: model name
+        """
+
+        super().__init__()
+
+        # Load model and tokenizer
+        self.save_hyperparameters()
+        self.model = MT5ForConditionalGeneration.from_pretrained(
+            model_name_or_path) if model_name_or_path is not None else None
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
+                                                       use_fast=True) if model_name_or_path is not None else None
+
+    def forward(self, **inputs):
+        """
+        Forward inputs.
+        :param inputs: dictionary of inputs (input_ids, attention_mask, labels)
+        """
+
+        return self.model(**inputs)
+
+    def qa(self, batch: List[dict], max_length: int = 512, **kwargs):
+        """
+        Question answering prediction.
+        :param batch: batch of dict {question: q, context: c}
+        :param max_length: max length of output
+        """
+
+        # Transform inputs
+        inputs = [f"question: {context['question']} context: {context['context']}" for context in batch]
+
+        # Predict
+        outputs = self.predict(inputs=inputs, max_length=max_length, **kwargs)
+
+        return outputs
+
+    def qg(self, batch: List[str] = None, max_length: int = 512, **kwargs):
+        """
+        Question generation prediction.
+        :param batch: batch of context with highlighted elements
+        :param max_length: max length of output
+        """
+
+        # Transform inputs
+        inputs = [f"generate: {context}" for context in batch]
+
+        # Predict
+        outputs = self.predict(inputs=inputs, max_length=max_length, **kwargs)
+
+        return outputs
+
+    def ae(self, batch: List[str], max_length: int = 512, **kwargs):
+        """
+        Answer extraction prediction.
+        :param batch: list of context
+        :param max_length: max length of output
+        """
+
+        # Transform inputs
+        inputs = [f"extract: {context}" for context in batch]
+
+        # Predict
+        outputs = self.predict(inputs=inputs, max_length=max_length, **kwargs)
+
+        return outputs
+
+    def multitask(self, batch: List[str], max_length: int = 512, **kwargs):
+        """
+        Answer extraction + question generation + question answering.
+        :param batch: list of context
+        :param max_length: max length of outputs
+        """
+
+        # Build output dict
+        dict_batch = {'context': [context for context in batch], 'answers': [], 'questions': [], 'answers_bis': []}
+
+        # Iterate over context
+        for context in batch:
+            answers = self.ae(batch=[context], max_length=max_length, **kwargs)[0]
+            answers = answers.split('<sep>')
+            answers = [ans.strip() for ans in answers if ans != ' ']
+            dict_batch['answers'].append(answers)
+            for_qg = [f"{context.replace(ans, f'<hl> {ans} <hl> ')}" for ans in answers]
+            questions = self.qg(batch=for_qg, max_length=max_length, **kwargs)
+            dict_batch['questions'].append(questions)
+            new_answers = self.qa([{'context': context, 'question': question} for question in questions],
+                                  max_length=max_length, **kwargs)
+            dict_batch['answers_bis'].append(new_answers)
+        return dict_batch
+
+    def predict(self, inputs, max_length, **kwargs):
+        """
+        Inference processing.
+        :param inputs: list of inputs
+        :param max_length: max_length of outputs
+        """
+
+        # Tokenize inputs
+        inputs = self.tokenizer(inputs, max_length=max_length, padding='max_length', truncation=True,
+                                return_tensors="pt")
+
+        # Retrieve input_ids and attention_mask
+        input_ids = inputs.input_ids.to(self.model.device)
+        attention_mask = inputs.attention_mask.to(self.model.device)
+
+        # Predict
+        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length,
+                                      **kwargs)
+
+        # Decode outputs
+        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+        return predictions
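
Note (not part of the commit): a short usage sketch for the MT5 class above. The checkpoint path and example text are illustrative; the tokenizer override mirrors what question_generator.py does.

# Illustrative only -- assumes a checkpoint in model/ and the repo's tokenizer/ directory.
import torch
from transformers import AutoTokenizer

from mt5 import MT5

model = MT5.load_from_checkpoint("model/mt5-small.ckpt").eval()
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.tokenizer = AutoTokenizer.from_pretrained('tokenizer')  # includes <hl> and <sep>

context = "The Eiffel Tower was completed in 1889."

# ae/qg/qa each prepend their task prefix ("extract:", "generate:",
# "question: ... context: ...") before delegating to predict();
# multitask() chains all three per context.
answers = model.ae([context])[0].split('<sep>')
questions = model.qg([context.replace('1889', '<hl> 1889 <hl> ')])
answers_bis = model.qa([{'question': questions[0], 'context': context}])

print(model.multitask([context], max_length=512))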
question_generator.py ADDED
@@ -0,0 +1,109 @@
+import os
+
+import gdown as gdown
+import nltk
+import streamlit as st
+import torch
+from transformers import AutoTokenizer
+
+from mt5 import MT5
+
+
+def download_models(ids):
+    """
+    Download all models.
+    :param ids: name and links of models
+    :return:
+    """
+
+    # Download sentence tokenizer
+    nltk.download('punkt')
+
+    # Download model from drive if not stored locally
+    for key in ids:
+        if not os.path.isfile(f"model/{key}.ckpt"):
+            url = f"https://drive.google.com/u/0/uc?id={ids[key]}"
+            gdown.download(url=url, output=f"model/{key}.ckpt")
+
+
+@st.cache(allow_output_mutation=True)
+def load_model(model_path):
+    """
+    Load model and cache it.
+    :param model_path: path to model
+    :return:
+    """
+
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    # Loading model and tokenizer
+    model = MT5.load_from_checkpoint(model_path).eval().to(device)
+    model.tokenizer = AutoTokenizer.from_pretrained('tokenizer')
+
+    return model
+
+# elif task == 'Question Answering':
+#
+#     # Input area
+#     inputs = st.text_area('Context:', value="A few years after the First Crusade, in 1107, the Normans under "
+#                                             "the command of Bohemond, Robert\'s son, landed in Valona and "
+#                                             "besieged Dyrrachium using the most sophisticated military "
+#                                             "equipment of the time, but to no avail. Meanwhile, they occupied "
+#                                             "Petrela, the citadel of Mili at the banks of the river Deabolis, "
+#                                             "Gllavenica (Ballsh), Kanina and Jericho. This time, "
+#                                             "the Albanians sided with the Normans, dissatisfied by the heavy "
+#                                             "taxes the Byzantines had imposed upon them. With their help, "
+#                                             "the Normans secured the Arbanon passes and opened their way to "
+#                                             "Dibra. The lack of supplies, disease and Byzantine resistance "
+#                                             "forced Bohemond to retreat from his campaign and sign a peace "
+#                                             "treaty with the Byzantines in the city of Deabolis. ", max_chars=2048,
+#                           height=250)
+#     question = st.text_input('Question:', value="What forced Bohemond to retreat from his campaign? ")
+#
+#     # Prediction
+#     with st.spinner('Please wait while the inputs are being processed...'):
+#         predictions = model.qa([{'question': question, 'context': inputs}], max_length=512)
+#         answer = {question: predictions[0]}
+#
+#     # Answer area
+#     st.write(answer)
+#
+# elif task == 'Question Generation':
+#
+#     # Input area
+#     inputs = st.text_area('Context (highlight answers with <hl> tokens): ',
+#                           value="A few years after the First Crusade, in <hl> 1107 <hl>, the <hl> Normans <hl> under "
+#                                 "the command of <hl> Bohemond <hl>, Robert\'s son, landed in Valona and "
+#                                 "besieged Dyrrachium using the most sophisticated military "
+#                                 "equipment of the time, but to no avail. Meanwhile, they occupied "
+#                                 "Petrela, <hl> the citadel of Mili <hl> at the banks of the river Deabolis, "
+#                                 "Gllavenica (Ballsh), Kanina and Jericho. This time, "
+#                                 "the Albanians sided with the Normans, dissatisfied by the heavy "
+#                                 "taxes the Byzantines had imposed upon them. With their help, "
+#                                 "the Normans secured the Arbanon passes and opened their way to "
+#                                 "Dibra. The <hl> lack of supplies, disease and Byzantine resistance <hl> "
+#                                 "forced Bohemond to retreat from his campaign and sign a peace "
+#                                 "treaty with the Byzantines in the city of Deabolis. ", max_chars=2048,
+#                           height=250)
+#
+#     # Split by highlights
+#     hl_index = [i for i in range(len(inputs)) if inputs.startswith('<hl>', i)]
+#     contexts = []
+#     answers = []
+#
+#     # Build a context for each highlight pair
+#     for i in range(0, len(hl_index), 2):
+#         contexts.append(inputs[:hl_index[i]].replace('<hl>', '') +
+#                         inputs[hl_index[i]: hl_index[i + 1] + 4] +
+#                         inputs[hl_index[i + 1] + 4:].replace('<hl>', ''))
+#         answers.append(inputs[hl_index[i]: hl_index[i + 1] + 4].replace('<hl>', '').strip())
+#
+#     # Prediction
+#     with st.spinner('Please wait while the inputs are being processed...'):
+#         predictions = model.qg(contexts, max_length=512)
+#
+#     # Answer area
+#     content = {}
+#     for pred, ans in zip(predictions, answers):
+#         content[pred] = ans
+#     st.write(content)
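
Note (not part of the commit): the commented-out Question Generation branch above consumes `<hl>` markers in pairs and builds one context per highlighted answer. Here is that splitting logic as a standalone snippet, with an illustrative input.

# Standalone version of the highlight-splitting logic; 'inputs' is illustrative.
inputs = "In <hl> 1107 <hl> the <hl> Normans <hl> landed in Valona."

# Index of every '<hl>' occurrence; markers are consumed in pairs.
hl_index = [i for i in range(len(inputs)) if inputs.startswith('<hl>', i)]
contexts, answers = [], []

for i in range(0, len(hl_index), 2):
    # Keep only the i-th highlight pair; strip all other markers.
    contexts.append(inputs[:hl_index[i]].replace('<hl>', '') +
                    inputs[hl_index[i]: hl_index[i + 1] + 4] +
                    inputs[hl_index[i + 1] + 4:].replace('<hl>', ''))
    answers.append(inputs[hl_index[i]: hl_index[i + 1] + 4].replace('<hl>', '').strip())

print(contexts)  # one context per highlight pair, other markers removed
print(answers)   # ['1107', 'Normans']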
requirements.txt CHANGED
@@ -3,3 +3,7 @@ torch
 tensorflow
 streamlit~=1.8.1
 sentencepiece==0.1.96
+gdown~=4.3.1
+nltk~=3.7
+pytorch-lightning~=1.5.10
+protobuf~=3.19.4
tokenizer/added_tokens.json ADDED
@@ -0,0 +1 @@
+{"<hl>": 250100, "<sep>": 250101}
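
Note (not part of the commit): these two ids sit right after the base mT5 vocabulary; a quick sanity check, assuming this tokenizer/ directory is on disk:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('tokenizer')
print(tokenizer.convert_tokens_to_ids(['<hl>', '<sep>']))  # expected: [250100, 250101]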
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+size 4309802
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "C:\\Users\\dvdna/.cache\\huggingface\\transformers\\685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276", "name_or_path": "google/mt5-small", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}