p-christ committed
Commit 3aabb87 (parent: f8c6704)

Update pipeline.py

Files changed (1):
  1. pipeline.py +36 -24
pipeline.py CHANGED
@@ -2,8 +2,8 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from typing import Dict, List, Any
 import itertools
 from nltk import sent_tokenize
+# import torch
 import nltk
-import torch
 
 class PreTrainedPipeline():
 
@@ -11,16 +11,20 @@ class PreTrainedPipeline():
         # IMPLEMENT_THIS
         # Preload all the elements you are going to need at inference.
         # For instance your model, processors, tokenizer that might be needed.
         # This function is only called once, so do all the heavy processing I/O here"""
         nltk.download('punkt')
         self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
         self.tokenizer = AutoTokenizer.from_pretrained(path)
 
         self.model_type="t5"
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = "cpu"
+
+        self.model.to(self.device)
+
 
 
-    def __call__(self, inputs: str):
+    def __call__(self, inputs: str, max_words_per_answer: int = 3):
         if len(inputs) == 0: return []
         inputs = " ".join(inputs.split())
         sents, answers = self._extract_answers(inputs)
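
For orientation, here is a minimal usage sketch of the updated entry point. It is not part of the commit: the `__init__(self, path)` signature is assumed from the generic Inference API template (not shown in this diff), and the checkpoint path is hypothetical. Note that `self.model.to(self.device)` still works with `import torch` commented out, since the loaded model is itself a torch module.

```python
# Minimal sketch, not part of the commit. Assumes __init__(self, path)
# (from the Inference API template, not visible in this diff) and a
# hypothetical local seq2seq question-generation checkpoint.
from pipeline import PreTrainedPipeline

pipe = PreTrainedPipeline(path="./qg-checkpoint")  # hypothetical path
text = "The Eiffel Tower was completed in 1889. It is located in Paris."

# max_words_per_answer is the knob this commit introduces: QA pairs whose
# answer exceeds the word limit are dropped by clean_generated_QAs.
qa_pairs = pipe(text, max_words_per_answer=3)
# e.g. [{'answer': '1889', 'question': 'When was the Eiffel Tower completed?'}]
```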
@@ -29,16 +33,29 @@ class PreTrainedPipeline():
         if len(flat_answers) == 0:
             return []
 
+        questions, qg_examples = self.prepare_and_generate_questions(sents, answers)
+        output = [{'answer': example['answer'], 'question': que} for example, que in zip(qg_examples, questions)]
+        output = self.clean_generated_QAs(output, max_words_per_answer)
+        return output
+
+    def prepare_and_generate_questions(self, sents, answers):
         qg_examples = self._prepare_inputs_for_qg_from_answers_hl(sents, answers)
 
         qg_inputs = [example['source_text'] for example in qg_examples]
         questions = self._generate_questions(qg_inputs)
-        output = [{'answer': example['answer'], 'question': que} for example, que in zip(qg_examples, questions)]
-        output = self.clean_generated_QAs(output)
-        return output
+        return questions, qg_examples
+
+
+    def clean_answers_list_of_lists(self, answers):
+        clean_answers = []
+        for answer_list in answers:
+            answer_list = answer_list[:-1]
+            answer_list = list(set([a.strip() for a in answer_list]))
+            clean_answers.append(answer_list)
+        return clean_answers
+
 
     def _extract_answers(self, context):
-        print("_extract_answers")
         sents, inputs = self._prepare_inputs_for_ans_extraction(context)
         inputs = self._tokenize(inputs, padding=True, truncation=True)
 
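The new `clean_answers_list_of_lists` helper drops the remnant left after the final `<sep>` and de-duplicates answers once stripped. A standalone sketch of the same logic on made-up decoder output:

```python
def clean_answers_list_of_lists(answers):
    # Mirror of the method added above, outside the class for illustration.
    clean_answers = []
    for answer_list in answers:
        answer_list = answer_list[:-1]  # drop the piece after the last <sep>
        answer_list = list(set([a.strip() for a in answer_list]))  # strip + dedupe
        clean_answers.append(answer_list)
    return clean_answers

# Made-up split output: a duplicate answer plus the trailing '</s>' remnant.
print(clean_answers_list_of_lists([[' Paris ', ' Paris', ' 1889', '</s>']]))
# -> [['Paris', '1889']] (set-based dedupe, so element order may vary)
```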
@@ -50,13 +67,14 @@ class PreTrainedPipeline():
 
         dec = [self.tokenizer.decode(ids, skip_special_tokens=False) for ids in outs]
         answers = [item.split('<sep>') for item in dec]
-        answers = [i[:-1] for i in answers]
+
+        answers = self.clean_answers_list_of_lists(answers)
 
         return sents, answers
 
+
 
     def _prepare_inputs_for_ans_extraction(self, text):
-        print("_prepare_inputs_for_ans_extraction")
         sents = sent_tokenize(text)
 
         inputs = []
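
Why a trailing element needs dropping at all: with `skip_special_tokens=False`, the decoded answer-extraction output for these models typically looks like `<pad> 1889 <sep> Paris <sep></s>` (format inferred from the `<sep>`/`<pad>` handling in this file), so splitting on `<sep>` always leaves a junk tail:

```python
# Assumed decoder output shape, inferred from the <sep>/<pad> handling above.
dec = "<pad> 1889 <sep> Paris <sep></s>"
parts = dec.split('<sep>')
print(parts)       # ['<pad> 1889 ', ' Paris ', '</s>']
print(parts[:-1])  # ['<pad> 1889 ', ' Paris ']  -- then stripped and deduped
```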
@@ -93,7 +111,6 @@ class PreTrainedPipeline():
         return inputs
 
     def _generate_questions(self, inputs):
-        print("_generate_questions")
         inputs = self._tokenize(inputs, padding=True, truncation=True)
 
         outs = self.model.generate(
@@ -105,11 +122,8 @@ class PreTrainedPipeline():
 
         questions = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
         return questions
-
-
 
     def _prepare_inputs_for_qg_from_answers_hl(self, sents, answers):
-        print("_prepare_inputs_for_qg_from_answers_hl")
         inputs = []
         for i, answer in enumerate(answers):
             if len(answer) == 0: continue
@@ -118,8 +132,6 @@ class PreTrainedPipeline():
             sents_copy = sents[:]
             answer_text = self.remove_pad(answer_text)
             answer_text = answer_text.strip()
-            print("Answer", answer)
-            print("Answer text", answer_text)
 
             try:
                 ans_start_idx = sent.lower().index(answer_text.lower())
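
For reference, the "hl" in `_prepare_inputs_for_qg_from_answers_hl` is the highlight format used by these T5 question-generation models: the answer span is wrapped in `<hl>` tokens inside its sentence. Only the `lower().index()` lookup is visible in this diff, so the exact template below (token spelling, any task prefix) is an assumption:

```python
# Rough sketch of the highlight step; the <hl> token and template are
# assumptions, only the index lookup appears in the diff above.
sent = "The Eiffel Tower was completed in 1889."
answer_text = "1889"

ans_start_idx = sent.lower().index(answer_text.lower())
end = ans_start_idx + len(answer_text)
highlighted = sent[:ans_start_idx] + "<hl> " + sent[ans_start_idx:end] + " <hl>" + sent[end:]
print(highlighted)  # The Eiffel Tower was completed in <hl> 1889 <hl>.
```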
@@ -139,13 +151,14 @@ class PreTrainedPipeline():
 
         return inputs
 
-    def clean_generated_QAs(self, generated_QAs):
+    def clean_generated_QAs(self, generated_QAs, max_words_per_answer):
         clean_QAs = []
         answers_used = set()
         # Only allow 1 question per answer, take the first case of it
         for qa in generated_QAs:
-            if qa['answer'] in answers_used:
-                break
+            answer_word_length = len(qa['answer'].strip().split())
+            if qa['answer'] in answers_used or answer_word_length > max_words_per_answer:
+                continue
             answers_used.add(qa['answer'])
             clean_QAs.append(qa)
         return clean_QAs
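
The `break` -> `continue` switch fixes a real bug: previously the first repeated answer terminated the loop and silently discarded every remaining QA pair, rather than skipping just the duplicate. A standalone sketch of the corrected filter on made-up data:

```python
def clean_generated_QAs(generated_QAs, max_words_per_answer):
    # Mirror of the updated method, outside the class for illustration.
    clean_QAs, answers_used = [], set()
    for qa in generated_QAs:
        answer_word_length = len(qa['answer'].strip().split())
        if qa['answer'] in answers_used or answer_word_length > max_words_per_answer:
            continue  # `break` here used to drop everything after the first duplicate
        answers_used.add(qa['answer'])
        clean_QAs.append(qa)
    return clean_QAs

qas = [
    {'answer': 'Paris', 'question': 'Where is the Eiffel Tower?'},
    {'answer': 'Paris', 'question': 'Which city hosts it?'},    # duplicate: skipped
    {'answer': 'a very long answer span', 'question': '...'},   # > 3 words: skipped
    {'answer': '1889', 'question': 'When was it completed?'},   # kept; `break` lost this
]
print(clean_generated_QAs(qas, max_words_per_answer=3))
```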
@@ -153,5 +166,4 @@ class PreTrainedPipeline():
     def remove_pad(self, str):
         if "<pad>" in str:
             return str.replace("<pad>", "")
         return str
-
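
For completeness, `remove_pad` strips the literal `<pad>` marker that decoding with `skip_special_tokens=False` leaves behind. The parameter name shadows the built-in `str`, which is worth renaming at some point; an equivalent standalone version:

```python
def remove_pad(text):  # renamed from `str` to avoid shadowing the builtin
    if "<pad>" in text:
        return text.replace("<pad>", "")
    return text

print(repr(remove_pad("<pad> 1889")))  # ' 1889'
print(repr(remove_pad("1889")))        # '1889'
```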