"""Question generation and QA-pair scoring built on Hugging Face models.

``QuestionGenerator`` turns a text into questions with a T5 seq2seq model;
``QAEvaluator`` scores question/answer pairs with a BERT classifier;
``print_qa`` pretty-prints a list of question/answer dictionaries.
"""

import os
import sys
import math
import re
import random
import json
from string import punctuation

import numpy as np
import torch
import spacy
import en_core_web_sm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification


class QuestionGenerator():
    """Generate questions from a text with a pretrained T5 model."""

    # Answer styles accepted by generate() / generate_qg_inputs().
    VALID_ANSWER_STYLES = ['all', 'sentences', 'multiple_choice']

    def __init__(self, model_dir=None):
        """Load the question-generation model and the QA evaluator.

        Args:
            model_dir: Unused here; forwarded to ``QAEvaluator``.
        """
        QG_PRETRAINED = 'iarfmoose/t5-base-question-generator'
        # NOTE(review): both markers are empty strings here, so model inputs
        # are just "<answer text> <context text>" joined by spaces. The model
        # card for the pretrained checkpoint presumably expects explicit
        # answer/context markers -- confirm these were not lost in a paste.
        self.ANSWER_TOKEN = ''
        self.CONTEXT_TOKEN = ''
        self.SEQ_LENGTH = 512

        # Inference is pinned to the CPU.
        self.device = torch.device('cpu')

        self.qg_tokenizer = AutoTokenizer.from_pretrained(QG_PRETRAINED)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)

        self.qa_evaluator = QAEvaluator(model_dir)

    def generate(self, article, use_evaluator=True, num_questions=None, answer_style='all'):
        """Generate a list of unique question strings for ``article``.

        Args:
            article: Source text to generate questions from.
            use_evaluator: Accepted for backward compatibility; currently
                ignored because this method returns the raw questions
                without scoring them.
            num_questions: Maximum number of questions to return, or
                ``None`` for no limit.
            answer_style: One of ``'all'``, ``'sentences'`` or
                ``'multiple_choice'``.

        Returns:
            A list of question strings, each ending in a single ``'?'``.

        Raises:
            ValueError: If ``answer_style`` is not a valid style.
        """
        print("Generating questions...\n")

        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        print("qg_inputs, qg_answers=>", qg_inputs, qg_answers)
        generated_questions = self.generate_questions_from_inputs(qg_inputs, num_questions)
        print("generated_questions(generate)=>", generated_questions)
        return generated_questions

    def generate_qg_inputs(self, text, answer_style):
        """Build model inputs and their matching answers for ``text``.

        Args:
            text: Source text.
            answer_style: ``'sentences'`` uses whole sentences as answers,
                ``'multiple_choice'`` uses named entities, ``'all'`` does
                both.

        Returns:
            A ``(inputs, answers)`` tuple of equal-length lists.

        Raises:
            ValueError: If ``answer_style`` is not a valid style.
        """
        if answer_style not in self.VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style,
                    self.VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style in ('sentences', 'all'):
            for segment in self._split_into_segments(text):
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(sentences, segment)
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style in ('multiple_choice', 'all'):
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(sentences)
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs, num_questions):
        """Generate unique questions from prepared model inputs.

        Args:
            qg_inputs: Iterable of model input strings.
            num_questions: Maximum number of questions to produce (anything
                ``int()`` accepts), or ``None`` for no limit.

        Returns:
            A list of distinct question strings in generation order.
        """
        # None means "no limit"; int(None) used to raise TypeError here.
        limit = None if num_questions is None else int(num_questions)
        generated_questions = []
        print("num que => ", num_questions)

        for qg_input in qg_inputs:
            if limit is not None and len(generated_questions) >= limit:
                break
            question = self._generate_question(qg_input)
            # Normalise to exactly one trailing question mark: drop outer
            # whitespace and punctuation, then append a single '?'.
            question = question.strip().strip(punctuation) + "?"
            if question not in generated_questions:
                generated_questions.append(question)
                print("question ===> ", question)

        return generated_questions

    def _split_text(self, text):
        """Split ``text`` into unique sentences plus fragments of long ones.

        Sentences longer than ``MAX_SENTENCE_LEN`` characters are also split
        on ``,;:)``; fragments of five words or fewer are dropped.

        Returns:
            A list of distinct strings in first-seen order.
        """
        MAX_SENTENCE_LEN = 128

        sentences = re.findall(r'.*?[.!?]', text)

        cut_sentences = []
        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split('[,;:)]', sentence))
        # temporary solution to remove useless post-quote sentence fragments
        cut_sentences = [s for s in cut_sentences if len(s.split(" ")) > 5]

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(s.strip(" ") for s in sentences + cut_sentences))

    def _split_into_segments(self, text):
        """Group the paragraphs of ``text`` into model-sized segments.

        Paragraphs (split on newlines, empty ones skipped) are tokenized and
        appended to the current segment until it holds at least
        ``MAX_TOKENS`` tokens; a paragraph is never split.

        Returns:
            A list of decoded segment strings.
        """
        MAX_TOKENS = 490

        tokenized_paragraphs = [
            self.qg_tokenizer(p)['input_ids'] for p in text.split('\n') if p
        ]

        segments = []
        current = []
        for tokens in tokenized_paragraphs:
            current.extend(tokens)
            if len(current) >= MAX_TOKENS:
                segments.append(current)
                current = []
        if current:
            segments.append(current)

        # Skip special tokens so end-of-sequence markers added per paragraph
        # are not written back into the context text.
        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

    def _format_qg_input(self, answer, context):
        """Return the model input string for one answer/context pair."""
        return '{} {} {} {}'.format(
            self.ANSWER_TOKEN,
            answer,
            self.CONTEXT_TOKEN,
            context
        )

    def _prepare_qg_inputs(self, sentences, text):
        """Build one model input per sentence, with ``text`` as context.

        Returns:
            ``(inputs, answers)`` where each answer is the sentence itself.
        """
        inputs = [self._format_qg_input(sentence, text) for sentence in sentences]
        answers = list(sentences)
        return inputs, answers

    def _prepare_qg_inputs_MC(self, sentences):
        """Build one model input per named entity found in ``sentences``.

        Returns:
            ``(inputs, answers)`` where each answer is a shuffled list of
            multiple-choice dictionaries from ``_get_MC_answers``.
        """
        # Loading the spaCy pipeline is expensive, so cache it per instance.
        spacy_nlp = getattr(self, '_spacy_nlp', None)
        if spacy_nlp is None:
            spacy_nlp = en_core_web_sm.load()
            self._spacy_nlp = spacy_nlp

        docs = list(spacy_nlp.pipe(sentences, disable=['parser']))
        inputs_from_text = []
        answers_from_text = []

        for sentence, doc in zip(sentences, docs):
            for entity in doc.ents:
                inputs_from_text.append(self._format_qg_input(entity, sentence))
                answers_from_text.append(self._get_MC_answers(entity, docs))

        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer, docs):
        """Build shuffled multiple-choice options around ``correct_answer``.

        Up to three distractors are drawn from the other entities in
        ``docs``, preferring entities with the same NER label.

        Args:
            correct_answer: Entity object exposing ``text`` and ``label_``.
            docs: Parsed documents whose ``ents`` form the distractor pool.

        Returns:
            A shuffled list of ``{'answer': str, 'correct': bool}`` dicts.
        """
        correct_label = correct_answer.label_

        # Unique (text, label) pairs, minus the correct answer. Sorting gives
        # random.sample a sequence (sets are rejected on Python 3.11+) and a
        # reproducible order under a fixed seed.
        pool = {(ent.text, ent.label_) for doc in docs for ent in doc.ents}
        pool.discard((correct_answer.text, correct_label))
        candidates = sorted(pool)
        num_choices = min(3, len(candidates))

        # Compare labels exactly rather than by substring.
        matches = [c for c in candidates if c[1] == correct_label]

        if len(matches) < num_choices:
            # Not enough same-label entities: top up with random others.
            others = [c for c in candidates if c[1] != correct_label]
            choices = matches + random.sample(others, num_choices - len(matches))
        else:
            choices = random.sample(matches, num_choices)

        final_choices = [{'answer': correct_answer.text, 'correct': True}]
        final_choices.extend({'answer': text, 'correct': False} for text, _ in choices)
        random.shuffle(final_choices)
        return final_choices

    def _generate_question(self, qg_input):
        """Run the model on one input string and return the decoded text."""
        self.qg_model.eval()
        encoded_input = self._encode_qg_input(qg_input)
        with torch.no_grad():
            # Inputs are padded to SEQ_LENGTH, so pass the mask explicitly.
            output = self.qg_model.generate(
                input_ids=encoded_input['input_ids'],
                attention_mask=encoded_input['attention_mask']
            )
        # Skip special tokens so pad/end markers do not leak into the text.
        return self.qg_tokenizer.decode(output[0], skip_special_tokens=True)

    def _encode_qg_input(self, qg_input):
        """Tokenize ``qg_input`` to ``SEQ_LENGTH`` tokens on ``self.device``."""
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

    def _get_ranked_qa_pairs(self, generated_questions, qg_answers, scores, num_questions=10):
        """Return the top ``num_questions`` QA pairs in ranked order.

        Args:
            generated_questions: Question strings.
            qg_answers: Answers aligned with ``generated_questions``.
            scores: Indices into the two lists, best first.
            num_questions: Number of pairs wanted; capped at ``len(scores)``.

        Returns:
            A list of ``{'question': ..., 'answer': ...}`` dicts.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            print("\nWas only able to generate {} questions. For more questions, please input a longer text.".format(num_questions))

        return [
            self._make_dict(
                generated_questions[index].split('?')[0] + '?',
                qg_answers[index]
            )
            for index in scores[:num_questions]
        ]

    def _get_all_qa_pairs(self, generated_questions, qg_answers):
        """Pair questions with answers by position, in generation order."""
        qa_list = []
        for i, question in enumerate(generated_questions):
            qa_list.append(self._make_dict(question.split('?')[0] + '?', qg_answers[i]))
        return qa_list

    def _make_dict(self, question, answer):
        """Wrap a question and its answer in a dictionary."""
        return {'question': question, 'answer': answer}


class QAEvaluator():
    """Score question/answer pairs with a pretrained BERT classifier."""

    def __init__(self, model_dir=None):
        """Load the evaluator model and tokenizer.

        Args:
            model_dir: Unused; kept for interface compatibility.
        """
        QAE_PRETRAINED = 'iarfmoose/bert-base-cased-qa-evaluator'
        self.SEQ_LENGTH = 512

        # Inference is pinned to the CPU.
        self.device = torch.device('cpu')

        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(QAE_PRETRAINED)
        self.qae_model.to(self.device)

    def encode_qa_pairs(self, questions, answers):
        """Encode each question with its answer (matched by position)."""
        encoded_pairs = []
        for i, question in enumerate(questions):
            encoded_qa = self._encode_qa(question, answers[i])
            encoded_pairs.append(encoded_qa.to(self.device))
        return encoded_pairs

    def get_scores(self, encoded_qa_pairs):
        """Rank encoded pairs by evaluator score.

        Returns:
            Indices into ``encoded_qa_pairs`` sorted from highest to lowest
            score; ties keep their original relative order.
        """
        self.qae_model.eval()
        with torch.no_grad():
            scores = [float(self._evaluate_qa(pair)) for pair in encoded_qa_pairs]

        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    def _encode_qa(self, question, answer):
        """Tokenize a question with its (correct) answer as a text pair.

        Args:
            question: Question string.
            answer: Either an answer string, or a list of multiple-choice
                dicts with ``'answer'`` and ``'correct'`` keys.

        Raises:
            ValueError: If ``answer`` is a list with no correct choice.
        """
        if isinstance(answer, list):
            correct_answer = None
            for choice in answer:
                if choice['correct']:
                    correct_answer = choice['answer']
            if correct_answer is None:
                raise ValueError("Multiple-choice answer list has no correct choice")
        else:
            correct_answer = answer

        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

    def _evaluate_qa(self, encoded_qa_pair):
        """Return element ``[0][0][1]`` of the model output for one pair."""
        output = self.qae_model(**encoded_qa_pair)
        return output[0][0][1]


def print_qa(qa_list, show_answers=True):
    """Print numbered questions and, optionally, their answers.

    Args:
        qa_list: List of ``{'question': ..., 'answer': ...}`` dicts, where
            an answer is either a string or a list of multiple-choice dicts
            with ``'answer'`` and ``'correct'`` keys.
        show_answers: When False, full-sentence answers are hidden and
            multiple-choice options are listed without the correct marker.
    """
    for number, qa in enumerate(qa_list, start=1):
        # wider space for 2 digit q nums
        space = ' ' * (3 if number < 10 else 4)
        # Continuation lines are indented to line up under the "A: " prefix.
        option_indent = space + ' ' * len('A: ')

        print('{}) Q: {}'.format(number, qa['question']))

        answer = qa['answer']

        # print a list of multiple choice answers
        if isinstance(answer, list):
            for j, choice in enumerate(answer, start=1):
                if j == 1:
                    prefix = '{}A: 1.'.format(space)
                else:
                    prefix = '{}{}.'.format(option_indent, j)
                if show_answers:
                    print(prefix, choice['answer'], '(correct)' if choice['correct'] else '')
                else:
                    print(prefix, choice['answer'])
            print('')

        # print full sentence answers
        elif show_answers:
            print('{}A:'.format(space), answer, '\n')