# One-time environment setup. These are plain shell commands; the original used
# Jupyter-style "!" prefixes, which do not work under os.system(). They must run
# before the imports below that depend on them (pke, sense2vec data).
import os

os.system('pip install git+https://github.com/boudinfl/pke.git')
os.system('python -m nltk.downloader universal_tagset')
os.system('python -m spacy download en_core_web_sm')  # the bare "en" shortcut is deprecated in spaCy 3
os.system('wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz')
os.system('tar -xvf s2v_reddit_2015_md.tar.gz')

import time
import json
import random
import string
import zipfile
from collections import OrderedDict

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
import torch
import spacy
import pke
import nltk
from nltk import FreqDist

nltk.download('brown', quiet=True, force=True)
nltk.download('stopwords', quiet=True, force=True)
nltk.download('popular', quiet=True, force=True)

from nltk.corpus import stopwords, brown
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
from transformers import T5ForConditionalGeneration, T5Tokenizer

from encoding import beam_search_decoding
from mcq import (tokenize_sentences, get_keywords, get_sentences_for_keyword,
                 generate_questions_mcq, generate_normal_questions)

# Load the T5 tokenizer and the fine-tuned question-generation checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: disable dropout for stable generation

nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')  # directory unpacked from s2v_reddit_2015_md.tar.gz
fdist = FreqDist(brown.words())         # word frequencies used for keyword scoring
normalized_levenshtein = NormalizedLevenshtein()


def set_seed(seed):
    """Seed numpy and torch for reproducible generation."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)


def predict_mcq(payload):
    """Generate multiple-choice questions from payload['input_text']."""
    start = time.time()
    inp = {
        "input_text": payload.get("input_text"),
        "max_questions": payload.get("max_questions", 4)
    }

    text = inp['input_text']
    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(nlp, modified_text, inp['max_questions'],
                            s2v, fdist, normalized_levenshtein, len(sentences))
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Keep at most three sentences of context per keyword.
    for k in keyword_sentence_mapping.keys():
        text_snippet = " ".join(keyword_sentence_mapping[k][:3])
        keyword_sentence_mapping[k] = text_snippet

    final_output = {}
    if len(keyword_sentence_mapping) == 0:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping, device, tokenizer, model,
            s2v, normalized_levenshtein)
    except Exception:
        # Generation failed; return the empty result rather than crashing.
        return final_output

    end = time.time()
    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = end - start

    # Fixed: the original compared the torch.device class itself to 'cuda',
    # which is always False; check the active device's type instead.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output
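
# Illustrative usage sketch (the payload shape matches predict_mcq above;
# the sample text and values are stand-ins, not from the original):
#
#   payload = {"input_text": "The Nile is the longest river in Africa. "
#                            "It flows through eleven countries.",
#              "max_questions": 2}
#   out = predict_mcq(payload)
#   # out -> {"statement": ..., "questions": [...], "time_taken": ...}
#   # out is {} when no keywords were found or generation failed.
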
def predict_shortq(payload):
    """Generate short-answer questions from payload['input_text']."""
    inp = {
        "input_text": payload.get("input_text"),
        "max_questions": payload.get("max_questions", 4)
    }

    text = inp['input_text']
    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(nlp, modified_text, inp['max_questions'],
                            s2v, fdist, normalized_levenshtein, len(sentences))
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Keep at most three sentences of context per keyword.
    for k in keyword_sentence_mapping.keys():
        text_snippet = " ".join(keyword_sentence_mapping[k][:3])
        keyword_sentence_mapping[k] = text_snippet

    final_output = {}
    if len(keyword_sentence_mapping) == 0:
        print('ZERO')  # no keywords found; nothing to generate
        return final_output

    generated_questions = generate_normal_questions(
        keyword_sentence_mapping, device, tokenizer, model)
    print(generated_questions)

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]

    # Fixed: the original compared the torch.device class itself to 'cuda'.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output


def paraphrase(payload):
    """Paraphrase payload['input_text'] into up to max_questions variants."""
    inp = {
        "input_text": payload.get("input_text"),
        "max_questions": payload.get("max_questions", 3)
    }

    sentence = inp['input_text']
    num = inp['max_questions']
    # The original concatenated a stray trailing space here (likely a lost
    # "</s>" marker); recent T5 tokenizers append the EOS token themselves,
    # so the task prefix alone is enough.
    text = "paraphrase: " + sentence

    # padding='max_length' replaces the deprecated pad_to_max_length=True;
    # 512 is the t5-large model maximum.
    encoding = tokenizer.encode_plus(text, padding='max_length', max_length=512,
                                     truncation=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=50,
        num_beams=50,  # wide beam: expensive, but yields enough distinct candidates
        num_return_sequences=num,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    final_outputs = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
        # Keep only paraphrases that differ from the input and from each other.
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    output = {}
    output['Question'] = sentence  # report the raw question, not the "paraphrase:" prompt
    output['Count'] = num
    output['Paraphrased Questions'] = final_outputs

    for i, final_output in enumerate(final_outputs):
        # Fixed: the original format string printed only the index and dropped
        # the paraphrase.
        print("{}: {}".format(i, final_output))

    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return output
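
# A minimal smoke test, assuming the models above loaded successfully. The
# sample payloads are hypothetical; any short factual passage works.
if __name__ == "__main__":
    sample = {
        "input_text": ("Sachin Ramesh Tendulkar is a former international "
                       "cricketer from India. He is widely regarded as one of "
                       "the greatest batsmen in the history of cricket."),
        "max_questions": 2,
    }

    print(json.dumps(predict_mcq(sample), indent=2))
    print(json.dumps(predict_shortq(sample), indent=2))
    print(json.dumps(paraphrase({"input_text": "Who is the best batsman in cricket?",
                                 "max_questions": 3}), indent=2))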