"""Question-generation and paraphrasing entry points backed by a T5 model.

Importing this module installs/downloads several resources, loads the
tokenizer, model, spaCy pipeline and sense2vec vectors into module-level
globals, and seeds the random number generators. The public entry points are
``predict_mcq``, ``predict_shortq`` and ``paraphrase``; each takes a payload
dictionary and returns a plain dictionary.
"""
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import spacy
import zipfile
import os
import git

# Import-time environment setup. The order matters: these commands must run
# before the imports further down that rely on the installed packages/data.
os.system('pip install git+https://github.com/boudinfl/pke.git')
os.system('python -m nltk.downloader universal_tagset')
os.system('python -m spacy download en')
os.system('wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz')
os.system('tar -xvf s2v_reddit_2015_md.tar.gz')
os.system('python -m spacy download en_core_web_sm')

import json
from sense2vec import Sense2Vec
import requests
from collections import OrderedDict
import string
import pke
import nltk
import numpy
import en_core_web_sm
from nltk import FreqDist

nltk.download('brown', quiet=True, force=True)
nltk.download('stopwords', quiet=True, force=True)
nltk.download('popular', quiet=True, force=True)

from nltk.corpus import stopwords
from nltk.corpus import brown
from similarity.normalized_levenshtein import NormalizedLevenshtein
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from encoding import beam_search_decoding
from mcq import tokenize_sentences
from mcq import get_keywords
from mcq import get_sentences_for_keyword
from mcq import generate_questions_mcq
from mcq import generate_normal_questions

# Shared, module-level resources used by every entry point below.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: no explicit model.eval() call here; per the transformers docs,
# from_pretrained() already returns the model in evaluation mode.
model.to(device)
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()


def set_seed(seed):
    """Seed NumPy and PyTorch (including every CUDA device, if available)."""
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)


def _release_gpu_memory():
    """Call ``torch.cuda.empty_cache()`` when the model lives on a CUDA device."""
    # The check must use the selected ``device`` instance: comparing the
    # ``torch.device`` class itself to 'cuda' is always False.
    if device.type == 'cuda':
        torch.cuda.empty_cache()


def _build_keyword_snippets(payload):
    """Run the shared keyword pipeline for the question generators.

    Reads ``input_text`` and ``max_questions`` (default 4) from ``payload``,
    tokenizes the text into sentences, extracts keywords and maps each
    keyword to a snippet made of up to its first three matching sentences.

    Returns:
        A ``(modified_text, keyword_sentence_mapping)`` tuple where
        ``modified_text`` is the tokenized sentences joined by single spaces
        and ``keyword_sentence_mapping`` maps each keyword to its snippet.
    """
    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 4)

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp,
        modified_text,
        max_questions,
        s2v,
        fdist,
        normalized_levenshtein,
        len(sentences),
    )
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Replace each keyword's sentence list, in place, with one text snippet.
    for keyword in keyword_sentence_mapping.keys():
        keyword_sentence_mapping[keyword] = " ".join(
            keyword_sentence_mapping[keyword][:3]
        )

    return modified_text, keyword_sentence_mapping


def predict_mcq(payload):
    """Generate multiple-choice questions for the text in ``payload``.

    Args:
        payload: mapping with ``input_text`` and, optionally,
            ``max_questions`` (defaults to 4).

    Returns:
        An empty dict when no keyword/sentence mapping is found or when
        question generation raises; otherwise a dict with the keys
        ``statement``, ``questions`` and ``time_taken`` (seconds).
    """
    start = time.time()
    modified_text, keyword_sentence_mapping = _build_keyword_snippets(payload)

    final_output = {}
    if len(keyword_sentence_mapping.keys()) == 0:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping,
            device,
            tokenizer,
            model,
            s2v,
            normalized_levenshtein,
        )
    except Exception:
        # Deliberate best-effort: a failed generation yields an empty result.
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return final_output

    end = time.time()
    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = end - start

    _release_gpu_memory()
    return final_output


def predict_shortq(payload):
    """Generate short-answer questions for the text in ``payload``.

    Args:
        payload: mapping with ``input_text`` and, optionally,
            ``max_questions`` (defaults to 4).

    Returns:
        An empty dict when no keyword/sentence mapping is found (after
        printing ``ZERO``); otherwise a dict with the keys ``statement`` and
        ``questions``. Exceptions from the generator propagate to the caller.
    """
    modified_text, keyword_sentence_mapping = _build_keyword_snippets(payload)

    final_output = {}
    if len(keyword_sentence_mapping.keys()) == 0:
        print('ZERO')
        return final_output

    generated_questions = generate_normal_questions(
        keyword_sentence_mapping, device, tokenizer, model
    )
    print(generated_questions)

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]

    _release_gpu_memory()
    return final_output


def paraphrase(payload):
    """Produce paraphrases of ``payload['input_text']`` with beam search.

    Args:
        payload: mapping with ``input_text`` and, optionally,
            ``max_questions`` (defaults to 3), which is passed to the model
            as ``num_return_sequences``.

    Returns:
        A dict with the keys ``Question`` (the prompted text, including the
        ``"paraphrase: "`` prefix), ``Count`` (the requested number) and
        ``Paraphrased Questions`` (unique decoded outputs that differ from
        the input when compared case-insensitively).
    """
    start = time.time()
    sentence = payload.get("input_text")
    num = payload.get("max_questions", 3)

    text = "paraphrase: " + sentence + " "

    encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=50,
        num_beams=50,
        num_return_sequences=num,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    final_outputs = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(
            beam_output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        # Skip outputs that echo the input or repeat an earlier paraphrase.
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    output = {}
    output['Question'] = text
    output['Count'] = num
    output['Paraphrased Questions'] = final_outputs

    for i, final_output in enumerate(final_outputs):
        # The original format string had a single placeholder, so only the
        # index was printed and the paraphrase itself was dropped.
        print("{}: {}".format(i, final_output))

    _release_gpu_memory()
    return output