|
import numpy as np |
|
import pandas as pd |
|
import time |
|
import torch |
|
from transformers import T5ForConditionalGeneration,T5Tokenizer |
|
import random |
|
import spacy |
|
import zipfile |
|
import os |
|
# Environment bootstrap: each command is handed to the shell via os.system,
# in this order, every time the module is imported. Exit statuses are ignored.
# NOTE(review): installing packages and fetching archives on import is slow
# and needs network access -- consider moving this to a one-off setup step.
for _bootstrap_command in (
    'pip install git+https://github.com/boudinfl/pke.git',
    'python -m nltk.downloader universal_tagset',
    'python -m spacy download en',
    'wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz',
    'tar -xvf s2v_reddit_2015_md.tar.gz',
    'python -m spacy download en_core_web_sm',
):
    os.system(_bootstrap_command)
|
import git |
|
import json |
|
from sense2vec import Sense2Vec |
|
import requests |
|
from collections import OrderedDict |
|
import string |
|
import pke |
|
import nltk |
|
import numpy |
|
import en_core_web_sm |
|
from nltk import FreqDist |
|
# Fetch the NLTK resources requested by this module, in the original order.
# quiet=True and force=True are passed to every download call.
for _nltk_resource in ('brown', 'stopwords', 'popular'):
    nltk.download(_nltk_resource, quiet=True, force=True)
|
from nltk.corpus import stopwords |
|
from nltk.corpus import brown |
|
from similarity.normalized_levenshtein import NormalizedLevenshtein |
|
from nltk.tokenize import sent_tokenize |
|
from flashtext import KeywordProcessor |
|
from encoding import beam_search_decoding |
|
from mcq import tokenize_sentences |
|
from mcq import get_keywords |
|
from mcq import get_sentences_for_keyword |
|
from mcq import generate_questions_mcq |
|
from mcq import generate_normal_questions |
|
import time |
|
# Load the T5 tokenizer and the question-generation checkpoint once, at
# import time, and move the model to the GPU when CUDA is available.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Shared resources that predict_mcq passes to the mcq helpers.
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')  # expects an 's2v_old' directory in the working directory
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()
|
def set_seed(seed):
    """Seed every random number generator this module has in scope.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch;
    when CUDA is available, all CUDA devices are seeded as well.

    Args:
        seed: Integer seed applied to each generator.
    """
    # The stdlib generator is seeded too, so random.choice / random.shuffle
    # style sampling is reproducible alongside NumPy and PyTorch.
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Fix the seed once at import time.
set_seed(42)
|
|
|
|
|
|
|
def predict_mcq(payload):
    """Generate multiple-choice questions from the text in ``payload``.

    Args:
        payload: Mapping read with ``.get``. ``"input_text"`` holds the source
            text; ``"max_questions"`` (default 10) is forwarded to
            ``get_keywords``.

    Returns:
        dict: Empty when the input text is missing or empty, when no keyword
        maps to a sentence, or when question generation raises. Otherwise it
        contains ``"statement"`` (the tokenized sentences joined with single
        spaces), ``"questions"`` (the generator's ``"questions"`` entry) and
        ``"time_taken"`` (elapsed seconds as a float).
    """
    import logging

    start = time.time()
    final_output = {}

    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 10)

    # Nothing to tokenize: answer like the "no keywords found" path instead
    # of failing inside the tokenizer.
    if not text:
        return final_output

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp,
        modified_text,
        max_questions,
        s2v,
        fdist,
        normalized_levenshtein,
        len(sentences),
    )

    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    # Collapse each keyword's sentence list into one snippet built from at
    # most its first three sentences. Only existing keys are reassigned, so
    # updating the mapping while iterating it is safe.
    for keyword, matched_sentences in keyword_sentence_mapping.items():
        keyword_sentence_mapping[keyword] = " ".join(matched_sentences[:3])

    if not keyword_sentence_mapping:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping,
            device,
            tokenizer,
            model,
            s2v,
            normalized_levenshtein,
        )
    except Exception:
        # Best effort: callers still get an empty result, but the failure is
        # logged and KeyboardInterrupt / SystemExit are no longer swallowed.
        logging.getLogger(__name__).exception("MCQ generation failed")
        return final_output

    end = time.time()

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = end - start

    # Check the module-level device instance; comparing the torch.device
    # class itself to 'cuda' is always False, so the cache was never freed.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output