import transformers
from transformers import (
    # Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM as alwm,
    # TokenClassificationPipeline,
    # AutoModelForTokenClassification,
    AutoModelForQuestionAnswering as amqa,
    AutoTokenizer as att,
    # BertTokenizer,
    AlbertTokenizer,
    # BertForQuestionAnswering,
    # AlbertForQuestionAnswering,
    # T5Config,
    # T5ForConditionalGeneration, 
    T5TokenizerFast,
    PreTrainedTokenizer,
    PreTrainedModel,
    ElectraTokenizer as et,
    # ElectraForQuestionAnswering
)
import torch
import sentencepiece
import string
import numpy as np
from transformers import pipeline
# from transformers.pipelines import pipeline
import pickle
import streamlit as st

# sq_tokenizer = att.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# sq_model = alwm.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# text= "The abolition of feudal privileges by the National Constituent Assembly on 4 August 1789 and the Declaration \\nof the Rights of Man and of the Citizen (La Déclaration des Droits de l'Homme et du Citoyen), drafted by Lafayette \\nwith the help of Thomas Jefferson and adopted on 26 August, paved the way to a Constitutional Monarchy \\n(4 September 1791 – 21 September 1792). Despite these dramatic changes, life at the court continued, while the situation \\nin Paris was becoming critical because of bread shortages in September. On 5 October 1789, a crowd from Paris descended upon Versailles \\nand forced the royal family to move to the Tuileries Palace in Paris, where they lived under a form of house arrest under \\nthe watch of Lafayette's Garde Nationale, while the Comte de Provence and his wife were allowed to reside in the \\nPetit Luxembourg, where they remained until they went into exile on 20 June 1791."
# hftokenizer = pickle.load(open('models/hftokenizer.sav', 'rb'))
# hfmodel = pickle.load(open('models/hfmodel.sav', 'rb'))

def load_model():
  # The .sav files are pickled checkpoints shipped alongside the app:
  # hfmodel pairs with the T5 tokenizer (question generation) and
  # model pairs with the ALBERT SQuAD2 tokenizer (extractive QA).
  with open('hfmodel.sav', 'rb') as f:
    hfm = pickle.load(f)
  hft = T5TokenizerFast.from_pretrained("t5-base")
  with open('model.sav', 'rb') as f:
    model = pickle.load(f)
  tok = AlbertTokenizer.from_pretrained("ahotrod/albert_xxlargev1_squad2_512")
  return hfm, hft, tok, model

hfmodel, hftokenizer, tokenizer, model = load_model()

def run_model(input_string, **generator_args):
  # Default generation settings; caller-supplied kwargs override them.
  generator_args = {
      "max_length": 256,
      "num_beams": 4,
      "length_penalty": 1.5,
      "no_repeat_ngram_size": 3,
      "early_stopping": True,
      **generator_args,
  }
  input_string = "generate questions: " + input_string + " </s>"
  input_ids = hftokenizer.encode(input_string, return_tensors="pt")
  res = hfmodel.generate(input_ids, **generator_args)
  output = hftokenizer.batch_decode(res, skip_special_tokens=True)
  # The generator emits all questions in one string, joined by <sep>.
  output = [item.split("<sep>") for item in output]
  return output
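# A minimal usage sketch (hypothetical passage; the decoded questions depend
# on the pickled checkpoint, so the shown result is illustrative only):
# qs = run_model("The first program written in C printed Hello World.")
# print(qs)  # e.g. [['What did the first C program print?', ...]]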



# al_tokenizer = att.from_pretrained("deepset/electra-base-squad2")
# al_model = amqa.from_pretrained("deepset/electra-base-squad2")
# al_model = pickle.load(open('models/al_model.sav', 'rb'))
# al_tokenizer = pickle.load(open('models/al_tokenizer.sav', 'rb'))
def QA(question, context):
  # model_name="deepset/electra-base-squad2"
  # nlp = pipeline("question-answering",model=model,tokenizer = tok)
  # format = {
  #     'question':question,
  #     'context':context
  # }
  # res = nlp(format)
  # output = f"{question}\n{string.capwords(res['answer'])}\n"
  # return output
  inputs = tokenizer(question, context, return_tensors="pt")
  # Run the model in inference mode (no gradient tracking).
  with torch.no_grad():
    output = model(**inputs)
  start_score = output.start_logits
  end_score = output.end_logits
  # Get the relevance scores over the context and pick the most probable
  # start and end token positions of the answer span.
  start = torch.argmax(start_score)
  end = torch.argmax(end_score)
  # Convert the answer-span tokens back to a string.
  predict_answer_tokens = inputs.input_ids[0, start : end + 1]
  output = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
  output = string.capwords(output)
  if not output or output.isspace():
    return f"Possible question : {question}\n Answer could not be generated accurately."
  return f"Q. {question} \n Ans. {output}"
# QA("What was the first C program","The first prgram written in C was Hello World")

def gen_question(inputs):
  questions = run_model(inputs)
  return questions

# string_query = "Hello World"
# gen_question(f"answer: {string_query} context: The first C program said {string_query}")  # the format of an answer-aware generation query

def tokenize(inputs):
    # Batch-encode with the T5 tokenizer, padding/truncating to 512 tokens.
    inputs = hftokenizer.batch_encode_plus(
        inputs,
        max_length=512,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",  # pad_to_max_length is deprecated in newer transformers
        return_tensors="pt"
    )
    return inputs
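# Usage sketch (hypothetical inputs): returns a BatchEncoding whose tensors
# have shape (batch, 512), e.g.
# tokenize(["first passage", "second passage"]).input_ids  # torch.Size([2, 512])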
    
def read_file(filepath_name):
  # Read a text file and collapse newlines into spaces to form one context.
  with open(filepath_name, "r") as infile:
    contents = infile.read()
  context = contents.replace("\n", " ")
  return context

def create_string_for_generator(context):
  gen_list = gen_question(context)
  # The first beam's first string holds every generated question; split it
  # into individual questions and restore the '?' the split strips off.
  questions = gen_list[0][0].split('? ')
  return [q if q.endswith('?') else q + '?' for q in questions]

def creator(context):
  questions = create_string_for_generator(context)
  pairs = []
  for ques in questions:
    pair = QA(ques, context)
    if len(pair) == 0:
      continue
    pairs.append(pair)
  return pairs
# creator(""""Hello, World!" program by Brian Kernighan (1978).
# A "Hello, World!" program is generally a computer program that ignores any input and outputs or displays a message similar to "Hello, World!". A small piece of code in most general-purpose programming languages, this program is used to illustrate a language's basic syntax. "Hello, World!" programs are often the first a student learns to write in a given language,[1] and they can also be used as a sanity check to ensure computer software intended to compile or run source code is correctly installed, and that its operator understands how to use it.
# """)