ashishraics committed · Commit 51fb126 · 1 Parent(s): 4b9b600
requiremtn fix
Browse files
- app.py +8 -10
- extract_config.py +8 -0
- keywords.py +38 -8
- requirements.txt +3 -1
app.py
CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 from transformers import AutoTokenizer
 from fastT5 import OnnxT5,get_onnx_runtime_sessions
+from keywords import tokenize_sentence, get_multipartiterank_topics,get_topicrank_topics,get_yake_topics
 from annotated_text import annotated_text
 import nltk
 nltk.download('stopwords')
@@ -8,15 +9,14 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
-from PIL import Image
 import logging
 import multiprocessing
 total_threads=multiprocessing.cpu_count()
 import onnxruntime as ort
+# from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
 try:
     import pke
     logging.error("importing pke info")
@@ -84,7 +84,7 @@ model_t5=OnnxT5(model_or_model_path=t5_chkpt,onnx_model_sessions=model_session)
 tokenizer_t5=AutoTokenizer.from_pretrained(t5_tokenizer)
 
 def create_question_t5(model,tokenizer,context,answer,max_length=64):
-    input = "context: %s answer: %s " % (context, answer)
+    input = "context: %s answer: %s </s>" % (context, answer)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -94,7 +94,7 @@ def create_question_t5(model,tokenizer,context,answer,max_length=64):
     return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
 
 def create_answers_t5(model,tokenizer,context,question,max_length=128):
-    input = "context: %s question: %s " % (context, question)
+    input = "context: %s question: %s </s>" % (context, question)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -124,10 +124,8 @@ c1,c2,c3=st.columns(3)
 with c1:
     create_usingkeyword = st.button("Create Questions using Keywords")
     if create_usingkeyword:
-
-        from keywords import tokenize_sentence,get_noun_adj_verb
         tokenized_sent = tokenize_sentence(input_context)
-        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
+        keywords_noun_adj_verb = get_multipartiterank_topics(input_context)
        t5_questions=[]
 
        with st.spinner("Creating Questionsssss"):
@@ -144,6 +142,6 @@ with c1:
        st.markdown("---")
 
 with c2:
-
-    if
+    create_usinglongformer = st.button("Create Questions using Longformer")
+    if create_usinglongformer:
        pass
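Note (not part of the commit): the only functional change to the two T5 helpers above is appending an explicit </s> end-of-sequence marker to the prompt string. A minimal sketch of that prompt format, using the vanilla transformers T5 classes as a stand-in for the fastT5/ONNX session that app.py builds, and a placeholder checkpoint rather than the Space's own t5_chkpt:

from transformers import AutoTokenizer, T5ForConditionalGeneration

checkpoint = "t5-small"  # placeholder checkpoint; app.py loads its own t5_chkpt/t5_tokenizer via fastT5
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

context = "The Nile is the longest river in Africa."
answer = "the Nile"

# Same prompt format as create_question_t5 after this commit, with the explicit </s> marker.
prompt = "context: %s answer: %s </s>" % (context, answer)
features = tokenizer([prompt], return_tensors='pt')
output = model.generate(input_ids=features['input_ids'],
                        attention_mask=features['attention_mask'],
                        max_length=64)
print(tokenizer.decode(output.squeeze(), skip_special_tokens=True))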
extract_config.py
ADDED
@@ -0,0 +1,8 @@
+from transformers import BertConfig,BertForMaskedLM
+
+config=BertConfig()
+model=BertForMaskedLM(config)
+
+print(config)
+
+print(model.config)
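extract_config.py simply instantiates a default BertConfig, wraps it in BertForMaskedLM, and prints both configs. As an aside, the same pattern works with overridden hyperparameters; the values below are illustrative only and are not anything this Space uses:

from transformers import BertConfig, BertForMaskedLM

# Illustrative values only; extract_config.py itself uses the defaults.
small_config = BertConfig(hidden_size=256, num_hidden_layers=4, num_attention_heads=4)
small_model = BertForMaskedLM(small_config)

# model.config carries the same values that were passed to BertConfig.
print(small_model.config.hidden_size)        # 256
print(small_model.config.num_hidden_layers)  # 4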
keywords.py
CHANGED
@@ -4,8 +4,6 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
 import logging
@@ -19,24 +17,40 @@ except:
     subprocess.run(['python3' ,'-m' ,'spacy' ,'download' ,'en'])
     import pke
 
+stoplist = list(string.punctuation)
+stoplist += pke.lang.stopwords.get('en')
+stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+stoplist += stopwords.words('english')
 
 def tokenize_sentence(text):
     sentences=sent_tokenize(text)
     sentences=[s.strip().lstrip().rstrip() for s in sentences if len(s) > 20]
     return sentences
 
-def get_noun_adj_verb(text):
+def get_multipartiterank_topics(text):
     output = []
     try:
         extractor = pke.unsupervised.MultipartiteRank()
-        extractor.load_document(input=text, language='en',normalization=None)
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
         # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
-        extractor.candidate_selection(pos={'NOUN',
+        extractor.candidate_selection(pos={'NOUN','VERB','ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average',alpha=1.1)
+        keyphrases = extractor.get_n_best(n=5)
 
-
-
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
 
-
+def get_topicrank_topics(text):
+    output = []
+    try:
+        extractor = pke.unsupervised.TopicRank()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
+        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average')
         keyphrases = extractor.get_n_best(n=5)
 
         for val in keyphrases:
@@ -45,3 +59,19 @@ def get_noun_adj_verb(text):
         print("found exception",e)
     return list(set(output))
 
+def get_yake_topics(text):
+    #statistics model --very poor performance
+    output = []
+    try:
+        extractor = pke.unsupervised.YAKE()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        extractor.candidate_selection(n=3)
+        extractor.candidate_weighting(window=2)
+        keyphrases = extractor.get_n_best(n=5,threshold=0.9)
+
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
+
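A short sketch (not part of the commit) of how the three extractors added above might be compared on one passage. It assumes keywords.py and its dependencies (pke, the spaCy English model, the NLTK downloads) are installed, and the sample text is made up:

from keywords import (tokenize_sentence, get_multipartiterank_topics,
                      get_topicrank_topics, get_yake_topics)

passage = ("The Amazon rainforest produces a large share of the world's oxygen "
           "and is home to an enormous diversity of plant and animal species.")

print(tokenize_sentence(passage))  # sentence split used before question generation
print("MultipartiteRank:", get_multipartiterank_topics(passage))
print("TopicRank:", get_topicrank_topics(passage))
print("YAKE:", get_yake_topics(passage))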
requirements.txt
CHANGED
@@ -25,4 +25,6 @@ tokenizers~=0.12.1
 flatbuffers~=1.12
 filelock~=3.6.0
 sacremoses~=0.0.53
-fastT5~=0.1.4
+fastT5~=0.1.4
+nltk~=3.6
+st-annotated-text~=3.0.0