ashishraics committed
Commit 51fb126
1 Parent(s): 4b9b600

requirement fix

Files changed (4)
  1. app.py +8 -10
  2. extract_config.py +8 -0
  3. keywords.py +38 -8
  4. requirements.txt +3 -1
app.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 from transformers import AutoTokenizer
 from fastT5 import OnnxT5,get_onnx_runtime_sessions
+from keywords import tokenize_sentence, get_multipartiterank_topics,get_topicrank_topics,get_yake_topics
 from annotated_text import annotated_text
 import nltk
 nltk.download('stopwords')
@@ -8,15 +9,14 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
-from PIL import Image
 import logging
 import multiprocessing
 total_threads=multiprocessing.cpu_count()
 import onnxruntime as ort
+# from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
 try:
     import pke
     logging.error("importing pke info")
@@ -84,7 +84,7 @@ model_t5=OnnxT5(model_or_model_path=t5_chkpt,onnx_model_sessions=model_session)
 tokenizer_t5=AutoTokenizer.from_pretrained(t5_tokenizer)
 
 def create_question_t5(model,tokenizer,context,answer,max_length=64):
-    input = "context: %s answer: %s " % (context, answer)
+    input = "context: %s answer: %s </s>" % (context, answer)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -94,7 +94,7 @@ def create_question_t5(model,tokenizer,context,answer,max_length=64):
     return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
 
 def create_answers_t5(model,tokenizer,context,question,max_length=128):
-    input = "context: %s question: %s " % (context, question)
+    input = "context: %s question: %s </s>" % (context, question)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -124,10 +124,8 @@ c1,c2,c3=st.columns(3)
 with c1:
     create_usingkeyword = st.button("Create Questions using Keywords")
     if create_usingkeyword:
-
-        from keywords import tokenize_sentence,get_noun_adj_verb
         tokenized_sent = tokenize_sentence(input_context)
-        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
+        keywords_noun_adj_verb = get_multipartiterank_topics(input_context)
         t5_questions=[]
 
         with st.spinner("Creating Questionsssss"):
@@ -144,6 +142,6 @@ with c1:
     st.markdown("---")
 
 with c2:
-    create_usingtopics = st.button("Create Questions using Topic Modelling")
-    if create_usingtopics:
+    create_usinglongformer = st.button("Create Questions using Longformer")
+    if create_usinglongformer:
         pass
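The two `</s>` edits above make the T5 prompts end with an explicit end-of-sequence token. A minimal standalone sketch of the same prompt mechanics, substituting a plain transformers model for the app's fastT5/ONNX session (the checkpoint name here is a placeholder, not the one app.py actually loads):

# Sketch only: mirrors the prompt format create_question_t5 uses after this commit.
# "t5-small" is a placeholder checkpoint; the app loads its own ONNX-exported
# T5 model through fastT5's OnnxT5 wrapper instead.
from transformers import AutoTokenizer, T5ForConditionalGeneration

chkpt = "t5-small"  # placeholder, not the app's checkpoint
tokenizer = AutoTokenizer.from_pretrained(chkpt)
model = T5ForConditionalGeneration.from_pretrained(chkpt)

def create_question(context, answer, max_length=64):
    # the commit appends an explicit </s> end-of-sequence marker to the prompt
    prompt = "context: %s answer: %s </s>" % (context, answer)
    features = tokenizer([prompt], return_tensors="pt")
    output = model.generate(input_ids=features["input_ids"],
                            attention_mask=features["attention_mask"],
                            max_length=max_length)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)

print(create_question("The Nile is the longest river in Africa.", "the Nile"))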
extract_config.py ADDED
@@ -0,0 +1,8 @@
+from transformers import BertConfig,BertForMaskedLM
+
+config=BertConfig()
+model=BertForMaskedLM(config)
+
+print(config)
+
+print(model.config)
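Run standalone, this new helper script just instantiates an untrained BertForMaskedLM from a default BertConfig and prints the configuration twice, once directly and once via model.config; both printouts should match. With stock transformers defaults that means hidden_size=768, num_hidden_layers=12, num_attention_heads=12, and vocab_size=30522.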
keywords.py CHANGED
@@ -4,8 +4,6 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
 import logging
@@ -19,24 +17,40 @@ except:
     subprocess.run(['python3' ,'-m' ,'spacy' ,'download' ,'en'])
     import pke
 
+stoplist = list(string.punctuation)
+stoplist += pke.lang.stopwords.get('en')
+stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+stoplist += stopwords.words('english')
 
 def tokenize_sentence(text):
     sentences=sent_tokenize(text)
     sentences=[s.strip().lstrip().rstrip() for s in sentences if len(s) > 20]
     return sentences
 
-def get_noun_adj_verb(text):
+def get_multipartiterank_topics(text):
     output = []
     try:
         extractor = pke.unsupervised.MultipartiteRank()
-        extractor.load_document(input=text, language='en',normalization=None)
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
         # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
-        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
-
-        # candidate weighting,
-        extractor.candidate_weighting(threshold=0.9,method='average',alpha=1.1)
-
-        #extract top n
+        extractor.candidate_selection(pos={'NOUN','VERB','ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average',alpha=1.1)
+        keyphrases = extractor.get_n_best(n=5)
+
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
+
+def get_topicrank_topics(text):
+    output = []
+    try:
+        extractor = pke.unsupervised.TopicRank()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
+        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average')
         keyphrases = extractor.get_n_best(n=5)
 
         for val in keyphrases:
@@ -45,3 +59,19 @@ def get_noun_adj_verb(text):
         print("found exception",e)
     return list(set(output))
 
+def get_yake_topics(text):
+    #statistics model --very poor performance
+    output = []
+    try:
+        extractor = pke.unsupervised.YAKE()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        extractor.candidate_selection(n=3)
+        extractor.candidate_weighting(window=2)
+        keyphrases = extractor.get_n_best(n=5,threshold=0.9)
+
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
+
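Since the three new extractors share one signature, here is a quick sketch for comparing them on a single passage. It assumes keywords.py and its pke/spacy dependencies are importable; the sample sentence is arbitrary:

from keywords import (get_multipartiterank_topics,
                      get_topicrank_topics,
                      get_yake_topics)

sample = ("Machine learning is a field of study that builds methods "
          "which learn from data to improve performance on a task.")

for name, fn in [("MultipartiteRank", get_multipartiterank_topics),
                 ("TopicRank", get_topicrank_topics),
                 ("YAKE", get_yake_topics)]:
    # each helper returns a de-duplicated list of up to 5 keyphrases
    print(name, "->", fn(sample))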
requirements.txt CHANGED
@@ -25,4 +25,6 @@ tokenizers~=0.12.1
 flatbuffers~=1.12
 filelock~=3.6.0
 sacremoses~=0.0.53
-fastT5~=0.1.4
+fastT5~=0.1.4
+nltk~=3.6
+st-annotated-text~=3.0.0
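The two new pins cover imports the code already makes: st-annotated-text provides the annotated_text component used in app.py, and nltk backs the sentence tokenizer and stopword lists used in keywords.py. fastT5~=0.1.4 appears as removed and re-added apparently only because the previous last line lacked a trailing newline.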