mbahrami committed on
Commit
eacbe96
1 Parent(s): 4fd1747

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -12
app.py CHANGED
@@ -7,7 +7,7 @@ semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
7
 
8
  @st.cache(allow_output_mutation=True)
9
  def get_model(model):
10
- return pipeline("fill-mask", model=model, top_k=100)#seto maximum of tokens to be retrieved after each inference to model
11
 
12
 
13
  HISTORY_WEIGHT = 100 # set history weight (if found any keyword from history, it will priorities based on its weight)
@@ -15,28 +15,37 @@ HISTORY_WEIGHT = 100 # set history weight (if found any keyword from history, it
15
  st.caption("This is a simple auto-completion where the next token is predicted per probability and a weigh if appears in user's history")
16
 
17
  history_keyword_text = st.text_input("Enter users's history keywords (optional, i.e., 'Gates')", value="")
 
18
 
19
  text = st.text_input("Enter a text for auto completion...", value='Where is Bill')
20
- semantic_text = st.text_input("Enter users's history semantic (optional, i.e., 'Microsoft')", value="Microsoft")
21
 
22
- model = st.selectbox("choose a model", ["roberta-base", "bert-base-uncased"])
 
23
 
24
- data_load_state = st.text('Loading model...')
 
25
  nlp = get_model(model)
 
 
 
26
 
27
  if text:
28
- data_load_state = st.text('Inference to model...')
29
  result = nlp(text+' '+nlp.tokenizer.mask_token)
30
- data_load_state.text('')
31
-
32
- if len(semantic_text):
33
- predicted_embeddings = model.encode(result['sequence'], convert_to_tensor=True)
34
- semantic_history_embeddings = model.encode(semantic_text.spllit(','), convert_to_tensor=True)
 
35
  cosine_scores = util.cos_sim(predicted_embeddings, semantic_history_embeddings)
36
 
37
  for index, r in enumerate(result):
38
  if len(semantic_text):
39
- result[index]['score']*=cosine_scores[index][index]
 
 
40
  if r['token_str'].lower().strip() in history_keyword_text.lower().strip() and len(r['token_str'].lower().strip())>1:
41
  #found from history, then increase the score of tokens
42
  result[index]['score']*=HISTORY_WEIGHT
@@ -44,5 +53,5 @@ if text:
44
  #sort the results
45
  df=pd.DataFrame(result).sort_values(by='score', ascending=False)
46
 
47
- #show the results as a table
48
  st.table(df)
 
7
 
8
  @st.cache(allow_output_mutation=True)
9
  def get_model(model):
10
+ return pipeline("fill-mask", model=model, top_k=100)#set the maximum of tokens to be retrieved after each inference to model
11
 
12
 
13
  HISTORY_WEIGHT = 100 # set history weight (if found any keyword from history, it will priorities based on its weight)
 
15
  st.caption("This is a simple auto-completion where the next token is predicted per probability and a weigh if appears in user's history")
16
 
17
  history_keyword_text = st.text_input("Enter users's history keywords (optional, i.e., 'Gates')", value="")
18
+ #history_keyword_text=''
19
 
20
  text = st.text_input("Enter a text for auto completion...", value='Where is Bill')
21
+ #text='Where is Bill'
22
 
23
+ semantic_text = st.text_input("Enter users's history semantic (optional, i.e., 'Microsoft or President')", value="Microsoft")
24
+ #semantic_text='President'
25
 
26
+ model = st.selectbox("choose a model", ["roberta-base", "bert-base-uncased"])
27
+ #model='roberta-base'
28
  nlp = get_model(model)
29
+ #data_load_state = st.text('Loading model...')
30
+
31
+
32
 
33
  if text:
34
+ # data_load_state = st.text('Inference to model...')
35
  result = nlp(text+' '+nlp.tokenizer.mask_token)
36
+ # data_load_state.text('')
37
+ sem_list=[_.strip() for _ in semantic_text.split(',')]
38
+ if len(semantic_text):
39
+ predicted_seq=[rec['sequence'] for rec in result]
40
+ predicted_embeddings = semantic_model.encode(predicted_seq, convert_to_tensor=True)
41
+ semantic_history_embeddings = semantic_model.encode(sem_list, convert_to_tensor=True)
42
  cosine_scores = util.cos_sim(predicted_embeddings, semantic_history_embeddings)
43
 
44
  for index, r in enumerate(result):
45
  if len(semantic_text):
46
+ # for j_index in range(len(sem_list)):
47
+ if len(r['token_str'])>2: #skip special chars such as "?"
48
+ result[index]['score']+=float(sum(cosine_scores[index]))
49
  if r['token_str'].lower().strip() in history_keyword_text.lower().strip() and len(r['token_str'].lower().strip())>1:
50
  #found from history, then increase the score of tokens
51
  result[index]['score']*=HISTORY_WEIGHT
 
53
  #sort the results
54
  df=pd.DataFrame(result).sort_values(by='score', ascending=False)
55
 
56
+ # show the results as a table
57
  st.table(df)