liujch1998 committed on
Commit
619c9ac
1 Parent(s): 2e63f1e

Adapt to API updates

Browse files
Files changed (1) hide show
  1. app.py +54 -15
app.py CHANGED
@@ -30,78 +30,117 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
30
  print(result)
31
  return result
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def count(corpus_desc, engine_desc, query, request: gr.Request):
34
  result = process('count', corpus_desc, engine_desc, query, None, request)
35
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
36
- tokenized = '' if 'tokenized' not in result else result['tokenized']
37
  if 'error' in result:
38
  count = result['error']
39
  else:
40
  count = f'{result["count"]:,}'
41
- return latency, tokenized, count
42
 
43
  def prob(corpus_desc, engine_desc, query, request: gr.Request):
44
  result = process('prob', corpus_desc, engine_desc, query, None, request)
45
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
46
- tokenized = '' if 'tokenized' not in result else result['tokenized']
47
  if 'error' in result:
48
  prob = result['error']
49
  elif result['prompt_cnt'] == 0:
50
  prob = '(n-1)-gram is not found in the corpus'
51
  else:
52
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
53
- return latency, tokenized, prob
54
 
55
  def ntd(corpus_desc, engine_desc, query, request: gr.Request):
56
  result = process('ntd', corpus_desc, engine_desc, query, None, request)
57
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
58
- tokenized = '' if 'tokenized' not in result else result['tokenized']
59
  if 'error' in result:
60
  ntd = result['error']
61
  else:
62
- ntd = result['ntd']
 
 
 
63
  if ntd == {}:
64
  ntd = '(n-1)-gram is not found in the corpus'
65
- return latency, tokenized, ntd
66
 
67
  def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
68
  result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
69
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
70
- tokenized = '' if 'tokenized' not in result else result['tokenized']
71
  if 'error' in result:
72
  longest_suffix = ''
73
  prob = result['error']
74
  else:
75
  longest_suffix = result['longest_suffix']
76
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
77
- return latency, tokenized, longest_suffix, prob
78
 
79
  def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
80
  result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
81
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
82
- tokenized = '' if 'tokenized' not in result else result['tokenized']
83
  if 'error' in result:
84
  longest_suffix = ''
85
  ntd = result['error']
86
  else:
87
  longest_suffix = result['longest_suffix']
88
- ntd = result['ntd']
89
- return latency, tokenized, longest_suffix, ntd
 
 
 
90
 
91
  def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
92
  result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
93
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
94
- tokenized = '' if 'tokenized' not in result else result['tokenized']
95
  if 'error' in result:
96
  message = result['error']
97
  docs = [[] for _ in range(10)]
98
  else:
99
  message = result['message']
100
- docs = result['docs']
 
101
  docs = docs[:maxnum]
102
  while len(docs) < 10:
103
  docs.append([])
104
- return latency, tokenized, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9]
105
 
106
  def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
107
  result = process('analyze_document', corpus_desc, engine_desc, query, None, request)
 
30
  print(result)
31
  return result
32
 
33
def format_tokenization_info(result):
    """Render a flat tokenization as `["tok" "tok" ...] [id, id, ...]`.

    Expects `result` to carry parallel `tokens` / `token_ids` lists; returns
    an empty string when either key is absent. The 'Ġ' marker (BPE leading
    space, presumably GPT-2-style vocab — verify against the API) is shown
    as a literal space.
    """
    if 'token_ids' not in result or 'tokens' not in result:
        return ''
    rendered_tokens = " ".join('"' + tok.replace('Ġ', ' ') + '"' for tok in result['tokens'])
    return '[' + rendered_tokens + '] ' + str(result['token_ids'])
40
def format_tokenization_info_nested(result):
    """Render a doubly-nested tokenization (`tokensss` / `token_idsss`).

    Inner lists are formatted one per line; outer groups are separated by a
    blank line. Returns '' when either nested key is missing. Each leaf line
    has the same `["tok" ...] [id, ...]` shape as format_tokenization_info,
    with 'Ġ' shown as a space.
    """
    if 'token_idsss' not in result or 'tokensss' not in result:
        return ''
    groups = []
    for idss, tokenss in zip(result['token_idsss'], result['tokensss']):
        lines = [
            '[' + " ".join('"' + tok.replace('Ġ', ' ') + '"' for tok in tokens) + '] ' + str(ids)
            for ids, tokens in zip(idss, tokenss)
        ]
        groups.append('\n'.join(lines))
    return '\n\n'.join(groups)
55
def format_doc(doc):
    """Turn one API document into a span list: header tuple first, then spans.

    The header notes the document index and token length, adding a
    "(N tokens displayed)" suffix only when the document was truncated
    (doc_len != disp_len). Output shape is (text, label) tuples — presumably
    consumed by a gr.HighlightedText component; verify against the UI wiring.
    """
    doc_len, disp_len = doc['doc_len'], doc['disp_len']
    truncation_note = '' if doc_len == disp_len else f' ({disp_len} tokens displayed)'
    header = f'[Document #{doc["doc_ix"]}, length = {doc_len} tokens{truncation_note}]\n\n'
    return [(header, None)] + list(doc['spans'])
64
+
65
def count(corpus_desc, engine_desc, query, request: gr.Request):
    """Handle a count query; returns (latency, tokenization_info, count text).

    On API error the error string takes the count slot; otherwise the count
    is comma-grouped for display.
    """
    result = process('count', corpus_desc, engine_desc, query, None, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    if 'error' in result:
        count_text = result['error']
    else:
        count_text = f'{result["count"]:,}'
    return latency, tokenization_info, count_text
74
 
75
def prob(corpus_desc, engine_desc, query, request: gr.Request):
    """Handle an n-gram probability query; returns (latency, tokenization_info, prob text).

    Three display cases: API error → error string; prompt count of zero →
    "(n-1)-gram is not found in the corpus"; otherwise the probability with
    its cont/prompt count fraction.
    """
    result = process('prob', corpus_desc, engine_desc, query, None, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    if 'error' in result:
        prob_text = result['error']
    elif result['prompt_cnt'] == 0:
        prob_text = '(n-1)-gram is not found in the corpus'
    else:
        prob_text = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
    return latency, tokenization_info, prob_text
86
 
87
def ntd(corpus_desc, engine_desc, query, request: gr.Request):
    """Handle a next-token-distribution query.

    Returns (latency, tokenization_info, ntd) where ntd maps a
    "token (cont_cnt / prompt_cnt)" label to its probability — presumably fed
    to a gr.Label; verify against the UI wiring. An empty distribution is
    replaced by a not-found message; an API error string takes the ntd slot.
    """
    result = process('ntd', corpus_desc, engine_desc, query, None, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    if 'error' in result:
        ntd_out = result['error']
    else:
        ntd_out = {
            f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})': r['prob']
            for r in result['result_by_token_id'].values()
        }
        if ntd_out == {}:
            ntd_out = '(n-1)-gram is not found in the corpus'
    return latency, tokenization_info, ntd_out
101
 
102
def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
    """Handle an infini-gram probability query.

    Returns (latency, tokenization_info, longest_suffix, prob text). On API
    error the suffix is blanked and the error string takes the prob slot.
    """
    result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    if 'error' in result:
        longest_suffix, prob_text = '', result['error']
    else:
        longest_suffix = result['longest_suffix']
        prob_text = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
    return latency, tokenization_info, longest_suffix, prob_text
113
 
114
def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
    """Handle an infini-gram next-token-distribution query.

    Returns (latency, tokenization_info, longest_suffix, ntd) where ntd maps
    a "token (cont_cnt / prompt_cnt)" label to its probability. On API error
    the suffix is blanked and the error string takes the ntd slot.
    """
    result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    if 'error' in result:
        longest_suffix = ''
        ntd_out = result['error']
    else:
        longest_suffix = result['longest_suffix']
        ntd_out = {
            f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})': r['prob']
            for r in result['result_by_token_id'].values()
        }
    return latency, tokenization_info, longest_suffix, ntd_out
128
 
129
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
    """Handle a document-search query.

    Returns (latency, tokenization_info, message, doc_0, ..., doc_9). The
    result list is capped at `maxnum` and padded with empty span lists so the
    ten document outputs in the UI always receive a value.
    """
    result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info_nested(result)
    if 'error' in result:
        message = result['error']
        docs = [[] for _ in range(10)]
    else:
        message = result['message']
        docs = [format_doc(doc) for doc in result['documents']]
    docs = docs[:maxnum]
    docs += [[] for _ in range(10 - len(docs))]  # pad back up to the 10 UI slots
    return (latency, tokenization_info, message,
            docs[0], docs[1], docs[2], docs[3], docs[4],
            docs[5], docs[6], docs[7], docs[8], docs[9])
144
 
145
  def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
146
  result = process('analyze_document', corpus_desc, engine_desc, query, None, request)