liujch1998 committed on
Commit
2195005
1 Parent(s): aa7da7f

Sync changes

Browse files
Files changed (1) hide show
  1. app.py +15 -19
app.py CHANGED
@@ -40,23 +40,19 @@ def format_tokenization_info(result):
40
  return ''
41
  token_ids = result['token_ids']
42
  tokens = result['tokens']
43
- t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
44
- return t
45
- def format_tokenization_info_nested(result):
46
- if not ('token_idsss' in result and 'tokensss' in result):
47
- return ''
48
- token_idsss = result['token_idsss']
49
- tokensss = result['tokensss']
50
- ttt = []
51
- for token_idss, tokenss in zip(token_idsss, tokensss):
52
- tt = []
53
- for token_ids, tokens in zip(token_idss, tokenss):
54
- t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
55
- tt.append(t)
56
- tt = '\n'.join(tt)
57
- ttt.append(tt)
58
- ttt = '\n\n'.join(ttt)
59
- return ttt
60
  def format_doc(doc):
61
  formatted = []
62
  if doc['doc_len'] == doc['disp_len']:
@@ -134,7 +130,7 @@ def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
134
  def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
135
  result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
136
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
137
- tokenization_info = format_tokenization_info_nested(result)
138
  if 'error' in result:
139
  message = result['error']
140
  docs = [[] for _ in range(10)]
@@ -157,7 +153,7 @@ with gr.Blocks() as demo:
157
  '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
158
 
159
  <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
160
- <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>.</p>
161
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
162
  <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
163
  '''
 
40
  return ''
41
  token_ids = result['token_ids']
42
  tokens = result['tokens']
43
+ if type(token_ids) == list and all([type(token_id) == int for token_id in token_ids]):
44
+ output = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
45
+ else:
46
+ ttt = []
47
+ for token_idss, tokenss in zip(token_ids, tokens):
48
+ tt = []
49
+ for token_ids, tokens in zip(token_idss, tokenss):
50
+ t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
51
+ tt.append(t)
52
+ tt = '\n'.join(tt)
53
+ ttt.append(tt)
54
+ output = '\n\n'.join(ttt)
55
+ return output
 
 
 
 
56
  def format_doc(doc):
57
  formatted = []
58
  if doc['doc_len'] == doc['disp_len']:
 
130
  def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
131
  result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
132
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
133
+ tokenization_info = format_tokenization_info(result)
134
  if 'error' in result:
135
  message = result['error']
136
  docs = [[] for _ in range(10)]
 
153
  '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
154
 
155
  <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
156
+ <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
157
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
158
  <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
159
  '''