liujch1998 committed on
Commit
106f995
1 Parent(s): cb08e07

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +4 -0
  2. constants.py +3 -3
app.py CHANGED
@@ -46,6 +46,8 @@ def prob(corpus_desc, engine_desc, query, request: gr.Request):
46
  tokenized = '' if 'tokenized' not in result else result['tokenized']
47
  if 'error' in result:
48
  prob = result['error']
 
 
49
  else:
50
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
51
  return latency, tokenized, prob
@@ -58,6 +60,8 @@ def ntd(corpus_desc, engine_desc, query, request: gr.Request):
58
  ntd = result['error']
59
  else:
60
  ntd = result['ntd']
 
 
61
  return latency, tokenized, ntd
62
 
63
  def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
 
46
  tokenized = '' if 'tokenized' not in result else result['tokenized']
47
  if 'error' in result:
48
  prob = result['error']
49
+ elif result['prompt_cnt'] == 0:
50
+ prob = '(n-1)-gram is not found in the corpus'
51
  else:
52
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
53
  return latency, tokenized, prob
 
60
  ntd = result['error']
61
  else:
62
  ntd = result['ntd']
63
+ if ntd == {}:
64
+ ntd = '(n-1)-gram is not found in the corpus'
65
  return latency, tokenized, ntd
66
 
67
  def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
constants.py CHANGED
@@ -2,10 +2,10 @@ import os
2
 
3
  # options
4
  CORPUS_BY_DESC = {
5
- 'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
6
  'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
7
- 'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
8
- 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval_gpt2',
9
  'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
10
  }
11
  CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
 
2
 
3
  # options
4
  CORPUS_BY_DESC = {
5
+ 'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
6
  'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
7
+ 'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
8
+ 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
9
  'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
10
  }
11
  CORPUS_DESCS = list(CORPUS_BY_DESC.keys())