pseudotensor committed
Commit 83d9f95
1 Parent(s): 8910711

Update with h2oGPT hash d2fec0293c2259c210f6d808282cb70b2466130b

Files changed (1)
  1. app.py +54 -37
app.py CHANGED
@@ -34,6 +34,7 @@ admin_pass = os.getenv("ADMIN_PASS")
 # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
 raise_generate_gpu_exceptions = True
 
+eval_extra_columns = ['prompt', 'response', 'score']
 
 def main(
         load_8bit: bool = False,
@@ -144,12 +145,12 @@ def main(
     if not gradio:
         if eval_sharegpt_prompts_only > 0:
             # override default examples with shareGPT ones for human-level eval purposes only
-            filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
-            if not os.path.isfile(filename):
+            eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
+            if not os.path.isfile(eval_filename):
                 os.system(
-                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
+                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
             import json
-            data = json.load(open(filename, 'rt'))
+            data = json.load(open(eval_filename, 'rt'))
             # focus on data that starts with human, else likely chopped from other data
             turn_start = 0 # odd in general
             data = [x for x in data if len(x['conversations']) > turn_start + 1 and
@@ -165,12 +166,29 @@ def main(
                 assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
                 output = data[i]['conversations'][turn_start + 1]['value']
                 examplenew = example1.copy()
-                examplenew[0] = instruction
-                examplenew[1] = '' # no input
-                examplenew[2] = '' # no context
+                assert not chat, "No gradio must use chat=False, uses nochat isntruct"
+                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
+                examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
+                examplenew[eval_func_param_names.index('context')] = '' # no context
                 examples.append(examplenew)
                 responses.append(output)
 
+        num_examples = len(examples)
+        scoring_path = 'scoring'
+        os.makedirs(scoring_path, exist_ok=True)
+        if eval_sharegpt_as_output:
+            used_base_model = 'gpt35'
+            used_lora_weights = ''
+        else:
+            used_base_model = str(base_model.split('/')[-1])
+            used_lora_weights = str(lora_weights.split('/')[-1])
+        eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
+                                                                 eval_sharegpt_prompts_only_seed,
+                                                                 eval_sharegpt_as_output,
+                                                                 used_base_model,
+                                                                 used_lora_weights)
+        eval_filename = os.path.join(scoring_path, eval_filename)
+
         with torch.device("cuda"):
             # ensure was set right above before examples generated
             assert not stream_output, "stream_output=True does not make sense with example loop"
@@ -183,7 +201,7 @@ def main(
             if not eval_sharegpt_as_output:
                 model, tokenizer, device = get_model(**locals())
                 model_state = [model, tokenizer, device, base_model]
-                fun = partial(evaluate, model_state, debug=debug, chat=chat, save_dir=save_dir)
+                fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
             else:
                 assert eval_sharegpt_prompts_only > 0
 
@@ -194,15 +212,17 @@ def main(
                 fun = get_response
             t0 = time.time()
             score_dump = []
-            num_examples = len(examples)
 
            import matplotlib.pyplot as plt
 
            for exi, ex in enumerate(examples):
+                instruction = ex[eval_func_param_names.index('instruction_nochat')]
+                iinput = ex[eval_func_param_names.index('iinput_nochat')]
+                context = ex[eval_func_param_names.index('context')]
                clear_torch_cache()
                print("")
                print("START" + "=" * 100)
-                print("Question: %s %s" % (ex[0], ('input=%s' % ex[1] if ex[1] else '')))
+                print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
                print("-" * 105)
                # fun yields as generator, so have to iterate over it
                # Also means likely do NOT want --stream_output=True, else would show all generations
@@ -211,14 +231,14 @@ def main(
                if smodel:
                    score_with_prompt = False
                    if score_with_prompt:
-                        data_point = dict(instruction=ex[0], input=ex[1])
+                        data_point = dict(instruction=instruction, input=iinput, context=context)
                        prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
                        prompt = prompter.generate_prompt(data_point)
                    else:
                        # just raw input and output
-                        assert ex[1] in [None, ''] # should be no iinput
-                        assert ex[2] in [None, ''] # should be no context
-                        prompt = ex[0]
+                        assert iinput in [None, ''] # should be no iinput
+                        assert context in [None, ''] # should be no context
+                        prompt = instruction
                    cutoff_len = 768 if is_low_mem else 2048
                    inputs = stokenizer(prompt, res,
                                        return_tensors="pt",
@@ -246,30 +266,16 @@ def main(
                    print("SCORE %s: %s" % (exi, score), flush=True)
                    score_dump.append(ex + [prompt, res, score])
                    # dump every score in case abort
-                    scoring_path = 'scoring'
-                    os.makedirs(scoring_path, exist_ok=True)
-                    if eval_sharegpt_as_output:
-                        used_base_model = 'gpt35'
-                        used_lora_weights = ''
-                    else:
-                        used_base_model = str(base_model.split('/')[-1])
-                        used_lora_weights = str(lora_weights.split('/')[-1])
                    df_scores = pd.DataFrame(score_dump,
-                                             columns=eval_func_param_names + ['prompt', 'response', 'score'])
-                    filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
-                                                                        eval_sharegpt_prompts_only_seed,
-                                                                        eval_sharegpt_as_output,
-                                                                        used_base_model,
-                                                                        used_lora_weights)
-                    filename = os.path.join(scoring_path, filename)
-                    df_scores.to_parquet(filename, index=False)
+                                             columns=eval_func_param_names + eval_extra_columns)
+                    df_scores.to_parquet(eval_filename, index=False)
                    # plot histogram so far
                    plt.figure(figsize=(10, 10))
                    plt.hist(df_scores['score'], bins=20)
                    score_avg = np.mean(df_scores['score'])
                    score_median = np.median(df_scores['score'])
                    plt.title("Score avg: %s median: %s" % (score_avg, score_median))
-                    plt.savefig(filename.replace('.parquet', '.png'))
+                    plt.savefig(eval_filename.replace('.parquet', '.png'))
                    plt.close()
 
                print("END" + "=" * 102)
@@ -278,7 +284,8 @@ def main(
                print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
            t1 = time.time()
            print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
-        return
+        return eval_filename
+
    if gradio:
        go_gradio(**locals())
 
@@ -774,7 +781,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                        visible=not is_public and False)
                do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
                                        value=kwargs['do_sample'])
-               temperature = gr.Slider(minimum=0, maximum=3,
+               temperature = gr.Slider(minimum=0.01, maximum=3,
                                        value=kwargs['temperature'],
                                        label="Temperature",
                                        info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
@@ -984,6 +991,11 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
        instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
        question = args_list[instruction_nochat_arg_id]
 
+        if question is None:
+            return 'Response Score: Bad Question'
+        if answer is None:
+            return 'Response Score: Bad Answer'
+
        question = question[-cutoff_len:]
        answer = answer[-cutoff_len:]
 
@@ -1307,10 +1319,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                    outputs=[model_state, model_used, lora_used, prompt_type])
        prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
        chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
+        nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
        if not is_public:
            load_model_event = load_model_button.click(**load_model_args) \
                .then(**prompt_update_args) \
                .then(**chatbot_update_args) \
+                .then(**nochat_update_args) \
                .then(clear_torch_cache)
 
        load_model_args2 = dict(fn=load_model,
@@ -1735,6 +1749,7 @@ def get_generate_params(model_lower, chat,
    if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
        prompt_type = inv_prompt_type_to_model_lower[model_lower]
 
+    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
    if show_examples is None:
        if chat:
            show_examples = False
@@ -1831,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
    repetition_penalty = repetition_penalty or 1.07
    num_return_sequences = min(num_beams, num_return_sequences or 1)
    do_sample = False if do_sample is None else do_sample
+    # doesn't include chat, instruction_nochat, iinput_nochat, added later
    params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
 
@@ -1874,10 +1890,11 @@ y = np.random.randint(0, 1, 100)
    src_lang = "English"
    tgt_lang = "Russian"
 
-    # adjust examples if non-chat mode
-    if not chat:
-        # move to correct position
-        for example in examples:
+    # move to correct position
+    for example in examples:
+        example += [chat, '', '']
+        # adjust examples if non-chat mode
+        if not chat:
            example[eval_func_param_names.index('instruction_nochat')] = example[
                eval_func_param_names.index('instruction')]
            example[eval_func_param_names.index('instruction')] = ''
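
For reference, a minimal sketch (not part of the commit) of how the scoring output written by the updated non-gradio eval loop could be inspected afterwards. It assumes the df_scores_*.parquet files land under scoring/ as in the diff above, that the 'score' column from eval_extra_columns is present, and that pandas/numpy are available as in app.py; the glob pattern and summary formatting are illustrative only.

# Hypothetical helper, not part of the commit: read back each df_scores_*.parquet
# produced by the eval loop and print the same average/median that the
# histogram title reports.
import glob

import numpy as np
import pandas as pd

for path in sorted(glob.glob('scoring/df_scores_*.parquet')):
    df_scores = pd.read_parquet(path)
    print("%s: n=%d avg=%.3f median=%.3f" % (
        path, len(df_scores), np.mean(df_scores['score']), np.median(df_scores['score'])))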