jijivski committed on
Commit
3fe3e10
1 Parent(s): 0bf42ca

okay on local phi-2

__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,25 +1,28 @@
 import gradio as gr
 import os
 from transformers import AutoTokenizer
-from .get_loss.get_loss_hf import run_get_loss
-
+from get_loss.get_loss_hf import run_get_loss
+import pdb
+from types import SimpleNamespace
 # os.system('git clone https://github.com/EleutherAI/lm-evaluation-harness')
 # os.system('cd lm-evaluation-harness')
 # os.system('pip install -e .')
+# -i https://pypi.tuna.tsinghua.edu.cn/simple
 # Feature 1: color and display the text based on the input text and its corresponding loss values

-def color_text(text_list=["hi", "FreshEval"], loss_list=[0.1,0.7]):
+def color_text(text_list=["hi", "FreshEval","!"], loss_list=[0.1,0.7]):
     """
     Color the text according to its loss values.
     """
     highlighted_text = []
+    loss_list=[0]+loss_list
     for text, loss in zip(text_list, loss_list):
         # color = "#FF0000" if float(loss) > 0.5 else "#00FF00"
-        color=loss
+        color=loss/25
         # highlighted_text.append({"text": text, "bg_color": color})
         highlighted_text.append((text, color))

-    print(highlighted_text)
+    print('highlighted_text',highlighted_text)
     return highlighted_text

 # Feature 2: convert an ID list back to text with the tokenizer and display it
@@ -27,32 +30,43 @@ def get_text(ids_list=[0.1,0.7], tokenizer=None):
     """
     Given a list of IDs and a tokenizer name, convert the IDs into text.
     """
-    return ['Hi', 'Adam']
-    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    # text = tokenizer.decode(eval(ids_list), skip_special_tokens=True)
+    # return ['Hi', 'Adam']
+    # tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+    print('ids_list',ids_list)
+    # pdb.set_trace()
+    text=[]
+    for id in ids_list:
+        text.append( tokenizer.decode(id, skip_special_tokens=True))
     # This simply returns the text; colors or other styles can be added as needed
-    # return text
+    print(f'L41:{text}')
+    return text


-def get_ids_loss(text, tokenizer, model):
-    """
-    Given a text, a model, and its tokenizer, return the corresponding IDs and loss values.
-    """
-    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    # model = AutoModelForCausalLM.from_pretrained(model_name)
-    # This simply returns IDs and losses; colors or other styles can be added as needed
-    return [1, 2], [0.1, 0.7]
+# def get_ids_loss(text, tokenizer, model):
+#     """
+#     Given a text, a model, and its tokenizer, return the corresponding IDs and loss values.
+#     """
+#     # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+#     # model = AutoModelForCausalLM.from_pretrained(model_name)
+#     # This simply returns IDs and losses; colors or other styles can be added as needed
+#     return [1, 2], [0.1, 0.7]


-def color_pipeline(text=["hi", "FreshEval"], model=None):
+def color_pipeline(texts=["Hi","FreshEval","!"], model=None):
     """
     Given a text, return the corresponding colored text.
     """
-    # rtn_dic=run_get_loss()
+    print('text,model',texts,model)
+    args=SimpleNamespace(texts=texts,model=model)
+    print(f'L60,text:{texts}')
+    rtn_dic=run_get_loss(args)
+    # print(rtn_dic)
+    # pdb.set_trace()
     # {'logit':logit,'input_ids':input_chunk,'tokenizer':tokenizer,'neg_log_prob_temp':neg_log_prob_temp}
-    tokenizer=None # get tokenizer
-    ids, loss = get_ids_loss(text, tokenizer, model)
+    ids, loss =rtn_dic['input_ids'],rtn_dic['loss']#= get_ids_loss(text, tokenizer, model)
+    tokenizer=rtn_dic['tokenizer'] # get tokenizer
     text = get_text(ids, tokenizer)
+    # print('ids, loss ,text',ids, loss ,text)
     return color_text(text, loss)


@@ -67,20 +81,25 @@ with gr.Blocks() as demo:

     # loss_input = gr.Number(label="loss")
     model_input = gr.Textbox(label="model name", placeholder="input your model name here... now I am trying phi-2...")
+    output_box=gr.HighlightedText(label="colored text")
+    # gr.Examples(
+    #     [
+    #         # ["Hi FreshEval !", "microsoft/phi-2"],
+    #         ["Hello FreshBench !", "/home/sribd/chenghao/models/phi-2"],
+    #     ],
+    #     [text_input, model_input],
+    #     cache_examples=True,
+    #     # cache_examples=False,
+    #     fn=color_pipeline,
+    #     outputs=output_box
+    # )
     # TODO select models that can be used online
     # TODO maybe add our own models


     color_text_output = gr.HTML(label="colored text")
-    # gr.Markdown("## Text Examples")
-    # gr.Examples(
-    #     [["hi", "Adam"], [0.1,0.7]],
-    #     [text_input, loss_input],
-    #     cache_examples=True,
-    #     fn=color_text,
-    #     outputs=color_text_output
-    # )
-    color_text_button = gr.Button("color the text").click(color_pipeline, inputs=[text_input, model_input], outputs=gr.HighlightedText(label="colored text"))
+
+    color_text_button = gr.Button("color the text").click(color_pipeline, inputs=[text_input, model_input], outputs=output_box)


     date_time_input = gr.Textbox(label="the date when the text is generated")#TODO add date time input
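
Taken together, the new `app.py` path is: text → `run_get_loss` → decode ids → scale losses → `HighlightedText`. A minimal sketch of that flow, assuming the `{'input_ids', 'loss', 'tokenizer'}` return dict shown in the diff (the helper name `color_pipeline_sketch` is ours, not part of the commit):

```python
from types import SimpleNamespace
from get_loss.get_loss_hf import run_get_loss

def color_pipeline_sketch(texts, model_name):
    # run the model once and pull out ids, per-token losses, and the tokenizer
    rtn_dic = run_get_loss(SimpleNamespace(texts=texts, model=model_name))
    tokenizer = rtn_dic['tokenizer']
    tokens = [tokenizer.decode(i, skip_special_tokens=True)
              for i in rtn_dic['input_ids']]
    # the first token has no next-token loss, hence the prepended 0;
    # /25 squashes losses into a range HighlightedText renders sensibly
    losses = [0] + list(rtn_dic['loss'])
    return [(tok, loss / 25) for tok, loss in zip(tokens, losses)]

# The returned (token, score) pairs can be passed directly to the
# gr.HighlightedText component wired to the "color the text" button.
```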
get_loss/__pycache__/get_loss_hf.cpython-310.pyc ADDED
Binary file (3.76 kB).
 
get_loss/get_loss.py CHANGED
@@ -257,7 +257,7 @@ def run_get_loss(args):
 from types import SimpleNamespace

 if __name__ == '__main__':
-    args=SimpleNamespace(model='microsft/phi-2',model_type='hf',data='data.json',log_path='./logs/',model_cache=None,chunk_size=1024)
+    args=SimpleNamespace(model='microsoft/phi-2',texts=['Hello FreshBench !'],model_type='hf',data='data.json',model_cache=None,chunk_size=1024)


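
The `__main__` smoke test now mirrors how the app calls into the loss code. A sketch of the equivalent programmatic call, assuming `run_get_loss` fills in the defaults added in `get_loss_hf.py` below:

```python
from types import SimpleNamespace
from get_loss.get_loss_hf import run_get_loss

# SimpleNamespace stands in for an argparse.Namespace, so the same
# run_get_loss entry point serves both this CLI-style test and the Gradio app
args = SimpleNamespace(model='microsoft/phi-2',
                       texts=['Hello FreshBench !'],
                       model_type='hf',
                       model_cache=None,
                       chunk_size=1024)
result = run_get_loss(args)
print(result['loss'])  # per-token losses, now a numpy array (see get_loss_hf.py)
```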
 
get_loss/get_loss_hf.py CHANGED
@@ -10,13 +10,15 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from datetime import datetime
 import argparse
 from types import SimpleNamespace
-
+import pdb
 # import mamba_ssm
 # import rwkv


 # RWKV4_TOKENIZER_FILE = "./support/20B_tokenizer.json"
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# device = 'cuda' if torch.cuda.is_available() else 'cpu'
+device = 'cpu'
+

 def load_list_from_json(file_path):
     """
@@ -36,6 +38,7 @@ def calculate_loss(logits, target_token_ids):
     # log_probs = F.log_softmax(shifted_logits, dim=-1)
     loss = torch.nn.functional.cross_entropy(logits[:-1, :].view(-1, logits.shape[-1]),
                                              target_token_ids[1:].view(-1), reduction='none')
+    # pdb.set_trace()


     # target_log_probs = -log_probs.gather(1, shifted_targets.unsqueeze(1)).squeeze()
@@ -44,7 +47,7 @@ def calculate_loss(logits, target_token_ids):
     # log_sum = torch.sum(target_log_probs, dim=-1)
     # print(perplexity_sum)

-    return loss.item()
+    return loss.cpu().numpy()


 def calculate_log_sum(logits, target_token_ids):
@@ -122,6 +125,7 @@ def print_model_parameters_in_billions(model):
 def load_hf_model(path, cache_path):
     hf_tokenizer = AutoTokenizer.from_pretrained(path)
     if cache_path is not None:
+        # pdb.set_trace()
         hf_model = AutoModelForCausalLM.from_pretrained(path,
                                                         device_map=device,
                                                         trust_remote_code=True,
@@ -212,7 +216,6 @@ def eval_hf_model(model, tokenizer, texts, chunk_size):
         neg_log_prob_temp += log_sum

         loss = calculate_loss(logit, input_chunk.squeeze(0))
-        neg_log_prob_temp += log_sum

         # token_length_list.append(seq_length)
         # data.append(neg_log_prob_temp)
@@ -224,8 +227,8 @@ def eval_hf_model(model, tokenizer, texts, chunk_size):

     # print(f'log probability sum: {sum(data) / len(data):.2f}')
     # print(f'avg tokens: {sum(token_length_list) / len(token_length_list):.0f}')
-    rtn_dic={'logit':logit,'input_ids':input_chunk,'loss':loss,'tokenizer':tokenizer,'neg_log_prob_temp':neg_log_prob_temp}
-    return rtn_dic
+    rtn_dic={'logit':logit.cpu().numpy(),'input_ids':input_chunk.cpu().numpy()[0],'loss':loss,'tokenizer':tokenizer,'neg_log_prob_temp':neg_log_prob_temp}
+    return rtn_dic


 # if __name__ == '__main__':
@@ -239,14 +242,28 @@ def eval_hf_model(model, tokenizer, texts, chunk_size):
 #     parser.add_argument('--chunk_size', type=int, default=1024, help='chunk size')


-def run_get_loss(args):
+def run_get_loss(args=None):
     if args is None:
-        args=SimpleNamespace(model='microsoft/phi-2',texts='Hello FreshBench !',model_type='hf',model_cache=None,chunk_size=1024)
+        # args=SimpleNamespace(model='microsoft/phi-2',texts='Hello FreshBench !',model_type='hf',model_cache=None,chunk_size=1024)
+        args=SimpleNamespace(model='/home/sribd/chenghao/models/phi-2',texts='Hello FreshBench !',model_type='hf',model_cache=None,chunk_size=1024)
+
+
+    if 'chunk_size' not in args.__dict__:
+        args.chunk_size=1024
+    if 'model_type' not in args.__dict__:
+        args.model_type='hf'
+    if 'model' not in args.__dict__ or len(args.model)<2:
+        # args.model='/home/sribd/chenghao/models/phi-2'
+        args.model='microsoft/phi-2'
+
+    if 'model_cache' not in args.__dict__:
+        args.model_cache=args.model

     # args = parser.parse_args()

     # load data
     # texts = load_list_from_json(args.data)
+    print('args',args)
     texts=args.texts
     print(f'data size: {len(texts)}')

@@ -264,6 +281,7 @@ def run_get_loss(args):

     # eval
     if args.model_type in ['hf', 'mamba']:
+        print(f'eval hf')
         return eval_hf_model(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size)
     # elif args.model_type == 'rwkv':
     #     return eval_rwkv(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size)
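
`calculate_loss` now returns the per-token loss vector (as numpy) rather than a scalar. A self-contained check of the shift logic on toy tensors; shapes are illustrative only:

```python
import torch

def calculate_loss(logits, target_token_ids):
    # position i scores the prediction of token i+1 from the logits at i
    return torch.nn.functional.cross_entropy(
        logits[:-1, :].view(-1, logits.shape[-1]),
        target_token_ids[1:].view(-1),
        reduction='none').cpu().numpy()

logits = torch.randn(5, 10)        # (seq_len, vocab_size)
ids = torch.randint(0, 10, (5,))   # token ids for the same sequence
per_token = calculate_loss(logits, ids)
assert per_token.shape == (4,)     # one loss per predicted next token
```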
gradio_cached_examples/186/log.csv ADDED
@@ -0,0 +1,3 @@
+colored text,flag,username,timestamp
+"[{""token"":""Hi"",""class_or_confidence"":13.59826946258545},{""token"":""Adam"",""class_or_confidence"":14.804081916809082}]",,,2024-03-14 14:05:40.149274
+"[{""token"":""Hi"",""class_or_confidence"":13.59826946258545},{""token"":""Adam"",""class_or_confidence"":14.804081916809082}]",,,2024-03-14 14:05:42.364248
gradio_cached_examples/212/log.csv ADDED
@@ -0,0 +1,3 @@
+colored text,flag,username,timestamp
+"[{""token"":""Hi"",""class_or_confidence"":13.59826946258545},{""token"":""Adam"",""class_or_confidence"":14.804081916809082}]",,,2024-03-14 14:05:44.632048
+"[{""token"":""Hi"",""class_or_confidence"":13.59826946258545},{""token"":""Adam"",""class_or_confidence"":14.804081916809082}]",,,2024-03-14 14:05:46.813954