lewtun (HF staff) committed on
Commit 9d55eb4
1 Parent(s): 47a32e6

Use text-generation inference 🔥🔥

Files changed (2)
  1. app.py +107 -41
  2. requirements.txt +6 -6
app.py CHANGED
@@ -1,12 +1,16 @@
+import json
 import os
-from threading import Thread

 import gradio as gr
-import torch
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          TextIteratorStreamer, set_seed)
+# import torch
+# from transformers import (AutoModelForCausalLM, AutoTokenizer,
+#                           TextIteratorStreamer, set_seed)
 from huggingface_hub import Repository
-import json
+from text_generation import Client
+
+# from threading import Thread
+
+

 theme = gr.themes.Monochrome(
     primary_hue="indigo",
@@ -16,27 +20,32 @@ theme = gr.themes.Monochrome(
     font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
 )
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
 if HF_TOKEN:
     repo = Repository(
         local_dir="data", clone_from="trl-lib/stack-llama-prompts", use_auth_token=HF_TOKEN, repo_type="dataset"
     )

+client = Client(
+    "https://api-inference.huggingface.co/models/trl-lib/llama-se-rl-merged",
+    headers={"Authorization": f"Bearer {HF_TOKEN}"},
+)

-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "trl-lib/llama-se-rl-merged"
-print(f"Loading model: {model_id}")
-if device == "cpu":
-    model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, use_auth_token=HF_TOKEN)
-else:
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id, device_map="auto", load_in_8bit=True, use_auth_token=HF_TOKEN
-    )
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model_id = "trl-lib/llama-se-rl-merged"
+# print(f"Loading model: {model_id}")
+# if device == "cpu":
+#     model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, use_auth_token=HF_TOKEN)
+# else:
+#     model = AutoModelForCausalLM.from_pretrained(
+#         model_id, device_map="auto", load_in_8bit=True, use_auth_token=HF_TOKEN
+#     )

-tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN)
+# tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN)

 PROMPT_TEMPLATE = """Question: {prompt}\n\nAnswer:"""

+
 def save_inputs_and_outputs(inputs, outputs, generate_kwargs):
     with open(os.path.join("data", "prompts.jsonl"), "a") as f:
         json.dump({"inputs": inputs, "outputs": outputs, "generate_kwargs": generate_kwargs}, f, ensure_ascii=False)
@@ -44,44 +53,101 @@ def save_inputs_and_outputs(inputs, outputs, generate_kwargs):
         commit_url = repo.push_to_hub()


-def generate(instruction, temperature=0.9, max_new_tokens=128, top_p=0.95, top_k=100):
-    set_seed(42)
+# def generate(instruction, temperature=0.9, max_new_tokens=128, top_p=0.95, top_k=100):
+#     set_seed(42)
+#     formatted_instruction = PROMPT_TEMPLATE.format(prompt=instruction)
+
+#     temperature = float(temperature)
+#     top_p = float(top_p)
+#     streamer = TextIteratorStreamer(tokenizer)
+#     model_inputs = tokenizer(formatted_instruction, return_tensors="pt", truncation=True, max_length=2048).to(device)
+
+#     generate_kwargs = dict(
+#         top_p=top_p,
+#         temperature=temperature,
+#         max_new_tokens=max_new_tokens,
+#         do_sample=True,
+#         top_k=top_k,
+#         eos_token_id=tokenizer.eos_token_id,
+#         pad_token_id=tokenizer.eos_token_id,
+#     )
+#     t = Thread(target=model.generate, kwargs={**dict(model_inputs, streamer=streamer), **generate_kwargs})
+#     t.start()
+
+#     output = ""
+#     hidden_output = ""
+#     for new_text in streamer:
+#         # skip streaming until new text is available
+#         if len(hidden_output) <= len(formatted_instruction):
+#             hidden_output += new_text
+#             continue
+#         # replace eos token
+#         # if tokenizer.eos_token in new_text:
+#         #     new_text = new_text.replace(tokenizer.eos_token, "")
+#         output += new_text
+#         yield output
+#     if HF_TOKEN:
+#         print("Pushing prompt and completion to the Hub")
+#         save_inputs_and_outputs(formatted_instruction, output, generate_kwargs)
+#     return output
+
+
+def generate(instruction, temperature=0.9, max_new_tokens=256, top_p=0.95, top_k=100):
+    # set_seed(42)
     formatted_instruction = PROMPT_TEMPLATE.format(prompt=instruction)

     temperature = float(temperature)
     top_p = float(top_p)
-    streamer = TextIteratorStreamer(tokenizer)
-    model_inputs = tokenizer(formatted_instruction, return_tensors="pt", truncation=True, max_length=2048).to(device)

-    generate_kwargs = dict(
-        top_p=top_p,
+    stream = client.generate_stream(
+        formatted_instruction,
         temperature=temperature,
+        truncate=999,
         max_new_tokens=max_new_tokens,
-        do_sample=True,
+        top_p=top_p,
         top_k=top_k,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.eos_token_id,
+        # stop_sequences=["</s>"],
     )
-    t = Thread(target=model.generate, kwargs={**dict(model_inputs, streamer=streamer), **generate_kwargs})
-    t.start()

     output = ""
-    hidden_output = ""
-    for new_text in streamer:
-        # skip streaming until new text is available
-        if len(hidden_output) <= len(formatted_instruction):
-            hidden_output += new_text
-            continue
-        # replace eos token
-        # if tokenizer.eos_token in new_text:
-        #     new_text = new_text.replace(tokenizer.eos_token, "")
-        output += new_text
+    for response in stream:
+        output += response.token.text
         yield output
-    if HF_TOKEN:
-        print("Pushing prompt and completion to the Hub")
-        save_inputs_and_outputs(formatted_instruction, output, generate_kwargs)
+
     return output

+    # streamer = TextIteratorStreamer(tokenizer)
+    # model_inputs = tokenizer(formatted_instruction, return_tensors="pt", truncation=True, max_length=2048).to(device)
+
+    # generate_kwargs = dict(
+    #     top_p=top_p,
+    #     temperature=temperature,
+    #     max_new_tokens=max_new_tokens,
+    #     do_sample=True,
+    #     top_k=top_k,
+    #     # eos_token_id=tokenizer.eos_token_id,
+    #     # pad_token_id=tokenizer.eos_token_id,
+    # )
+    # t = Thread(target=model.generate, kwargs={**dict(model_inputs, streamer=streamer), **generate_kwargs})
+    # t.start()
+
+    # output = ""
+    # hidden_output = ""
+    # for new_text in streamer:
+    #     # skip streaming until new text is available
+    #     if len(hidden_output) <= len(formatted_instruction):
+    #         hidden_output += new_text
+    #         continue
+    #     # replace eos token
+    #     # if tokenizer.eos_token in new_text:
+    #     #     new_text = new_text.replace(tokenizer.eos_token, "")
+    #     output += new_text
+    #     yield output
+    # if HF_TOKEN:
+    #     print("Pushing prompt and completion to the Hub")
+    #     save_inputs_and_outputs(formatted_instruction, output, generate_kwargs)
+    # return output
+

 examples = [
     "A llama is in my lawn. How do I get rid of him?",
@@ -167,4 +233,4 @@ with gr.Blocks(theme=theme, analytics_enabled=False, css=".generating {visibilit
         instruction.submit(generate, inputs=[instruction, temperature, max_new_tokens, top_p, top_k], outputs=[output])

 demo.queue(concurrency_count=1)
-demo.launch(enable_queue=True)#, share=True)
+demo.launch(enable_queue=True)  # , share=True)
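
For reference, a minimal standalone sketch of the streaming pattern the new app.py relies on. This is not part of the commit: it assumes the text-generation client package is installed and that HF_TOKEN is set in the environment, and it reuses the endpoint URL and sampling parameters from the diff above.

import os

from text_generation import Client

# Same Inference API endpoint as in app.py; HF_TOKEN is assumed to be set.
client = Client(
    "https://api-inference.huggingface.co/models/trl-lib/llama-se-rl-merged",
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
)

# Prompt formatted the same way as PROMPT_TEMPLATE in app.py.
prompt = "Question: A llama is in my lawn. How do I get rid of him?\n\nAnswer:"

# Stream tokens as they are generated and accumulate the decoded text,
# mirroring the loop inside generate() above.
output = ""
for response in client.generate_stream(
    prompt,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    top_k=100,
    truncate=999,
):
    output += response.token.text
print(output)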
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 huggingface_hub
-bitsandbytes
-sentencepiece
-git+https://github.com/huggingface/transformers.git@98268b2e76189d65f7068625cf382ebe03b98480
-accelerate>=0.16.0
-bitsandbytes
-sentencepiece
+# bitsandbytes
+# sentencepiece
+# git+https://github.com/huggingface/transformers.git@98268b2e76189d65f7068625cf382ebe03b98480
+# accelerate>=0.16.0
+# bitsandbytes
+# sentencepiece