AIO_Chat

Sleeping

eswardivi committed on Apr 19

Commit

ea9c0d3

•

1 Parent(s): 8ea3940

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,6 +21,10 @@ model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Meta-Llama-3-8B-Instruct", quantization_config=quantization_config, token=token
 )
 tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)
 if torch.cuda.is_available():
     device = torch.device("cuda")
@@ -34,7 +38,7 @@ else:
 @spaces.GPU(duration=150)
-def chat(message, history, temperature,do_sample, top_k, max_tokens):
     start_time = time.time()
     chat = []
     for item in history:
@@ -52,9 +56,13 @@ def chat(message, history, temperature,do_sample, top_k, max_tokens):
         streamer=streamer,
         max_new_tokens=max_tokens,
         do_sample=True,
-        top_k=top_k,
         temperature=temperature,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
@@ -86,14 +94,11 @@ demo = gr.ChatInterface(
             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
         ),
         gr.Checkbox(label="Sampling",value=True),
-        gr.Slider(
-            minimum=1, maximum=10000, step=5, value=1000, label="top_k", render=False
-        ),
         gr.Slider(
             minimum=128,
             maximum=4096,
             step=1,
-            value=1024,
             label="Max new tokens",
             render=False,
         ),

     "meta-llama/Meta-Llama-3-8B-Instruct", quantization_config=quantization_config, token=token
 )
 tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
 if torch.cuda.is_available():
     device = torch.device("cuda")
 @spaces.GPU(duration=150)
+def chat(message, history, temperature,do_sample, max_tokens):
     start_time = time.time()
     chat = []
     for item in history:
         streamer=streamer,
         max_new_tokens=max_tokens,
         do_sample=True,
         temperature=temperature,
+        eos_token_id=terminators,
     )
+    if temperature == 0:
+        generate_kwargs['do_sample'] = False
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
         ),
         gr.Checkbox(label="Sampling",value=True),
         gr.Slider(
             minimum=128,
             maximum=4096,
             step=1,
+            value=512,
             label="Max new tokens",
             render=False,
         ),