rphrp1985 committed on
Commit ab6fbd7
1 Parent(s): 23debf4

Update app.py

Files changed (1)
  1. app.py +22 -44
app.py CHANGED
@@ -19,33 +19,18 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # model_id = "mistralai/Mistral-7B-v0.3"
 
-# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+model_id = "CohereForAI/c4ai-command-r-plus-4bit"
 
-from airllm import AirLLMLlama2
 
-MAX_LENGTH = 128
-from huggingface_hub import hf_hub_download
-from huggingface_hub import snapshot_download
 
 
-snapshot_download(
-    repo_id="CohereForAI/c4ai-command-r-plus-4bit",
-    # filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
-    local_dir = "./models",
-    token= token
-    )
-
-# could use hugging face model repo id:
-model = AirLLMLlama2("./models", )
-
+tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
 
-# tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
-
-# model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
-# # attn_implementation="flash_attention_2",
-# # low_cpu_mem_usage=True,
-# device_map="auto"
-# )
+model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
+    # attn_implementation="flash_attention_2",
+    # low_cpu_mem_usage=True,
+    device_map="auto"
+)
 
 
 
@@ -58,28 +43,21 @@ def respond(
     temperature,
     top_p,
 ):
-    input_text = [
-        'What is the capital of United States?',
-    ]
-
-    input_tokens = model.tokenizer(input_text,
-        return_tensors="pt",
-        return_attention_mask=False,
-        truncation=True,
-        max_length=MAX_LENGTH,
-        padding=True)
-
-    generation_output = model.generate(
-        input_tokens['input_ids'].cuda(),
-        max_new_tokens=20,
-        use_cache=True,
-        return_dict_in_generate=True)
-
-    output = model.tokenizer.decode(generation_output.sequences[0])
-
-    print(output)
-    yield output
-
+    messages = [{"role": "user", "content": "Hello, how are you?"}]
+    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+    ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+
+    gen_tokens = model.generate(
+        input_ids,
+        max_new_tokens=100,
+        do_sample=True,
+        temperature=0.3,
+    )
+
+    gen_text = tokenizer.decode(gen_tokens[0])
+    print(gen_text)
+    yield gen_text
+
 
     messages = [
         {"role": "user", "content": "What is your favourite condiment?"},