rphrp1985 committed on
Commit ab6fbd7
1 Parent(s): 23debf4

Update app.py

Files changed (1)
  1. app.py +22 -44
app.py CHANGED
@@ -19,33 +19,18 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # model_id = "mistralai/Mistral-7B-v0.3"
 
-# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+model_id = "CohereForAI/c4ai-command-r-plus-4bit"
 
-from airllm import AirLLMLlama2
 
-MAX_LENGTH = 128
-from huggingface_hub import hf_hub_download
-from huggingface_hub import snapshot_download
 
 
-snapshot_download(
-    repo_id="CohereForAI/c4ai-command-r-plus-4bit",
-    # filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
-    local_dir = "./models",
-    token= token
-    )
-
-# could use hugging face model repo id:
-model = AirLLMLlama2("./models", )
-
+tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
 
-# tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
-
-# model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
-# # attn_implementation="flash_attention_2",
-# # low_cpu_mem_usage=True,
-# device_map="auto"
-# )
+model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
+    # attn_implementation="flash_attention_2",
+    # low_cpu_mem_usage=True,
+    device_map="auto"
+)
 
 
 
@@ -58,28 +43,21 @@ def respond(
     temperature,
     top_p,
 ):
-    input_text = [
-        'What is the capital of United States?',
-    ]
-
-    input_tokens = model.tokenizer(input_text,
-        return_tensors="pt",
-        return_attention_mask=False,
-        truncation=True,
-        max_length=MAX_LENGTH,
-        padding=True)
-
-    generation_output = model.generate(
-        input_tokens['input_ids'].cuda(),
-        max_new_tokens=20,
-        use_cache=True,
-        return_dict_in_generate=True)
-
-    output = model.tokenizer.decode(generation_output.sequences[0])
-
-    print(output)
-    yield output
-
+    messages = [{"role": "user", "content": "Hello, how are you?"}]
+    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+    ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+
+    gen_tokens = model.generate(
+        input_ids,
+        max_new_tokens=100,
+        do_sample=True,
+        temperature=0.3,
+    )
+
+    gen_text = tokenizer.decode(gen_tokens[0])
+    print(gen_text)
+    yield gen_text
+
 
     messages = [
         {"role": "user", "content": "What is your favourite condiment?"},