Mykes committed on
Commit 70be70c
1 Parent(s): 3854a85

Update app.py

Files changed (1)
  1. app.py +141 -68
app.py CHANGED
@@ -2,75 +2,148 @@ import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
 
5
- # Download the model
6
- model_name = "Mykes/med_tinyllama_gguf"
7
- filename = "unsloth.Q4_K_M.gguf"
8
- model_path = hf_hub_download(repo_id=model_name, filename=filename)
9
-
10
- # Initialize the model
11
- # model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
12
- model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
13
- # def preload_model(model, preload_tokens=1024):
14
- # # Dummy call to load model into RAM by accessing parts of it
15
- # try:
16
- # dummy_input = " " * preload_tokens
17
- # _ = model(dummy_input, max_tokens=1)
18
- # print("Model preloaded into RAM.")
19
- # except Exception as e:
20
- # print(f"Error preloading model: {e}")
21
-
22
- # # Preload the model into RAM
23
- # preload_model(model)
24
- def respond(
25
- message,
26
- history: list[tuple[str, str]],
27
- system_message,
28
- max_tokens,
29
- temperature,
30
- top_p,
31
- ):
32
- history = history[-3:]
33
- # Construct the prompt
34
- prompt = f"<s>{system_message}\n\n"
35
- for user_msg, assistant_msg in history:
36
- prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
37
- prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
38
-
39
- # Generate response
40
- response = ""
41
- for token in model(
42
- prompt,
43
- max_tokens=max_tokens,
44
- temperature=temperature,
45
- top_p=top_p,
46
- stream=True,
47
- stop=["<|end|>", "</s>"]
48
  ):
49
- response += token['choices'][0]['text']
50
- yield response.strip()
51
-
52
- # Create the Gradio interface
53
- demo = gr.ChatInterface(
54
- respond,
55
- undo_btn="Отменить",
56
- clear_btn="Очистить",
57
- additional_inputs=[
58
- # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
59
- gr.Textbox(value="", label="System message"),
60
- gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
61
- gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
62
- gr.Slider(
63
- minimum=0.1,
64
- maximum=1.0,
65
- value=0.9,
66
- step=0.05,
67
- label="Top-p (nucleus sampling)",
68
- ),
69
- ],
70
- title="Med TinyLlama Chat",
71
- description="Chat with the Med TinyLlama model for medical information.",
72
- )
73
 
74
  if __name__ == "__main__":
 
 
 
 
 
 
75
  model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
76
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
 
5
+ def create_responder(model):
6
+ def respond(
7
+ message,
8
+ history: list[tuple[str, str]],
9
+ system_message,
10
+ max_tokens,
11
+ temperature,
12
+ top_p,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  ):
14
+ history = history[-3:]
15
+ # Construct the prompt
16
+ prompt = f"<s>{system_message}\n\n"
17
+ for user_msg, assistant_msg in history:
18
+ prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
19
+ prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
20
+
21
+ # Generate response
22
+ response = ""
23
+ for token in model(
24
+ prompt,
25
+ max_tokens=max_tokens,
26
+ temperature=temperature,
27
+ top_p=top_p,
28
+ stream=True,
29
+ stop=["<|end|>", "</s>"]
30
+ ):
31
+ response += token['choices'][0]['text']
32
+ yield response.strip()
33
+
34
+ return respond
 
 
 
35
 
36
  if __name__ == "__main__":
37
+ # Download the model
38
+ model_name = "Mykes/med_tinyllama_gguf"
39
+ filename = "unsloth.Q4_K_M.gguf"
40
+ model_path = hf_hub_download(repo_id=model_name, filename=filename)
41
+
42
+ # Initialize the model
43
  model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
44
+
45
+ # Create a responder function with the model
46
+ respond = create_responder(model)
47
+
48
+ # Create the Gradio interface
49
+ demo = gr.ChatInterface(
50
+ respond,
51
+ undo_btn="Отменить",
52
+ clear_btn="Очистить",
53
+ additional_inputs=[
54
+ gr.Textbox(value="", label="System message"),
55
+ gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
56
+ gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
57
+ gr.Slider(
58
+ minimum=0.1,
59
+ maximum=1.0,
60
+ value=0.9,
61
+ step=0.05,
62
+ label="Top-p (nucleus sampling)"
63
+ ),
64
+ ],
65
+ title="Med TinyLlama Chat",
66
+ description="Chat with the Med TinyLlama model for medical information.",
67
+ )
68
+
69
+ demo.launch()
70
+
71
+
72
+
73
+
74
+
75
+ # import gradio as gr
76
+ # from llama_cpp import Llama
77
+ # from huggingface_hub import hf_hub_download
78
+
79
+ # # Download the model
80
+ # model_name = "Mykes/med_tinyllama_gguf"
81
+ # filename = "unsloth.Q4_K_M.gguf"
82
+ # model_path = hf_hub_download(repo_id=model_name, filename=filename)
83
+
84
+ # # Initialize the model
85
+ # # model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
86
+ # model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
87
+ # # def preload_model(model, preload_tokens=1024):
88
+ # # # Dummy call to load model into RAM by accessing parts of it
89
+ # # try:
90
+ # # dummy_input = " " * preload_tokens
91
+ # # _ = model(dummy_input, max_tokens=1)
92
+ # # print("Model preloaded into RAM.")
93
+ # # except Exception as e:
94
+ # # print(f"Error preloading model: {e}")
95
+
96
+ # # # Preload the model into RAM
97
+ # # preload_model(model)
98
+ # def respond(
99
+ # message,
100
+ # history: list[tuple[str, str]],
101
+ # system_message,
102
+ # max_tokens,
103
+ # temperature,
104
+ # top_p,
105
+ # ):
106
+ # history = history[-3:]
107
+ # # Construct the prompt
108
+ # prompt = f"<s>{system_message}\n\n"
109
+ # for user_msg, assistant_msg in history:
110
+ # prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
111
+ # prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
112
+
113
+ # # Generate response
114
+ # response = ""
115
+ # for token in model(
116
+ # prompt,
117
+ # max_tokens=max_tokens,
118
+ # temperature=temperature,
119
+ # top_p=top_p,
120
+ # stream=True,
121
+ # stop=["<|end|>", "</s>"]
122
+ # ):
123
+ # response += token['choices'][0]['text']
124
+ # yield response.strip()
125
+
126
+ # # Create the Gradio interface
127
+ # demo = gr.ChatInterface(
128
+ # respond,
129
+ # undo_btn="Отменить",
130
+ # clear_btn="Очистить",
131
+ # additional_inputs=[
132
+ # # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
133
+ # gr.Textbox(value="", label="System message"),
134
+ # gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
135
+ # gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
136
+ # gr.Slider(
137
+ # minimum=0.1,
138
+ # maximum=1.0,
139
+ # value=0.9,
140
+ # step=0.05,
141
+ # label="Top-p (nucleus sampling)",
142
+ # ),
143
+ # ],
144
+ # title="Med TinyLlama Chat",
145
+ # description="Chat with the Med TinyLlama model for medical information.",
146
+ # )
147
+
148
+ # if __name__ == "__main__":
149
+ # demo.launch()