Mykes committed on
Commit 4bf96b6
1 Parent(s): 70be70c

Update app.py

Files changed (1)
  1. app.py +68 -142
app.py CHANGED
@@ -2,148 +2,74 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-def create_responder(model):
-    def respond(
-        message,
-        history: list[tuple[str, str]],
-        system_message,
-        max_tokens,
-        temperature,
-        top_p,
+# Download the model
+model_name = "Mykes/med_tinyllama_gguf"
+filename = "unsloth.Q4_K_M.gguf"
+model_path = hf_hub_download(repo_id=model_name, filename=filename)
+
+# Initialize the model
+# model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
+model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
+# def preload_model(model, preload_tokens=1024):
+#     # Dummy call to load model into RAM by accessing parts of it
+#     try:
+#         dummy_input = " " * preload_tokens
+#         _ = model(dummy_input, max_tokens=1)
+#         print("Model preloaded into RAM.")
+#     except Exception as e:
+#         print(f"Error preloading model: {e}")
+
+# # Preload the model into RAM
+# preload_model(model)
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    history = history[-3:]
+    # Construct the prompt
+    prompt = f"<s>{system_message}\n\n"
+    for user_msg, assistant_msg in history:
+        prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
+    prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
+
+    # Generate response
+    response = ""
+    for token in model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stream=True,
+        stop=["<|end|>", "</s>"]
     ):
-        history = history[-3:]
-        # Construct the prompt
-        prompt = f"<s>{system_message}\n\n"
-        for user_msg, assistant_msg in history:
-            prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
-        prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
-
-        # Generate response
-        response = ""
-        for token in model(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=True,
-            stop=["<|end|>", "</s>"]
-        ):
-            response += token['choices'][0]['text']
-            yield response.strip()
-
-    return respond
+        response += token['choices'][0]['text']
+        yield response.strip()
+
+# Create the Gradio interface
+demo = gr.ChatInterface(
+    respond,
+    undo_btn="Отменить",
+    clear_btn="Очистить",
+    additional_inputs=[
+        # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.9,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+    title="Med TinyLlama Chat",
+    description="Chat with the Med TinyLlama model for medical information.",
+)
 
 if __name__ == "__main__":
-    # Download the model
-    model_name = "Mykes/med_tinyllama_gguf"
-    filename = "unsloth.Q4_K_M.gguf"
-    model_path = hf_hub_download(repo_id=model_name, filename=filename)
-
-    # Initialize the model
-    model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
-
-    # Create a responder function with the model
-    respond = create_responder(model)
-
-    # Create the Gradio interface
-    demo = gr.ChatInterface(
-        respond,
-        undo_btn="Отменить",
-        clear_btn="Очистить",
-        additional_inputs=[
-            gr.Textbox(value="", label="System message"),
-            gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-            gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.9,
-                step=0.05,
-                label="Top-p (nucleus sampling)"
-            ),
-        ],
-        title="Med TinyLlama Chat",
-        description="Chat with the Med TinyLlama model for medical information.",
-    )
-
-    demo.launch()
-
-
-
-
-
-# import gradio as gr
-# from llama_cpp import Llama
-# from huggingface_hub import hf_hub_download
-
-# # Download the model
-# model_name = "Mykes/med_tinyllama_gguf"
-# filename = "unsloth.Q4_K_M.gguf"
-# model_path = hf_hub_download(repo_id=model_name, filename=filename)
-
-# # Initialize the model
-# # model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
-# model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
-# # def preload_model(model, preload_tokens=1024):
-# #     # Dummy call to load model into RAM by accessing parts of it
-# #     try:
-# #         dummy_input = " " * preload_tokens
-# #         _ = model(dummy_input, max_tokens=1)
-# #         print("Model preloaded into RAM.")
-# #     except Exception as e:
-# #         print(f"Error preloading model: {e}")
-
-# # # Preload the model into RAM
-# # preload_model(model)
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     history = history[-3:]
-#     # Construct the prompt
-#     prompt = f"<s>{system_message}\n\n"
-#     for user_msg, assistant_msg in history:
-#         prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
-#     prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
-
-#     # Generate response
-#     response = ""
-#     for token in model(
-#         prompt,
-#         max_tokens=max_tokens,
-#         temperature=temperature,
-#         top_p=top_p,
-#         stream=True,
-#         stop=["<|end|>", "</s>"]
-#     ):
-#         response += token['choices'][0]['text']
-#         yield response.strip()
-
-# # Create the Gradio interface
-# demo = gr.ChatInterface(
-#     respond,
-#     undo_btn="Отменить",
-#     clear_btn="Очистить",
-#     additional_inputs=[
-#         # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
-#         gr.Textbox(value="", label="System message"),
-#         gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
-#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-#         gr.Slider(
-#             minimum=0.1,
-#             maximum=1.0,
-#             value=0.9,
-#             step=0.05,
-#             label="Top-p (nucleus sampling)",
-#         ),
-#     ],
-#     title="Med TinyLlama Chat",
-#     description="Chat with the Med TinyLlama model for medical information.",
-# )
-
-# if __name__ == "__main__":
-#     demo.launch()
+    demo.launch()