mobinln committed on
Commit
6da1c26
1 Parent(s): 21399a4

feat: setup local Qwen2 0.5

Browse files
Files changed (2) hide show
  1. app.py +9 -23
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,8 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
8
 
9
 
10
  def respond(
@@ -25,19 +23,13 @@ def respond(
25
 
26
  messages.append({"role": "user", "content": message})
27
 
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
  max_tokens=max_tokens,
33
- stream=True,
34
  temperature=temperature,
35
  top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
 
42
 
43
  """
@@ -47,7 +39,7 @@ demo = gr.ChatInterface(
47
  respond,
48
  additional_inputs=[
49
  gr.Textbox(
50
- value="You are Marv, a chatbot that reluctantly answers questions with sarcastic responses in Persian only.",
51
  label="System message",
52
  ),
53
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
@@ -60,13 +52,7 @@ demo = gr.ChatInterface(
60
  label="Top-p (nucleus sampling)",
61
  ),
62
  ],
63
- description="A little bit of sarcasm doesn't kill anyone right? :)))",
64
- examples=[
65
- ["پایتخت فرانسه کجاست؟"],
66
- ["دو بعلاوه دو چند میشود؟"],
67
- ["یک جک بگو"],
68
- ],
69
- cache_examples=False,
70
  )
71
 
72
 
 
1
  import gradio as gr
2
+ from llama_cpp import Llama
3
 
4
+ model = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
5
+ llm = Llama.from_pretrained(repo_id=model, filename="*q8_0.gguf", verbose=True)
 
 
6
 
7
 
8
  def respond(
 
23
 
24
  messages.append({"role": "user", "content": message})
25
 
26
+ response = llm.create_chat_completion(
27
+ messages=messages,
 
 
28
  max_tokens=max_tokens,
 
29
  temperature=temperature,
30
  top_p=top_p,
31
+ )
32
+ return response
 
 
 
33
 
34
 
35
  """
 
39
  respond,
40
  additional_inputs=[
41
  gr.Textbox(
42
+ value="You are a helpful assistant.",
43
  label="System message",
44
  ),
45
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
 
52
  label="Top-p (nucleus sampling)",
53
  ),
54
  ],
55
+ description=model,
 
 
 
 
 
 
56
  )
57
 
58
 
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- huggingface_hub==0.22.2
 
 
1
+ huggingface_hub==0.22.2
2
+ llama-cpp-python==0.2.78