Neon-tech committed
Commit 57cb2d0 · verified · 1 Parent(s): 9f1d2a4

Update app.py

Files changed (1):
  app.py  +16 -20
app.py CHANGED
@@ -1,13 +1,16 @@
 import os
-import torch
 import gradio as gr
 import psutil
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from llama_cpp import Llama
 
-os.environ["HF_HOME"] = "/data/hf_cache"
+os.environ["HF_HOME"] = "/tmp/hf_cache"
 
-tokenizer = AutoTokenizer.from_pretrained("/data/model2")
-model = AutoModelForCausalLM.from_pretrained("/data/model2", device_map="cpu", offload_folder="/tmp/offload")
+model = Llama.from_pretrained(
+    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
+    filename="Qwen3.5-35B-A3B-Q2_K.gguf",
+    n_ctx=2048,
+    n_threads=16,
+)
 
 def get_stats():
     process = psutil.Process(os.getpid())
@@ -18,27 +21,20 @@ def get_stats():
     return f"RAM: {ram:.2f} GB | /tmp: {disk_tmp:.2f} GB | /data: {disk_data:.2f} GB | CPU: {cpu}%"
 
 def chat(message, history):
-    messages = []
+    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]
     for user, assistant in history:
         messages.append({"role": "user", "content": user})
         messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
 
-    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-    from transformers import TextIteratorStreamer
-    from threading import Thread
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(**inputs, max_new_tokens=512, streamer=streamer)
-
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
     output = ""
-    for token in streamer:
-        output += token
+    for chunk in model.create_chat_completion(
+        messages=messages,
+        max_tokens=2048,
+        stream=True
+    ):
+        delta = chunk["choices"][0]["delta"].get("content", "")
+        output += delta
     yield output
 
 with gr.Blocks() as demo:
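
The rewritten chat() leans on llama-cpp-python's OpenAI-compatible streaming schema: with stream=True, create_chat_completion yields chunk dicts, and the first delta usually carries only the assistant role, so "content" can be missing, which is why the code reads it with .get("content", ""). Note also that Llama.from_pretrained fetches the GGUF file from the Hugging Face Hub via huggingface_hub, so its download cache lands under HF_HOME (now pointed at /tmp/hf_cache). Below is a minimal sketch of the same consumption pattern, assuming model is the Llama instance constructed above; the prompt string is illustrative only:

    # Drains one streamed completion and prints text as it arrives.
    # Each chunk mirrors the OpenAI streaming format; "content" may be
    # absent from the first delta, hence the .get() with a default.
    for chunk in model.create_chat_completion(
        messages=[{"role": "user", "content": "ping"}],
        max_tokens=16,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"].get("content", "")
        print(delta, end="", flush=True)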