Update app.py
app.py CHANGED
@@ -1,17 +1,29 @@
 import torch
-…
+import copy
 import gradio as gr
 import spaces
-from …
+from llama_cpp import Llama
 import os
+from huggingface_hub import hf_hub_download
 from threading import Thread
 
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL_ID = "google/gemma-2-27b-it"
-…
-…
-…
+MODEL_NAME = MODEL_ID.split("/")[-1]
+MODEL_FILE = "gemma-2-27b-it-Q4_K_M.gguf"
+
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+llm = Llama(
+    model_path=hf_hub_download(
+        repo_id=os.environ.get(MODEL_ID),
+        filename=os.environ.get(MODEL_FILE),
+    ),
+    n_ctx=4096,
+    n_gpu_layers=-1,
+    chat_format="gemma",
+)
 
 TITLE = "<h1><center>Chatbox</center></h1>"
 
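
A note on the new loading path above: hf_hub_download takes the repo id and filename themselves, but the committed code wraps them in os.environ.get(...), which looks up environment variables literally named "google/gemma-2-27b-it" and "gemma-2-27b-it-Q4_K_M.gguf" and returns None when they are unset, so the download would fail. Setting HF_HUB_ENABLE_HF_TRANSFER=1 also only takes effect if the hf_transfer package is installed. A minimal sketch of the load as presumably intended, with the caveats that the gated Gemma repo needs a token and that the Q4_K_M GGUF file may live in a community *-GGUF mirror rather than in google/gemma-2-27b-it itself:

```python
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL_ID = "google/gemma-2-27b-it"         # assumption: the GGUF may really sit in a separate *-GGUF repo
MODEL_FILE = "gemma-2-27b-it-Q4_K_M.gguf"  # 4-bit K-quant file name taken from the diff

# Download the GGUF weights once; hf_hub_download returns the local cache path.
# Pass the strings directly rather than os.environ.get(...) lookups.
model_path = hf_hub_download(
    repo_id=MODEL_ID,
    filename=MODEL_FILE,
    token=HF_TOKEN,  # Gemma weights are gated, so a token is needed for the download
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window, matching the old sliding_window value
    n_gpu_layers=-1,   # offload all layers to the GPU
    chat_format="gemma",
)
```
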
@@ -36,15 +48,6 @@ h3 {
     text-align: center;
 }
 """
-if torch.cuda.is_available():
-    model = AutoModelForCausalLM.from_pretrained(
-        MODELS,
-        device_map="auto",
-        quantization_config=BitsAndBytesConfig(load_in_4bit=True)
-    )
-    tokenizer = GemmaTokenizerFast.from_pretrained(MODELS)
-    model.config.sliding_window = 4096
-    model.eval()
 
 
 @spaces.GPU(duration=90)
@@ -58,33 +61,20 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
 
     print(f"Conversation is -\n{conversation}")
 
-…
-…
-    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-    gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(0)
-…
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-…
-    generate_kwargs = dict(
-        {"input_ids": input_ids},
-        streamer=streamer,
+    output = llm.create_chat_completion(
+        messages=conversation,
         top_k=top_k,
         top_p=top_p,
-…
-…
-…
-        temperature=temperature,
-        num_beams=1,
+        repeat_penalty=penalty,
+        max_tokens=max_new_tokens,
+        stream=True,
+        temperature=temperature,
     )
 
-…
-…
-…
-…
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
+    for out in output:
+        stream = copy.deepcopy(out)
+        temp += stream["choices"][0]["text"]
+        yield temp
 
 
 
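
Two things in the rewritten streaming code above are worth flagging. With stream=True, create_chat_completion yields OpenAI-style chat chunks whose text sits under choices[0]["delta"]["content"]; the ["text"] key belongs to the plain create_completion API, so the committed loop would raise a KeyError. The accumulator temp is also never initialized, and the copy.deepcopy of each chunk is unnecessary. A sketch of the generator as it presumably should read, assuming the Gradio history arrives as (user, assistant) pairs as in the old code:

```python
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int,
                top_p: float, top_k: int, penalty: float):
    # Rebuild the conversation as OpenAI-style message dicts, the format
    # create_chat_completion expects (history assumed to be (user, assistant) pairs).
    conversation = []
    for user, assistant in history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=conversation,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=penalty,
        max_tokens=max_new_tokens,
        stream=True,
        temperature=temperature,
    )

    temp = ""  # accumulated reply; must exist before the += below
    for chunk in output:
        delta = chunk["choices"][0]["delta"]
        # Streamed chat chunks carry text under delta["content"]; the first
        # chunk usually holds only the role, so guard the lookup.
        if "content" in delta:
            temp += delta["content"]
            yield temp
```
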
@@ -113,7 +103,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
             maximum=2048,
             step=1,
             value=1024,
-            label="Max …
+            label="Max Tokens",
             render=False,
         ),
         gr.Slider(