vilarin committed
Commit 7cb9567
1 parent: a936635

Update app.py

Files changed (1): app.py (+31 -8)
app.py CHANGED
@@ -6,15 +6,16 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 
-MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
+MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL = os.environ.get("MODEL_ID")
 
-TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"
+TITLE = "<h1><center>Meta-Llama3.1-Chat</center></h1>"
 
 PLACEHOLDER = """
 <center>
-<p>Hi! How can I help you today?</p>
+<p>😊Hi! How can I help you today?</p><br>
+<p>✨Select Meta-Llama3.1-8B/70B in Advanced Options</p>
 </center>
 """
 
 
@@ -33,16 +34,26 @@ h3 {
 
 device = "cuda" # for GPU usage or "cpu" for CPU usage
 
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4")
+
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL,
+model_8b = AutoModelForCausalLM.from_pretrained(
+    MODEL_LIST[0],
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config)
+model_70b = AutoModelForCausalLM.from_pretrained(
+    MODEL_LIST[1],
     torch_dtype=torch.bfloat16,
     device_map="auto",
     quantization_config=quantization_config)
 
-@spaces.GPU()
+@spaces.GPU(duration=120)
 def stream_chat(
     message: str,
     history: list,
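Note on this hunk: moving from load_in_8bit to 4-bit NF4 roughly halves weight memory to about half a byte per parameter, on the order of 4 GB for the 8B weights and 35 GB for the 70B weights before KV-cache and activation overhead. A minimal standalone sketch of the same NF4 load (the model id is just MODEL_LIST[0] from above; this is not part of the commit):

# Sketch: standalone NF4 4-bit load mirroring the config introduced above.
# Assumes transformers, accelerate, and bitsandbytes are installed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # pack weights into 4-bit blocks
    bnb_4bit_compute_dtype=torch.bfloat16,  # dequantize to bf16 for matmuls
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",              # NormalFloat4 rather than plain int4
)

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # MODEL_LIST[0] above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # let accelerate place the quantized layers
    quantization_config=bnb_config,
)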
@@ -52,6 +63,7 @@ def stream_chat(
     top_p: float = 1.0,
     top_k: int = 20,
     penalty: float = 1.2,
+    choice: str = "Meta-Llama-3.1-8B"
 ):
     print(f'message: {message}')
     print(f'history: {history}')
@@ -67,6 +79,11 @@
 
     conversation.append({"role": "user", "content": message})
 
+    if choice == "Meta-Llama-3.1-8B":
+        model = model_8b
+    else:
+        model = model_70b
+
     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
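Note on this hunk: the generation path follows the standard thread-plus-streamer pattern, since model.generate blocks until completion. A minimal sketch under that assumption (names follow the surrounding code; the max_new_tokens value is illustrative):

# Sketch of the thread + TextIteratorStreamer pattern used by stream_chat.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, input_ids, max_new_tokens=1024):
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks until done, so park it on a background thread...
    Thread(target=model.generate, kwargs=dict(
        input_ids=input_ids, streamer=streamer,
        max_new_tokens=max_new_tokens)).start()
    # ...and drain the streamer here; each yield re-renders the partial reply.
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer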
@@ -101,7 +118,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         fn=stream_chat,
         chatbot=chatbot,
         fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Advanced Options", open=False, render=False),
         additional_inputs=[
             gr.Textbox(
                 value="You are a helpful assistant",
@@ -148,6 +165,12 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 label="Repetition penalty",
                 render=False,
             ),
+            gr.Radio(
+                ["Meta-Llama-3.1-8B", "Meta-Llama-3.1-70B"],
+                value="Meta-Llama-3.1-8B",
+                label="Load Model",
+                render=False,
+            ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
 