Spaces:

vikkaird
/

llama-3-updated

Sleeping

App Files Files Community

umair894 committed on May 28, 2024

Commit

720c059

verified ·

1 Parent(s): 9ab4214

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -41

app.py CHANGED Viewed

@@ -1,29 +1,18 @@
 import gradio as gr
 import os
-import spaces
-from transformers import GemmaTokenizer, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
-# Set an environment variable
-#HF_TOKEN = os.environ.get("HF_TOKEN", None)
-DESCRIPTION = '''
-<div>
-<h1 style="text-align: center;">Llama3 8B Fine-tuned</h1>
-'''
-# LICENSE = """
-# """
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">I am Vikk, AI legel Assistant, Ask me anything</p>
 </div>
 """
 css = """
 h1 {
   text-align: center;
@@ -37,20 +26,42 @@ h1 {
 }
 """
-# Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("umair894/llama3")
-model = AutoModelForCausalLM.from_pretrained("umair894/llama3", device_map="cuda:0")  # to("auto")
 terminators = [
     tokenizer.eos_token_id,
-    tokenizer.convert_tokens_to_ids("<|eot_id|>") #eos_token
 ]
-#@spaces.GPU(duration=120)
 def chat_llama3_8b(message: str,
-              history: list,
-              temperature: float,
-              max_new_tokens: int
-             ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
     Args:
@@ -61,24 +72,30 @@ def chat_llama3_8b(message: str,
     Returns:
         str: The generated response.
     """
     conversation = []
     for user, assistant in history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids= input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         eos_token_id=terminators,
     )
-    # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
     if temperature == 0:
         generate_kwargs['do_sample'] = False
@@ -88,17 +105,13 @@ def chat_llama3_8b(message: str,
     outputs = []
     for text in streamer:
         outputs.append(text)
-        #print(outputs)
         yield "".join(outputs)
 # Gradio block
-chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='ChatInterface')
 with gr.Blocks(fill_height=True, css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-    #gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
     gr.ChatInterface(
         fn=chat_llama3_8b,
         chatbot=chatbot,
@@ -117,14 +130,13 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                       value=512,
                       label="Max new tokens",
                       render=False ),
-            ],
         examples=[
-            ['I got a ticket.'],
-            ],
         cache_examples=False,
-                     )
-    #gr.Markdown(LICENSE)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
+from unsloth.chat_templates import get_chat_template
+from unsloth import FastLanguageModel
+import torch
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
 </div>
 """
 css = """
 h1 {
   text-align: center;
 }
 """
+max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
+dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="umair894/llama3",
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+)
+FastLanguageModel.for_inference(model)
+# Apply chat template to the tokenizer
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="llama-3",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
+    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
+    map_eos_token=True,  # Maps the template's stop token (e.g. <|im_end|>) to </s> instead
+)
 terminators = [
     tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("")
 ]
+# Drop any terminator IDs that are None; fall back to the tokenizer's EOS token ID if none remain
+terminators = [token_id for token_id in terminators if token_id is not None]
+if not terminators:
+    terminators = [tokenizer.eos_token_id]  # Ensure there is a valid EOS token
 def chat_llama3_8b(message: str,
+                   history: list,
+                   temperature: float,
+                   max_new_tokens: int
+                  ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
     Args:
     Returns:
         str: The generated response.
     """
     conversation = []
     for user, assistant in history:
+        conversation.extend([{"from": "human", "value": user}, {"from": "gpt", "value": assistant}])
+    conversation.append({"from": "human", "value": message})
+    input_ids = tokenizer.apply_chat_template(
+        conversation,
+        tokenize=True,
+        add_generation_prompt=True,  # Must add for generation
+        return_tensors="pt",
+    ).to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         eos_token_id=terminators,
     )
     if temperature == 0:
         generate_kwargs['do_sample'] = False
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 # Gradio block
+chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
 with gr.Blocks(fill_height=True, css=css) as demo:
     gr.ChatInterface(
         fn=chat_llama3_8b,
         chatbot=chatbot,
                       value=512,
                       label="Max new tokens",
                       render=False ),
+        ],
         examples=[
+            ['How can i file for a student loan case?']
+        ],
         cache_examples=False,
+    )
 if __name__ == "__main__":
+    demo.launch(debug=True)