Update app.py
app.py
CHANGED
@@ -6,18 +6,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 
-
+MODELS = {
+    "Phi-3.5-mini": "microsoft/Phi-3.5-mini-instruct",
+    "Borea-Phi-3.5-mini-Jp": "AXCXEPT/Borea-Phi-3.5-mini-Instruct-Jp",
+    "EZO-Common-9B": "HODACHI/EZO-Common-9B-gemma-2-it"
+}
+
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-TITLE = "<h1><center>
+TITLE = "<h1><center>Multi-Model Chat Interface</center></h1>"
 
 PLACEHOLDER = """
 <center>
-<p>Hi, I'm
+<p>Hi, I'm an AI assistant. Ask me anything.</p>
 </center>
 """
 
-
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -30,20 +34,26 @@ h3 {
 }
 """
 
-device = "cuda"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16,
     bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type=
+    bnb_4bit_quant_type="nf4")
 
-
-
-
-
-
-
+model = None
+tokenizer = None
+
+def load_model(model_name):
+    global model, tokenizer
+    model_path = MODELS[model_name]
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        quantization_config=quantization_config)
 
 @spaces.GPU()
 def stream_chat(
@@ -55,7 +65,13 @@ def stream_chat(
     top_p: float = 1.0,
     top_k: int = 20,
     penalty: float = 1.2,
+    model_name: str = "Phi-3.5-mini"
 ):
+    global model, tokenizer
+
+    if model is None or tokenizer is None or model.name_or_path != MODELS[model_name]:
+        load_model(model_name)
+
     print(f'message: {message}')
     print(f'history: {history}')
 
@@ -76,12 +92,13 @@ def stream_chat(
 
     generate_kwargs = dict(
         input_ids=input_ids,
-        max_new_tokens
-        do_sample
-        top_p
-        top_k
-        temperature
-
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        repetition_penalty=penalty,
+        eos_token_id=tokenizer.eos_token_id,
         streamer=streamer,
     )
 
@@ -94,7 +111,6 @@ def stream_chat(
         buffer += new_text
         yield buffer
 
-
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 
 with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
@@ -103,12 +119,15 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
         fn=stream_chat,
         chatbot=chatbot,
         fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
+            gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value="Phi-3.5-mini",
+                label="Model",
+            ),
             gr.Textbox(
                 value="You are a helpful assistant",
                 label="System Prompt",
-                render=False,
             ),
             gr.Slider(
                 minimum=0,
@@ -116,7 +135,6 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
                 step=0.1,
                 value=0.8,
                 label="Temperature",
-                render=False,
             ),
             gr.Slider(
                 minimum=128,
@@ -124,7 +142,6 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
                 step=1,
                 value=1024,
                 label="Max new tokens",
-                render=False,
             ),
             gr.Slider(
                 minimum=0.0,
@@ -132,7 +149,6 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
                 step=0.1,
                 value=1.0,
                 label="top_p",
-                render=False,
             ),
             gr.Slider(
                 minimum=1,
@@ -140,15 +156,13 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
                 step=1,
                 value=20,
                 label="top_k",
-                render=False,
             ),
             gr.Slider(
-                minimum=
+                minimum=1.0,
                 maximum=2.0,
                 step=0.1,
                 value=1.2,
                 label="Repetition penalty",
-                render=False,
             ),
         ],
         examples=[
@@ -160,6 +174,5 @@ with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
         cache_examples=False,
     )
 
-
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
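The substance of this change is lazy model switching: the Space keeps a single model resident, and `stream_chat` reloads only when the dropdown selection no longer matches the cached model's `name_or_path` (an attribute transformers sets on loaded models). Below is a minimal runnable sketch of that caching pattern; `FakeModel` and `ensure_model` are illustrative stand-ins, not names from this Space, which does the check inline with real `AutoModelForCausalLM` objects.

```python
# Sketch of the lazy-loading pattern this commit introduces.
# FakeModel stands in for a transformers model; the real code loads
# AutoModelForCausalLM / AutoTokenizer with 4-bit quantization.

MODELS = {
    "Phi-3.5-mini": "microsoft/Phi-3.5-mini-instruct",
    "EZO-Common-9B": "HODACHI/EZO-Common-9B-gemma-2-it",
}

class FakeModel:
    def __init__(self, name_or_path):
        # transformers models expose a comparable name_or_path attribute
        self.name_or_path = name_or_path

model = None  # module-level cache: at most one model resident at a time

def load_model(model_name):
    global model
    print(f"loading {MODELS[model_name]} ...")
    model = FakeModel(MODELS[model_name])

def ensure_model(model_name):
    # Reload only when nothing is loaded yet or the selection changed,
    # mirroring `model.name_or_path != MODELS[model_name]` in stream_chat.
    if model is None or model.name_or_path != MODELS[model_name]:
        load_model(model_name)

ensure_model("Phi-3.5-mini")   # loads
ensure_model("Phi-3.5-mini")   # already resident, no reload
ensure_model("EZO-Common-9B")  # selection changed, reloads
```

The trade-off: switching models mid-session pays a full reload, but only one quantized checkpoint occupies GPU memory at a time.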
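For context on the `generate_kwargs` block the diff rewrites: `stream_chat` starts `model.generate` on a worker thread and iterates a `TextIteratorStreamer`, yielding a growing buffer back to Gradio. A self-contained sketch of that producer/consumer pattern follows; `sshleifer/tiny-gpt2` is used only as a small stand-in checkpoint, not a model from this Space.

```python
# Producer/consumer streaming pattern behind stream_chat().
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # tiny stand-in checkpoint for the demo
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer("Hello, world", return_tensors="pt").input_ids
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(
    input_ids=input_ids,
    max_new_tokens=20,
    do_sample=False,
    streamer=streamer,
)

# generate() blocks, so it runs on a worker thread while the main
# thread consumes decoded text chunks from the streamer as they arrive.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text  # the Gradio handler `yield`s this growing buffer
    print(buffer)
thread.join()
```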