Spaces:

MohamedRashad
/

Arabic-Chatbot-Arena

Running on Zero

App Files Community

MohamedRashad committed on Aug 21

Commit

a5a26a2

•

1 Parent(s): 2f1457b

chore: Refactor model ID handling in app.py and update requirements.txt

Browse files

Files changed (2) hide show

app.py +19 -12
requirements.txt +6 -5

app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 import gradio as gr
 from threading import Thread
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 models_available = [
     "MohamedRashad/Arabic-Orpo-Llama-3-8B-Instruct",
@@ -20,6 +23,9 @@ models_available = [
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
 def load_model_a(model_id):
     global tokenizer_a, model_a
@@ -29,19 +35,20 @@ def load_model_a(model_id):
     try:
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
             device_map="auto",
-            attn_implementation="flash_attention_2",
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
     return gr.update(label=model_id)
 def load_model_b(model_id):
@@ -52,19 +59,20 @@ def load_model_b(model_id):
     try:
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
             device_map="auto",
-            attn_implementation="flash_attention_2",
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
     return gr.update(label=model_id)
 @spaces.GPU()
@@ -105,8 +113,7 @@ def generate_both(system_prompt, input_text, chatbot_a, chatbot_b, max_new_token
         streamer=text_streamer_a,
         max_new_tokens=max_new_tokens,
         pad_token_id=tokenizer_a.eos_token_id,
-        do_sample=False,
-        # do_sample=True if temperature > 0 else False,
         temperature=temperature,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
@@ -116,7 +123,7 @@ def generate_both(system_prompt, input_text, chatbot_a, chatbot_b, max_new_token
         streamer=text_streamer_b,
         max_new_tokens=max_new_tokens,
         pad_token_id=tokenizer_b.eos_token_id,
-        do_sample=False,
         temperature=temperature,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
@@ -168,7 +175,7 @@ arena_notes = """Important Notes:
 - Sometimes an error may occur when generating the response, in this case, please try again.
 """
-with gr.Blocks(title="Arabic-ORPO-Llama3") as demo:
     with gr.Column():
         gr.HTML("<center><h1>Arabic Chatbot Comparison</h1></center>")
         gr.Markdown(arena_notes)

+import os
+os.environ["CUDA_LAUNCH_BLOCKING"]="1"
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 import gradio as gr
 from threading import Thread
+# import subprocess
+# subprocess.run('pip install -U flash-attn transformers sentencepiece', shell=True)
 models_available = [
     "MohamedRashad/Arabic-Orpo-Llama-3-8B-Instruct",
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
+torch_dtype = torch.bfloat16
+attn_implementation = "flash_attention_2"
+# attn_implementation = None
 def load_model_a(model_id):
     global tokenizer_a, model_a
     try:
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
+            torch_dtype=torch_dtype,
             device_map="auto",
+            attn_implementation=attn_implementation,
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
+            torch_dtype=torch_dtype,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
+    model_a.gradient_checkpointing_enable()
     return gr.update(label=model_id)
 def load_model_b(model_id):
     try:
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
+            torch_dtype=torch_dtype,
             device_map="auto",
+            attn_implementation=attn_implementation,
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
+            torch_dtype=torch_dtype,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
+    model_b.gradient_checkpointing_enable()
     return gr.update(label=model_id)
 @spaces.GPU()
         streamer=text_streamer_a,
         max_new_tokens=max_new_tokens,
         pad_token_id=tokenizer_a.eos_token_id,
+        do_sample=True,
         temperature=temperature,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
         streamer=text_streamer_b,
         max_new_tokens=max_new_tokens,
         pad_token_id=tokenizer_b.eos_token_id,
+        do_sample=True,
         temperature=temperature,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
 - Sometimes an error may occur when generating the response, in this case, please try again.
 """
+with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML("<center><h1>Arabic Chatbot Comparison</h1></center>")
         gr.Markdown(arena_notes)

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-transformers
-torch
-accelerate
-spaces
-sentencepiece

+transformers==4.44.1
+torch==2.4.0
+accelerate==0.33.0
+sentencepiece==0.2.0
+flash-attn==2.6.3
+spaces