5to9 committed
Commit 2569b24
1 Parent(s): eea02ee

0.10 removing flash attn.

Files changed (1)
  1. app.py +14 -39
app.py CHANGED
@@ -7,8 +7,6 @@ from huggingface_hub import login
 import os
 
 from threading import Thread
-import subprocess
-subprocess.run('pip install -U flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -23,7 +21,6 @@ models_available = [
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
 torch_dtype = torch.bfloat16
-attn_implementation = "flash_attention_2"
 
 def apply_chat_template(messages, add_generation_prompt=False):
     """
@@ -54,24 +51,13 @@ def load_model_a(model_id):
     global tokenizer_a, model_a, model_id_a
     model_id_a = model_id # need to access model_id with tokenizer
     tokenizer_a = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model A: {tokenizer_a.eos_token}")
-    try:
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Using default attention implementation in {model_id}")
-        logging.debug(f"Error: {e}")
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model A eos_token: {tokenizer_a.eos_token}")
+    model_a = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_a.tie_weights()
     return gr.update(label=model_id)
 
@@ -79,24 +65,13 @@ def load_model_b(model_id):
     global tokenizer_b, model_b, model_id_b
     model_id_b = model_id
     tokenizer_b = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model B: {tokenizer_b.eos_token}")
-    try:
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Error: {e}")
-        logging.debug(f"Using default attention implementation in {model_id}")
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model B eos_token: {tokenizer_b.eos_token}")
+    model_b = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_b.tie_weights()
     return gr.update(label=model_id)
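For context on what was dropped: the old code force-installed flash-attn via subprocess at startup, requested attn_implementation="flash_attention_2", and fell back to the default attention implementation in a try/except. A lighter-weight alternative would be to request FlashAttention-2 only when the package is already importable, skipping the install entirely. A minimal sketch of that pattern, assuming a transformers version that accepts the attn_implementation kwarg; load_model is a hypothetical helper, not part of app.py:

import importlib.util
import logging

import torch
from transformers import AutoModelForCausalLM


def load_model(model_id: str, torch_dtype=torch.bfloat16):
    # Shared kwargs mirroring what app.py passes to from_pretrained.
    kwargs = dict(
        torch_dtype=torch_dtype,
        device_map="auto",
        trust_remote_code=True,
    )
    # Only request flash_attention_2 when the flash_attn package is
    # importable; otherwise transformers would raise at load time.
    if importlib.util.find_spec("flash_attn") is not None:
        kwargs["attn_implementation"] = "flash_attention_2"
    try:
        return AutoModelForCausalLM.from_pretrained(model_id, **kwargs).eval()
    except Exception as e:
        # Some architectures reject flash_attention_2 even when the package
        # is installed, so keep the fallback the old code had.
        logging.debug(f"Falling back to default attention for {model_id}: {e}")
        kwargs.pop("attn_implementation", None)
        return AutoModelForCausalLM.from_pretrained(model_id, **kwargs).eval()

This keeps startup cheap (no pip install in a subprocess) while still using FlashAttention-2 in environments where it happens to be available.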