0.10 removing flash attn.
app.py CHANGED
@@ -7,8 +7,6 @@ from huggingface_hub import login
 import os
 
 from threading import Thread
-import subprocess
-subprocess.run('pip install -U flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -23,7 +21,6 @@ models_available = [
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
 torch_dtype = torch.bfloat16
-attn_implementation = "flash_attention_2"
 
 def apply_chat_template(messages, add_generation_prompt=False):
     """
@@ -54,24 +51,13 @@ def load_model_a(model_id):
     global tokenizer_a, model_a, model_id_a
     model_id_a = model_id # need to access model_id with tokenizer
     tokenizer_a = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model A: {tokenizer_a.eos_token}")
-    try:
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Using default attention implementation in {model_id}")
-        logging.debug(f"Error: {e}")
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model A eos_token: {tokenizer_a.eos_token}")
+    model_a = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_a.tie_weights()
     return gr.update(label=model_id)
 
@@ -79,24 +65,13 @@ def load_model_b(model_id):
     global tokenizer_b, model_b, model_id_b
     model_id_b = model_id
     tokenizer_b = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model B: {tokenizer_b.eos_token}")
-    try:
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Error: {e}")
-        logging.debug(f"Using default attention implementation in {model_id}")
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model B eos_token: {tokenizer_b.eos_token}")
+    model_b = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_b.tie_weights()
     return gr.update(label=model_id)
 
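For context (not part of this commit): the deleted code installed flash-attn at startup via pip and then tried attn_implementation="flash_attention_2", falling back to the default implementation on failure. Below is a minimal sketch of how optional FlashAttention support could be kept without the startup install, assuming the same transformers loading path used in app.py; the helper name pick_attn_implementation and the "sdpa" fallback are illustrative choices, not code from this Space.

import importlib.util
import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch_dtype = torch.bfloat16

def pick_attn_implementation():
    # Hypothetical helper: request flash_attention_2 only when the flash_attn
    # package is already installed and a CUDA device is present; otherwise use
    # PyTorch's scaled-dot-product attention.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return "sdpa"

def load_model(model_id):
    # Mirrors the loading pattern of load_model_a / load_model_b in app.py.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    attn_implementation = pick_attn_implementation()
    logging.debug(f"Loading {model_id} with attn_implementation={attn_implementation}")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map="auto",
        attn_implementation=attn_implementation,
        trust_remote_code=True,
    ).eval()
    return tokenizer, model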