5to9 committed
Commit 2569b24
1 Parent(s): eea02ee

0.10 removing flash attn.

Files changed (1)
  1. app.py +14 -39
app.py CHANGED
@@ -7,8 +7,6 @@ from huggingface_hub import login
 import os
 
 from threading import Thread
-import subprocess
-subprocess.run('pip install -U flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -23,7 +21,6 @@ models_available = [
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
 torch_dtype = torch.bfloat16
-attn_implementation = "flash_attention_2"
 
 def apply_chat_template(messages, add_generation_prompt=False):
     """
@@ -54,24 +51,13 @@ def load_model_a(model_id):
     global tokenizer_a, model_a, model_id_a
     model_id_a = model_id # need to access model_id with tokenizer
     tokenizer_a = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model A: {tokenizer_a.eos_token}")
-    try:
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Using default attention implementation in {model_id}")
-        logging.debug(f"Error: {e}")
-        model_a = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model A eos_token: {tokenizer_a.eos_token}")
+    model_a = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_a.tie_weights()
     return gr.update(label=model_id)
 
@@ -79,24 +65,13 @@ def load_model_b(model_id):
     global tokenizer_b, model_b, model_id_b
     model_id_b = model_id
     tokenizer_b = AutoTokenizer.from_pretrained(model_id)
-    logging.debug(f"model B: {tokenizer_b.eos_token}")
-    try:
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            attn_implementation=attn_implementation,
-            trust_remote_code=True,
-        ).eval()
-    except Exception as e:
-        logging.debug(f"Error: {e}")
-        logging.debug(f"Using default attention implementation in {model_id}")
-        model_b = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-            trust_remote_code=True,
-        ).eval()
+    logging.debug(f"***** model B eos_token: {tokenizer_b.eos_token}")
+    model_b = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        trust_remote_code=True,
+    ).eval()
     model_b.tie_weights()
     return gr.update(label=model_id)
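For context on what was dropped: the old code force-installed flash-attn via subprocess at startup, requested attn_implementation="flash_attention_2", and fell back to the default attention implementation in a try/except. A lighter-weight alternative would be to request FlashAttention-2 only when the package is already importable, skipping the install entirely. A minimal sketch of that pattern, assuming a transformers version that accepts the attn_implementation kwarg; load_model is a hypothetical helper, not part of app.py:

import importlib.util
import logging

import torch
from transformers import AutoModelForCausalLM


def load_model(model_id: str, torch_dtype=torch.bfloat16):
    # Shared kwargs mirroring what app.py passes to from_pretrained.
    kwargs = dict(
        torch_dtype=torch_dtype,
        device_map="auto",
        trust_remote_code=True,
    )
    # Only request flash_attention_2 when the flash_attn package is
    # importable; otherwise transformers would raise at load time.
    if importlib.util.find_spec("flash_attn") is not None:
        kwargs["attn_implementation"] = "flash_attention_2"
    try:
        return AutoModelForCausalLM.from_pretrained(model_id, **kwargs).eval()
    except Exception as e:
        # Some architectures reject flash_attention_2 even when the package
        # is installed, so keep the fallback the old code had.
        logging.debug(f"Falling back to default attention for {model_id}: {e}")
        kwargs.pop("attn_implementation", None)
        return AutoModelForCausalLM.from_pretrained(model_id, **kwargs).eval()

This keeps startup cheap (no pip install in a subprocess) while still using FlashAttention-2 in environments where it happens to be available.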