MohamedRashad committed on
Commit
a5a26a2
1 Parent(s): 2f1457b

chore: Refactor model ID handling in app.py and update requirements.txt

Files changed (2)
  1. app.py +19 -12
  2. requirements.txt +6 -5
app.py CHANGED
@@ -1,10 +1,13 @@
+import os
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 import gradio as gr
 from threading import Thread
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+# import subprocess
+# subprocess.run('pip install -U flash-attn transformers sentencepiece', shell=True)
 
 models_available = [
     "MohamedRashad/Arabic-Orpo-Llama-3-8B-Instruct",
@@ -20,6 +23,9 @@ models_available = [
 
 tokenizer_a, model_a = None, None
 tokenizer_b, model_b = None, None
+torch_dtype = torch.bfloat16
+attn_implementation = "flash_attention_2"
+# attn_implementation = None
 
 def load_model_a(model_id):
     global tokenizer_a, model_a
@@ -29,19 +35,20 @@ def load_model_a(model_id):
     try:
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch_dtype,
             device_map="auto",
-            attn_implementation="flash_attention_2",
+            attn_implementation=attn_implementation,
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_a = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch_dtype,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
+    model_a.gradient_checkpointing_enable()
     return gr.update(label=model_id)
 
 def load_model_b(model_id):
@@ -52,19 +59,20 @@ def load_model_b(model_id):
     try:
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch_dtype,
             device_map="auto",
-            attn_implementation="flash_attention_2",
+            attn_implementation=attn_implementation,
             trust_remote_code=True,
         ).eval()
     except:
         print(f"Using default attention implementation in {model_id}")
         model_b = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch_dtype,
             device_map="auto",
             trust_remote_code=True,
         ).eval()
+    model_b.gradient_checkpointing_enable()
     return gr.update(label=model_id)
 
 @spaces.GPU()
@@ -105,8 +113,7 @@ def generate_both(system_prompt, input_text, chatbot_a, chatbot_b, max_new_token
             streamer=text_streamer_a,
             max_new_tokens=max_new_tokens,
             pad_token_id=tokenizer_a.eos_token_id,
-            do_sample=False,
-            # do_sample=True if temperature > 0 else False,
+            do_sample=True,
             temperature=temperature,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
@@ -116,7 +123,7 @@ def generate_both(system_prompt, input_text, chatbot_a, chatbot_b, max_new_token
             streamer=text_streamer_b,
             max_new_tokens=max_new_tokens,
             pad_token_id=tokenizer_b.eos_token_id,
-            do_sample=False,
+            do_sample=True,
             temperature=temperature,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
@@ -168,7 +175,7 @@ arena_notes = """Important Notes:
 - Sometimes an error may occur when generating the response, in this case, please try again.
 """
 
-with gr.Blocks(title="Arabic-ORPO-Llama3") as demo:
+with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML("<center><h1>Arabic Chatbot Comparison</h1></center>")
         gr.Markdown(arena_notes)
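
The net effect of the app.py changes is that both loaders now share module-level torch_dtype and attn_implementation settings, try the flash-attn backend first, and fall back to the default attention implementation if that fails. A minimal sketch of this pattern outside the Gradio app (the generic load_model helper and the example model ID at the end are illustrative, not part of the commit):

import torch
from transformers import AutoModelForCausalLM

torch_dtype = torch.bfloat16
attn_implementation = "flash_attention_2"  # set to None to force the default backend

def load_model(model_id):
    # Try the flash-attn backend first; fall back to the stock implementation
    # if flash-attn is missing or unsupported on the current GPU.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            device_map="auto",
            attn_implementation=attn_implementation,
            trust_remote_code=True,
        ).eval()
    except Exception:
        print(f"Using default attention implementation in {model_id}")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            device_map="auto",
            trust_remote_code=True,
        ).eval()
    model.gradient_checkpointing_enable()  # as in the commit; only relevant when training
    return model

# Illustrative usage with one of the models listed in app.py:
# model = load_model("MohamedRashad/Arabic-Orpo-Llama-3-8B-Instruct")
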
requirements.txt CHANGED
@@ -1,5 +1,6 @@
-transformers
-torch
-accelerate
-spaces
-sentencepiece
+transformers==4.44.1
+torch==2.4.0
+accelerate==0.33.0
+sentencepiece==0.2.0
+flash-attn==2.6.3
+spaces
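
A note on the pinned requirements: flash-attn compiles against an already-installed torch, so outside a prebuilt Spaces image it is usually installed after torch and with build isolation disabled (the same flag the removed subprocess call used). A typical sequence, illustrative rather than part of the commit:

pip install torch==2.4.0
pip install flash-attn==2.6.3 --no-build-isolation
pip install -r requirements.txt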