HawkClaws committed on
Commit
0371f16
1 Parent(s): d2f7ba3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -13
app.py CHANGED
@@ -8,45 +8,49 @@ import json
8
 
9
  FIREBASE_URL = os.getenv("FIREBASE_URL")
10
 
 
11
  def fetch_from_firebase(model_id, data_type):
12
  response = requests.get(f"{FIREBASE_URL}/{data_type}/{model_id}.json")
13
  if response.status_code == 200:
14
  return response.json()
15
  return None
16
 
 
17
  def save_to_firebase(model_id, data, data_type):
18
  response = requests.put(
19
  f"{FIREBASE_URL}/{data_type}/{model_id}.json", data=json.dumps(data)
20
  )
21
  return response.status_code == 200
22
 
 
23
  def get_model_structure(model_id) -> list[str]:
24
  struct_lines = fetch_from_firebase(model_id, "model_structures")
25
  if struct_lines:
26
  return struct_lines
27
  model = AutoModelForCausalLM.from_pretrained(
28
- model_id,
29
- torch_dtype=torch.bfloat16,
30
- device_map="cpu",
31
  )
32
  structure = {k: str(v.shape) for k, v in model.state_dict().items()}
33
  struct_lines = [f"{k}: {v}" for k, v in structure.items()]
34
  save_to_firebase(model_id, struct_lines, "model_structures")
35
  return struct_lines
36
 
 
37
  def get_tokenizer_vocab_size(model_id) -> int:
38
  vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
39
  if vocab_size:
40
  return vocab_size
41
- tokenizer = AutoTokenizer.from_pretrained(model_id)
42
  vocab_size = tokenizer.vocab_size
43
  save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
44
  return vocab_size
45
 
 
46
  def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
47
  diff = difflib.ndiff(struct1_lines, struct2_lines)
48
  return diff
49
 
 
50
  def display_diff(diff):
51
  left_lines = []
52
  right_lines = []
@@ -76,6 +80,7 @@ def display_diff(diff):
76
 
77
  return left_html, right_html, diff_found
78
 
 
79
  # Set Streamlit page configuration to wide mode
80
  st.set_page_config(layout="wide")
81
 
@@ -101,21 +106,23 @@ model_id1 = st.text_input("Enter the first HuggingFace Model ID")
101
  model_id2 = st.text_input("Enter the second HuggingFace Model ID")
102
 
103
  if st.button("Compare Models"):
104
- with st.spinner('Comparing models and loading tokenizers...'):
105
  if model_id1 and model_id2:
106
  # Get model structures
107
  struct1 = get_model_structure(model_id1)
108
  struct2 = get_model_structure(model_id2)
109
-
110
  # Compare model structures
111
  diff = compare_structures(struct1, struct2)
112
  left_html, right_html, diff_found = display_diff(diff)
113
-
114
  st.write("### Comparison Result")
115
  if not diff_found:
116
  st.success("The model structures are identical.")
117
-
118
- col1, col2 = st.columns([1.5, 1.5]) # Adjust the ratio to make columns wider
 
 
119
 
120
  with col1:
121
  st.write(f"### Model 1: {model_id1}")
@@ -124,20 +131,20 @@ if st.button("Compare Models"):
124
  with col2:
125
  st.write(f"### Model 2: {model_id2}")
126
  st.markdown(right_html, unsafe_allow_html=True)
127
-
128
  # Tokenizer verification
129
  try:
130
  vocab_size1 = get_tokenizer_vocab_size(model_id1)
131
  vocab_size2 = get_tokenizer_vocab_size(model_id2)
132
-
133
  if vocab_size1 == vocab_size2:
134
  st.success("The tokenizer vocab sizes are identical.")
135
  else:
136
  st.warning("The tokenizer vocab sizes are different.")
137
-
138
  st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
139
  st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
140
-
141
  except Exception as e:
142
  st.error(f"Error loading tokenizers: {e}")
143
  else:
 
8
 
9
  FIREBASE_URL = os.getenv("FIREBASE_URL")
10
 
11
+
12
  def fetch_from_firebase(model_id, data_type):
13
  response = requests.get(f"{FIREBASE_URL}/{data_type}/{model_id}.json")
14
  if response.status_code == 200:
15
  return response.json()
16
  return None
17
 
18
+
19
  def save_to_firebase(model_id, data, data_type):
20
  response = requests.put(
21
  f"{FIREBASE_URL}/{data_type}/{model_id}.json", data=json.dumps(data)
22
  )
23
  return response.status_code == 200
24
 
25
+
26
  def get_model_structure(model_id) -> list[str]:
27
  struct_lines = fetch_from_firebase(model_id, "model_structures")
28
  if struct_lines:
29
  return struct_lines
30
  model = AutoModelForCausalLM.from_pretrained(
31
+ model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
 
 
32
  )
33
  structure = {k: str(v.shape) for k, v in model.state_dict().items()}
34
  struct_lines = [f"{k}: {v}" for k, v in structure.items()]
35
  save_to_firebase(model_id, struct_lines, "model_structures")
36
  return struct_lines
37
 
38
+
39
  def get_tokenizer_vocab_size(model_id) -> int:
40
  vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
41
  if vocab_size:
42
  return vocab_size
43
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
44
  vocab_size = tokenizer.vocab_size
45
  save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
46
  return vocab_size
47
 
48
+
49
  def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
50
  diff = difflib.ndiff(struct1_lines, struct2_lines)
51
  return diff
52
 
53
+
54
  def display_diff(diff):
55
  left_lines = []
56
  right_lines = []
 
80
 
81
  return left_html, right_html, diff_found
82
 
83
+
84
  # Set Streamlit page configuration to wide mode
85
  st.set_page_config(layout="wide")
86
 
 
106
  model_id2 = st.text_input("Enter the second HuggingFace Model ID")
107
 
108
  if st.button("Compare Models"):
109
+ with st.spinner("Comparing models and loading tokenizers..."):
110
  if model_id1 and model_id2:
111
  # Get model structures
112
  struct1 = get_model_structure(model_id1)
113
  struct2 = get_model_structure(model_id2)
114
+
115
  # Compare model structures
116
  diff = compare_structures(struct1, struct2)
117
  left_html, right_html, diff_found = display_diff(diff)
118
+
119
  st.write("### Comparison Result")
120
  if not diff_found:
121
  st.success("The model structures are identical.")
122
+
123
+ col1, col2 = st.columns(
124
+ [1.5, 1.5]
125
+ ) # Adjust the ratio to make columns wider
126
 
127
  with col1:
128
  st.write(f"### Model 1: {model_id1}")
 
131
  with col2:
132
  st.write(f"### Model 2: {model_id2}")
133
  st.markdown(right_html, unsafe_allow_html=True)
134
+
135
  # Tokenizer verification
136
  try:
137
  vocab_size1 = get_tokenizer_vocab_size(model_id1)
138
  vocab_size2 = get_tokenizer_vocab_size(model_id2)
139
+
140
  if vocab_size1 == vocab_size2:
141
  st.success("The tokenizer vocab sizes are identical.")
142
  else:
143
  st.warning("The tokenizer vocab sizes are different.")
144
+
145
  st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
146
  st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
147
+
148
  except Exception as e:
149
  st.error(f"Error loading tokenizers: {e}")
150
  else: