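# app.py: Streamlit app that compares the parameter structures and tokenizer
# vocab sizes of two Hugging Face models, caching results in a Firebase
# Realtime Database.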
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import difflib
import requests
import os
import json
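# Base URL of the Firebase Realtime Database used as a cache (read from the environment).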
FIREBASE_URL = os.getenv("FIREBASE_URL")
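# Read cached data for a model from Firebase; returns None when the request fails
# or nothing is cached.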
def fetch_from_firebase(model_id, data_type):
    response = requests.get(f"{FIREBASE_URL}/{data_type}/{model_id}.json")
    if response.status_code == 200:
        return response.json()
    return None
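# Write data for a model to Firebase; returns True if the request succeeded.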
def save_to_firebase(model_id, data, data_type):
    response = requests.put(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json", data=json.dumps(data)
    )
    return response.status_code == 200
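# Return the model's state_dict entries as "name: shape" lines, loading the model
# on CPU only when the structure is not already cached.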
def get_model_structure(model_id) -> list[str]:
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    if struct_lines:
        return struct_lines
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    structure = {k: str(v.shape) for k, v in model.state_dict().items()}
    struct_lines = [f"{k}: {v}" for k, v in structure.items()]
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines
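# Return the tokenizer's vocab size, loading the tokenizer only on a cache miss.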
def get_tokenizer_vocab_size(model_id) -> int:
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    if vocab_size:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size
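# Produce a line-by-line diff of the two structure listings using difflib.ndiff.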
def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    diff = difflib.ndiff(struct1_lines, struct2_lines)
    return diff
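# Render the ndiff output as two aligned HTML columns: removals highlighted in red,
# additions in green.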
def display_diff(diff):
    left_lines = []
    right_lines = []
    diff_found = False
    for line in diff:
        if line.startswith("- "):
            # Line only present in the first model: highlight in red on the left.
            left_lines.append(
                f'<span style="background-color: #ffdddd;">{line[2:]}</span>'
            )
            right_lines.append("")
            diff_found = True
        elif line.startswith("+ "):
            # Line only present in the second model: highlight in green on the right.
            right_lines.append(
                f'<span style="background-color: #ddffdd;">{line[2:]}</span>'
            )
            left_lines.append("")
            diff_found = True
        elif line.startswith("  "):
            # Unchanged line: show it in both columns.
            left_lines.append(line[2:])
            right_lines.append(line[2:])
        else:
            # Skip the "? " hint lines that difflib.ndiff emits.
            pass
    left_html = "<br>".join(left_lines)
    right_html = "<br>".join(right_lines)
    return left_html, right_html, diff_found
# Set Streamlit page configuration to wide mode
st.set_page_config(layout="wide")
# Apply custom CSS for wider layout
st.markdown(
"""
<style>
.reportview-container .main .block-container {
max-width: 100%;
padding-left: 10%;
padding-right: 10%;
}
.stMarkdown {
white-space: pre-wrap;
}
</style>
""",
unsafe_allow_html=True,
)
st.title("Model Structure Comparison Tool")
model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")
if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        if model_id1 and model_id2:
            # Get model structures
            struct1 = get_model_structure(model_id1)
            struct2 = get_model_structure(model_id2)

            # Compare model structures
            diff = compare_structures(struct1, struct2)
            left_html, right_html, diff_found = display_diff(diff)

            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")

            # Adjust the ratio to make columns wider
            col1, col2 = st.columns([1.5, 1.5])
            with col1:
                st.write(f"### Model 1: {model_id1}")
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)

            # Tokenizer verification
            try:
                vocab_size1 = get_tokenizer_vocab_size(model_id1)
                vocab_size2 = get_tokenizer_vocab_size(model_id2)
                if vocab_size1 == vocab_size2:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")
                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
            except Exception as e:
                st.error(f"Error loading tokenizers: {e}")
        else:
            st.error("Please enter both model IDs.")