"""Streamlit tool that compares the tensor structures and tokenizer vocab
sizes of two Hugging Face models, caching results in a Firebase Realtime
Database through its REST API so slow model downloads happen only once.
"""

import difflib
import json
import os

import requests
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base URL of the Firebase Realtime Database used as a cache,
# e.g. "https://<project>.firebaseio.com" (no trailing slash).
FIREBASE_URL = os.getenv("FIREBASE_URL")

# Network timeout (seconds) for Firebase REST calls; without one a stalled
# request would hang the Streamlit app indefinitely.
_REQUEST_TIMEOUT = 30


def fetch_from_firebase(model_id, data_type):
    """Return the cached JSON payload for ``model_id`` under ``data_type``.

    Returns ``None`` when the entry is absent or the request fails.
    """
    response = requests.get(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json",
        timeout=_REQUEST_TIMEOUT,
    )
    if response.status_code == 200:
        return response.json()
    return None


def save_to_firebase(model_id, data, data_type):
    """Store ``data`` for ``model_id`` under ``data_type``.

    Returns True when Firebase acknowledges the write (HTTP 200).
    """
    response = requests.put(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json",
        data=json.dumps(data),
        timeout=_REQUEST_TIMEOUT,
    )
    return response.status_code == 200


def get_model_structure(model_id) -> list[str]:
    """Return one ``"param_name: shape"`` line per tensor in the model.

    Checks the Firebase cache first; on a miss the model weights are
    downloaded (CPU, bf16) and the computed listing is cached for next time.
    """
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    if struct_lines:  # cache hit; an absent/empty entry falls through
        return struct_lines
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
        trust_remote_code=True,
    )
    # f-string interpolation of v.shape equals the original str(v.shape).
    struct_lines = [f"{k}: {v.shape}" for k, v in model.state_dict().items()]
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines


def get_tokenizer_vocab_size(model_id) -> int:
    """Return the tokenizer vocab size for ``model_id``, using the cache.

    Uses ``is not None`` so a legitimately cached value of 0 is not
    mistaken for a cache miss (the original truthiness test re-fetched).
    """
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    if vocab_size is not None:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size


def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    """Return an ndiff iterator between the two structure listings."""
    return difflib.ndiff(struct1_lines, struct2_lines)


def display_diff(diff):
    """Render an ndiff as two aligned HTML columns.

    Returns ``(left_html, right_html, diff_found)`` where the HTML strings
    are ``<br>``-joined lines for model 1 and model 2 respectively.

    NOTE(review): the highlight markup and the join separators were
    destroyed in a paste of this file (the "- "/"+ " branches appended the
    bare line text, making them indistinguishable from the common branch).
    Red/green spans and "<br>" joins are reconstructed here — confirm
    against the intended styling.
    """
    left_lines = []
    right_lines = []
    diff_found = False
    for line in diff:
        if line.startswith("- "):
            # Present only in model 1: highlight left, blank placeholder
            # right so the columns stay line-aligned.
            left_lines.append(
                f'<span style="background-color:#ffdddd">{line[2:]}</span>'
            )
            right_lines.append("")
            diff_found = True
        elif line.startswith("+ "):
            # Present only in model 2.
            right_lines.append(
                f'<span style="background-color:#ddffdd">{line[2:]}</span>'
            )
            left_lines.append("")
            diff_found = True
        elif line.startswith(" "):
            # Common line: show on both sides.
            left_lines.append(line[2:])
            right_lines.append(line[2:])
        else:
            # Ignore ndiff "? " hint lines.
            pass
    left_html = "<br>".join(left_lines)
    right_html = "<br>".join(right_lines)
    return left_html, right_html, diff_found


# Set Streamlit page configuration to wide mode.
st.set_page_config(layout="wide")

# Apply custom CSS for wider layout.
# NOTE(review): the original <style> payload was lost in the paste; the
# markdown body survives as whitespace only — restore the original rules.
st.markdown(
    """ """,
    unsafe_allow_html=True,
)

st.title("Model Structure Comparison Tool")

model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")

if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        if model_id1 and model_id2:
            # Get model structures (cached in Firebase after first fetch).
            struct1 = get_model_structure(model_id1)
            struct2 = get_model_structure(model_id2)

            # Compare model structures side by side.
            diff = compare_structures(struct1, struct2)
            left_html, right_html, diff_found = display_diff(diff)

            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")

            # Adjust the ratio to make columns wider.
            col1, col2 = st.columns([1.5, 1.5])
            with col1:
                st.write(f"### Model 1: {model_id1}")
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)

            # Tokenizer verification.
            try:
                vocab_size1 = get_tokenizer_vocab_size(model_id1)
                vocab_size2 = get_tokenizer_vocab_size(model_id2)

                if vocab_size1 == vocab_size2:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")

                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
            except Exception as e:
                st.error(f"Error loading tokenizers: {e}")
        else:
            st.error("Please enter both model IDs.")