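# app.py: Streamlit app that compares the parameter structures and tokenizer
# vocab sizes of two Hugging Face models, caching results in a Firebase
# Realtime Database.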
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import difflib
import requests
import os
import json
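# Base URL of the Firebase Realtime Database used as a cache (read from the environment).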
FIREBASE_URL = os.getenv("FIREBASE_URL")
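# Read cached data for a model from Firebase; returns None when the request fails
# or nothing is cached.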
def fetch_from_firebase(model_id, data_type):
    response = requests.get(f"{FIREBASE_URL}/{data_type}/{model_id}.json")
    if response.status_code == 200:
        return response.json()
    return None
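# Write data for a model to Firebase; returns True if the request succeeded.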
def save_to_firebase(model_id, data, data_type):
    response = requests.put(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json", data=json.dumps(data)
    )
    return response.status_code == 200
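# Return the model's state_dict entries as "name: shape" lines, loading the model
# on CPU only when the structure is not already cached.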
def get_model_structure(model_id) -> list[str]:
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    if struct_lines:
        return struct_lines
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    structure = {k: str(v.shape) for k, v in model.state_dict().items()}
    struct_lines = [f"{k}: {v}" for k, v in structure.items()]
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines
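# Return the tokenizer's vocab size, loading the tokenizer only on a cache miss.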
def get_tokenizer_vocab_size(model_id) -> int:
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    if vocab_size:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size
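# Produce a line-by-line diff of the two structure listings using difflib.ndiff.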
def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    diff = difflib.ndiff(struct1_lines, struct2_lines)
    return diff
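# Render the ndiff output as two aligned HTML columns: removals highlighted in red,
# additions in green.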
def display_diff(diff):
    left_lines = []
    right_lines = []
    diff_found = False
    for line in diff:
        if line.startswith("- "):
            # Line only present in the first model: highlight in red on the left.
            left_lines.append(
                f'<span style="background-color: #ffdddd;">{line[2:]}</span>'
            )
            right_lines.append("")
            diff_found = True
        elif line.startswith("+ "):
            # Line only present in the second model: highlight in green on the right.
            right_lines.append(
                f'<span style="background-color: #ddffdd;">{line[2:]}</span>'
            )
            left_lines.append("")
            diff_found = True
        elif line.startswith("  "):
            # Unchanged line: show it in both columns.
            left_lines.append(line[2:])
            right_lines.append(line[2:])
        else:
            # Skip the "? " hint lines that difflib.ndiff emits.
            pass
    left_html = "<br>".join(left_lines)
    right_html = "<br>".join(right_lines)
    return left_html, right_html, diff_found
# Set Streamlit page configuration to wide mode
st.set_page_config(layout="wide")
# Apply custom CSS for wider layout
st.markdown(
"""
<style>
.reportview-container .main .block-container {
max-width: 100%;
padding-left: 10%;
padding-right: 10%;
}
.stMarkdown {
white-space: pre-wrap;
}
</style>
""",
unsafe_allow_html=True,
)
st.title("Model Structure Comparison Tool")
model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")
if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        if model_id1 and model_id2:
            # Get model structures
            struct1 = get_model_structure(model_id1)
            struct2 = get_model_structure(model_id2)

            # Compare model structures
            diff = compare_structures(struct1, struct2)
            left_html, right_html, diff_found = display_diff(diff)

            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")

            # Adjust the ratio to make columns wider
            col1, col2 = st.columns([1.5, 1.5])
            with col1:
                st.write(f"### Model 1: {model_id1}")
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)

            # Tokenizer verification
            try:
                vocab_size1 = get_tokenizer_vocab_size(model_id1)
                vocab_size2 = get_tokenizer_vocab_size(model_id2)
                if vocab_size1 == vocab_size2:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")
                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
            except Exception as e:
                st.error(f"Error loading tokenizers: {e}")
        else:
            st.error("Please enter both model IDs.")