Spaces:

Fourwheels2512
/

zero-forgetting-benchmarks

Sleeping

App Files Files Community

zero-forgetting-benchmarks / app.py

Fourwheels2512

Zero forgetting benchmark dashboard

736d089 21 days ago

raw

history blame contribute delete

12.3 kB

	"""ModelBrew AI — Zero Forgetting Benchmark Results Dashboard"""
	import gradio as gr
	import plotly.graph_objects as go

	# ── Brand colors ──
	BLUE = "#1F4E79"
	GREEN = "#4CAF50"
	RED = "#E53935"
	GOLD = "#F9A825"
	GRAY = "#757575"

	# ── Data ──
	SEEDS = ["Seed 0", "Seed 42", "Seed 1234"]
	NAIVE_FORGET = [38.1, 41.7, 49.0]
	MODELBREW_DRIFT = [-0.03, -0.10, -0.37]

	WALMART_DOMAINS = ["Customer Service", "Product Knowledge", "HR Policy", "Financial Analytics"]
	WALMART_GEN = [0.92, 0.94, 0.88, 0.83]
	WALMART_RET = [None, 0.83, 0.86, 0.82]

	SF_DOMAINS = ["CRM Ops", "Sales Ops", "Reporting", "Support", "Admin & Dev"]
	SF_GEN = [0.882, 0.897, 0.890, 0.885, 0.897]
	SF_RET = [None, 0.889, 0.891, 0.897, 0.907]
	SF_GN = [3.68, 2.15, 3.16, 2.53, 2.11]
	SF_LOSS = [1.33, 1.05, 1.24, 0.96, 0.66]

	DENTAL_DOMAINS = [f"Domain {i+1}" for i in range(8)]
	DENTAL_MB_GN = [3.8, 4.2, 5.1, 4.5, 5.5, 4.8, 6.1, 5.2]
	DENTAL_NAIVE_GN = [4.8, 5.6, 6.3, 6.9, 7.2, 8.1, 8.8, 9.4]


	def make_seed_chart():
	fig = go.Figure()
	fig.add_trace(go.Bar(
	name="Naive LoRA (forgetting)",
	x=SEEDS, y=NAIVE_FORGET,
	marker_color=RED,
	text=[f"+{v}%" for v in NAIVE_FORGET],
	textposition="outside",
	))
	fig.add_trace(go.Bar(
	name="ModelBrew (drift)",
	x=SEEDS, y=[abs(v) for v in MODELBREW_DRIFT],
	marker_color=GREEN,
	text=[f"{v}%" for v in MODELBREW_DRIFT],
	textposition="outside",
	))
	fig.update_layout(
	title="Benchmark 1: Multi-Seed Research — 5 Domains on Mistral-7B",
	yaxis_title="Knowledge Lost (%)",
	barmode="group",
	template="plotly_white",
	height=450,
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	)
	return fig


	def make_walmart_chart():
	fig = go.Figure()
	fig.add_trace(go.Bar(
	name="Gen BERTScore",
	x=WALMART_DOMAINS, y=WALMART_GEN,
	marker_color=BLUE,
	text=[f"{v:.2f}" for v in WALMART_GEN],
	textposition="outside",
	))
	ret_vals = [v if v else 0 for v in WALMART_RET]
	ret_text = [f"{v:.2f}" if v else "—" for v in WALMART_RET]
	fig.add_trace(go.Bar(
	name="Retention BERTScore",
	x=WALMART_DOMAINS, y=ret_vals,
	marker_color=GREEN,
	text=ret_text,
	textposition="outside",
	))
	fig.update_layout(
	title="Benchmark 2: Walmart Enterprise — 4 Domains on Mistral-7B",
	yaxis_title="BERTScore",
	yaxis_range=[0.5, 1.0],
	barmode="group",
	template="plotly_white",
	height=450,
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	)
	return fig


	def make_salesforce_chart():
	fig = go.Figure()
	fig.add_trace(go.Scatter(
	name="Gen BERTScore",
	x=SF_DOMAINS, y=SF_GEN,
	mode="lines+markers+text",
	marker=dict(size=10, color=BLUE),
	text=[f"{v:.3f}" for v in SF_GEN],
	textposition="top center",
	))
	ret_x = SF_DOMAINS[1:]
	ret_y = [v for v in SF_RET if v]
	fig.add_trace(go.Scatter(
	name="Retention BERTScore",
	x=ret_x, y=ret_y,
	mode="lines+markers+text",
	marker=dict(size=10, color=GREEN),
	text=[f"{v:.3f}" for v in ret_y],
	textposition="bottom center",
	line=dict(dash="dot"),
	))
	fig.update_layout(
	title="Benchmark 3: Salesforce Enterprise — 5 Domains, Positive Backward Transfer",
	yaxis_title="BERTScore",
	yaxis_range=[0.85, 0.92],
	template="plotly_white",
	height=450,
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	)
	return fig


	def make_salesforce_gn_chart():
	fig = go.Figure()
	fig.add_trace(go.Bar(
	name="Peak Gradient Norm",
	x=SF_DOMAINS, y=SF_GN,
	marker_color=BLUE,
	text=[f"{v:.2f}" for v in SF_GN],
	textposition="outside",
	))
	fig.add_hline(y=263, line_dash="dash", line_color=RED,
	annotation_text="Naive LoRA: 263 (crashed)", annotation_position="top left")
	fig.update_layout(
	title="Salesforce — Gradient Stability (Naive LoRA crashed at 263)",
	yaxis_title="Peak Gradient Norm",
	template="plotly_white",
	height=400,
	)
	return fig


	def make_dental_chart():
	fig = go.Figure()
	fig.add_trace(go.Scatter(
	name="Naive LoRA",
	x=DENTAL_DOMAINS, y=DENTAL_NAIVE_GN,
	mode="lines+markers",
	marker=dict(size=8, color=RED),
	line=dict(width=2),
	))
	fig.add_trace(go.Scatter(
	name="ModelBrew",
	x=DENTAL_DOMAINS, y=DENTAL_MB_GN,
	mode="lines+markers",
	marker=dict(size=8, color=GREEN),
	line=dict(width=2),
	))
	fig.update_layout(
	title="Benchmark 4: Dental Stress Test — 8 Sequential Domains",
	yaxis_title="Peak Gradient Norm",
	template="plotly_white",
	height=450,
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	)
	return fig


	def make_summary_chart():
	experiments = ["Research\n(5 domains)", "Walmart\n(4 domains)", "Salesforce\n(5 domains)", "Dental\n(8 domains)"]
	naive = [43.0, None, None, None]
	ours = [0.17, 0, 0, 0]
	fig = go.Figure()
	fig.add_trace(go.Bar(
	name="Naive LoRA Forgetting",
	x=["Research\n(5 domains)"], y=[43.0],
	marker_color=RED,
	text=["+43.0%"],
	textposition="outside",
	width=0.3,
	))
	fig.add_trace(go.Bar(
	name="ModelBrew Drift",
	x=experiments, y=[0.17, 0, 0, 0],
	marker_color=GREEN,
	text=["-0.17%", "Zero", "Zero\n(positive transfer)", "Zero"],
	textposition="outside",
	width=0.3,
	))
	fig.update_layout(
	title="Zero Forgetting Across All 4 Benchmarks",
	yaxis_title="Knowledge Lost (%)",
	barmode="group",
	template="plotly_white",
	height=450,
	legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	)
	return fig


	OVERVIEW_MD = """
	# Zero Forgetting in LLM Fine-Tuning

	Every fine-tuning run destroys what the model already knew. Train on medical, then legal — medical is gone.

	ModelBrew is a continual learning adapter (~0.1% additional parameters) that solves catastrophic forgetting. Train one model on domain after domain — it keeps everything.

	---

	### 4 Benchmarks on Mistral-7B. Zero Forgetting. Every Single One.

	\| Benchmark \| Domains \| Seeds \| Result \|
	\|-----------\|---------\|-------\|--------\|
	\| Research \| 5 (Medical → Legal → Financial → Code → Science) \| 3 \| -0.17% drift vs +43% naive forgetting \|
	\| Walmart \| 4 (Customer Service → Product → HR → Finance) \| 1 \| BERTScores 0.82–0.94 across all domains \|
	\| Salesforce \| 5 (CRM → Sales → Reporting → Support → Admin) \| 1 \| Positive backward transfer (0.889 → 0.907) \|
	\| Dental \| 8 sequential domains \| 2 \| Gradient norms stable, zero explosions \|

	- Spectral norm locked at 1.0 across every experiment
	- Naive LoRA crashed at step 43 with gradient norm 263. Ours: peak under 6
	- No replay buffers. No EWC. No knowledge distillation. No retraining from scratch.

	---

	### What This Means

	Right now every AI team in the world throws away learned knowledge every time they fine-tune. That's billions of dollars in wasted compute and a fundamental barrier to AI that actually builds on what it knows over time.

	- A hospital trains one model across radiology, pathology, cardiology — it keeps learning, never forgets
	- A legal AI learns new case law without losing old precedent
	- Models in developing countries accumulate knowledge across languages and domains on limited hardware

	---

	### What's Shipped

	- Live product — processing real training runs today
	- 196 automated tests — CI pipeline on GitHub Actions
	- US patent pending — provisional filed February 2026
	- 7 technical reports — from 50+ failed experiments to the working method
	- Free tier — try it right now, no credit card needed

	Google published Nested Learning at NeurIPS 2025. Meta has Sparse Memory Finetuning. Neither is available to use. This is.

	---

	[Try it live](https://mhc-finetune-saas-zrtokzlkbnue9zsk7jfgad.streamlit.app) \| [API](https://fourwheels2512--crma-finetune-fastapi-app.modal.run/docs) \| Patent Pending (US Provisional, Feb 2026)

	Kiran Nayudu — ModelBrew AI — fourwheels2512@gmail.com
	"""

	# ── Build app ──
	with gr.Blocks(
	title="ModelBrew AI — Zero Forgetting Benchmarks",
	) as demo:

	gr.Markdown("# ModelBrew AI — Zero Forgetting Benchmark Results")
	gr.Markdown("4 independent benchmarks on Mistral-7B. Zero catastrophic forgetting across all of them.")

	with gr.Tabs():
	with gr.Tab("Overview"):
	gr.Markdown(OVERVIEW_MD)
	gr.Plot(make_summary_chart)

	with gr.Tab("Research Benchmark"):
	gr.Markdown("""
	### Multi-Seed Research — 5 Domains, 3 Seeds
	Medical → Legal → Financial → Code → Science on Mistral-7B.
	Repeated across 3 random seeds to confirm reproducibility.

	Naive LoRA destroyed 38–49% of prior knowledge with every new domain.
	ModelBrew drifted less than 0.4%. The negative sign means the model actually improved on old domains.

	Naive LoRA crashed at step 43 with gradient norm 263.
	ModelBrew completed every run with peak gradient norm under 6. Spectral norm locked at 1.0.
	""")
	gr.Plot(make_seed_chart)

	with gr.Tab("Walmart Enterprise"):
	gr.Markdown("""
	### Walmart Enterprise — 4 Domains
	Customer Service → Product Knowledge → HR Policy → Financial Analytics.

	One model. Four enterprise domains. All retained.
	The final model answers questions across all four with BERTScores of 0.82–0.94.
	""")
	gr.Plot(make_walmart_chart)

	with gr.Tab("Salesforce Enterprise"):
	gr.Markdown("""
	### Salesforce Enterprise — 5 Domains, Cumulative Adapter
	CRM Operations → Sales Ops → Reporting & Analytics → Customer Support → Admin & Dev.

	Retention BERTScores went UP with each new domain — 0.889 → 0.891 → 0.897 → 0.907.
	The model gets better at old domains as it learns new ones. Positive backward transfer.

	Peak gradient norms stayed between 2.1 and 3.7. Zero gradient explosions.
	""")
	gr.Plot(make_salesforce_chart)
	gr.Plot(make_salesforce_gn_chart)

	with gr.Tab("Dental Stress Test"):
	gr.Markdown("""
	### Dental Stress Test — 8 Sequential Domains, 2 Seeds
	The longest chain we've tested. Eight sequential domains on Mistral-7B.

	Peak gradient norms stayed between 3.8 and 6.1 across all 8 domains.
	Naive LoRA gradient norms grew monotonically to 9.4.
	Spectral norm: 1.0 throughout. Zero crashes. Zero NaN losses.
	""")
	gr.Plot(make_dental_chart)

	with gr.Tab("Salesforce Details"):
	gr.Markdown("""
	### Salesforce — Full Per-Domain Breakdown

	\| Domain \| Training Loss \| Gen BERTScore \| Retention BERTScore \| Peak Grad Norm \|
	\|--------\|:---:\|:---:\|:---:\|:---:\|
	\| 1. CRM Operations \| 1.33 \| 0.882 \| — \| 3.68 \|
	\| 2. Sales Ops \| 1.05 \| 0.897 \| 0.889 \| 2.15 \|
	\| 3. Reporting & Analytics \| 1.24 \| 0.890 \| 0.891 \| 3.16 \|
	\| 4. Customer Support \| 0.96 \| 0.885 \| 0.897 \| 2.53 \|
	\| 5. Admin & Dev \| 0.66 \| 0.897 \| 0.907 \| 2.11 \|

	Key findings:
	- Retention BERTScores improved as domains accumulated — evidence of positive backward transfer
	- Training loss decreased across domains (1.33 → 0.66) — the model learns faster with more accumulated knowledge
	- Peak gradient norms stayed between 2.1–3.7 — zero gradient explosions, zero NaN losses
	- Final adapter answers questions from all 5 Salesforce domains
	""")

	gr.Markdown("---")
	gr.Markdown(
	"*ModelBrew AI — Patent Pending (US Provisional, Feb 2026) — "
	"[Try it live](https://mhc-finetune-saas-zrtokzlkbnue9zsk7jfgad.streamlit.app) — "
	"fourwheels2512@gmail.com*"
	)

	demo.launch(theme=gr.themes.Soft(primary_hue="blue"))