lingoly-too

Running

lingoly-too / data_loader.py

Jude Khouja

Update number of models

38b1155 8 months ago

15.3 kB

	import pandas as pd

	def load_data(path="leaderboard.csv") -> pd.DataFrame:
	    """Load the leaderboard table and discard incomplete rows.

	    Args:
	        path: Location of the CSV file, handed straight to
	            ``pandas.read_csv``. Defaults to ``"leaderboard.csv"`` so
	            existing zero-argument callers keep their behaviour.

	    Returns:
	        A DataFrame holding only the rows that have no missing values
	        (``DataFrame.dropna()`` with its default settings).
	    """
	    return pd.read_csv(path).dropna()

	# Module-level leaderboard table, loaded once at import time.
	df = load_data()
	# Strip whitespace *before* de-duplicating so that names differing only by
	# surrounding spaces collapse into a single entry. dict.fromkeys keeps the
	# first-appearance order, matching what Series.unique() returned before.
	MODELS = list(dict.fromkeys(name.strip() for name in df["Model"]))

	# <style> block declaring CSS custom properties (backgrounds, text colours,
	# borders, accent, gradients, shadow) on :root, once for the dark and once
	# for the light `prefers-color-scheme` media query. HEADER_CONTENT prepends
	# this string to its own markup. It is a plain (non-f) string, so the CSS
	# braces are written singly here.
	COMMON = """
	<style>
	@media (prefers-color-scheme: dark) {
	:root {
	--bg-primary: #0B0B19;
	--bg-secondary: rgba(19, 19, 37, 0.4);
	--bg-hover: rgba(30, 30, 45, 0.95);
	--text-primary: #ffffff;
	--text-secondary: #e2e8f0;
	--text-tertiary: #e2e8f0;
	--card-bg: rgba(17, 17, 27, 0.4);
	--border-color: rgba(31, 41, 55, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--accent-color: #ffffff;
	--accent-bg: rgba(79, 70, 229, 0.1);
	--blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);;
	--orange-gradient: linear-gradient(45deg, #E05205, #FAD8D2);
	--green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
	--shadow-color: rgba(0, 0, 0, 0.2);
	}
	}

	@media (prefers-color-scheme: light) {
	:root {
	--bg-primary: #ffffff;
	--bg-secondary: rgba(243, 244, 246, 0.4);
	--bg-hover: rgba(229, 231, 235, 0.95);
	--text-primary: #1F2937;
	--text-secondary: #4B5563;
	--text-tertiary: #6B7280;
	--card-bg: rgba(249, 250, 251, 0.4);
	--border-color: rgba(209, 213, 219, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--accent-color: #4F46E5;
	--accent-bg: rgba(79, 70, 229, 0.1);
	--blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);;
	--orange-gradient: linear-gradient(45deg, #E05205, #FF8340);
	--green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
	--shadow-color: rgba(0, 0, 0, 0.1);
	}
	}
	</style>
	"""


	# External resources that the header's action buttons point at.
	PAPER_LINK = "https://arxiv.org/abs/2503.02972"
	CODE_LINK = "https://github.com/jkhouja/L2"
	BLOG_LINK = (
	    "https://www.lesswrong.com/posts/pbt8GYpdip7NkuwGy/"
	    "are-recent-llms-better-at-reasoning-or-better-at-memorizing"
	)
	DATASET_LINK = "https://huggingface.co/datasets/jkhouja/LingOly-TOO"
	# mail.google.com URL carrying percent-encoded `to`, `su` and `body` query
	# parameters; the literal is split at parameter / paragraph boundaries.
	ADD_MODEL_LINK = (
	    "https://mail.google.com/mail/?view=cm&fs=1"
	    "&to=jude.khouja@oii.ox.ac.uk"
	    "&su=Get%20Model%20Added%20to%20Leaderboard"
	    "&body=Hi%20there%2C%0A%0A"
	    "I%20would%20like%20to%20add%20my%20model%20to%20the%20Lingoly-TOO%20Leaderboard."
	    "%0A%0AModel%20Name%3A%0AModel%20URL%3A"
	    "%0A%0ABest%20regards"
	)

	# Page header: the shared COMMON <style> block followed by header-specific
	# CSS and the title / description / action-button markup. The second part
	# is an f-string, so literal CSS braces are doubled ({{ }}) and the five
	# *_LINK constants are interpolated into the button hrefs.
	HEADER_CONTENT = (
	COMMON
	+ f"""
	<style>

	.header-wrapper {{
	position: relative;
	background: var(--bg-primary);
	border-radius: 16px;
	margin-bottom: 0;
	transition: all 0.3s ease;
	}}

	.header-content {{
	max-width: 72rem;
	margin: 0 auto;
	}}

	.title-section {{
	position: relative;
	display: flex;
	align-items: center;
	justify-content: center;
	margin-bottom: 3rem;
	}}

	.title-gradient {{
	font-size: 5rem;
	font-weight: 800;
	line-height: 1.25;
	background: var(--orange-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 0.5rem;
	}}

	.title-image {{
	position: absolute;
	top: 30px;
	left: 30px;
	width: 100px;
	height: 100px;
	/* To make it look ok on dark mode */
	background-color: #ffffffd0;
	padding: 10px;
	border-radius: 6px;
	}}

	.subtitle-white {{
	font-size: 5rem;
	font-weight: 800;
	line-height: 1.1;
	color: var(--text-primary);
	margin-bottom: 3rem;
	transition: color 0.3s ease;
	}}

	.description {{
	color: var(--text-secondary);
	font-size: 1.25rem;
	line-height: 1.75;
	max-width: 800px;
	margin: 0 auto;
	text-align: center;
	transition: color 0.3s ease;
	}}

	.highlight-question {{
	background: var(--blue-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	display: block;
	margin-top: 1rem;
	font-size: 1.5rem;
	font-weight: 500;
	}}

	.metrics-grid {{
	display: grid;
	grid-template-columns: repeat(3, 1fr);
	gap: 1.5rem;
	margin-top: 4rem;
	}}

	.metric-card {{
	background: var(--bg-secondary);
	border: 1px solid var(--border-color);
	text-align: center;
	border-radius: 1rem;
	padding: 2rem;
	transition: all 0.3s ease;
	align-items: center;
	}}

	.metric-card:hover {{
	transform: translateY(-5px);
	border-color: var(--border-hover);
	box-shadow: 0 4px 20px var(--shadow-color);
	}}

	.metric-number {{
	font-size: 4rem;
	font-weight: 800;
	margin-bottom: 1rem;
	}}

	.metric-blue {{
	background: var(--blue-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}}

	.metric-purple {{
	background: var(--orange-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}}

	.metric-green {{
	background: var(--green-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}}

	.metric-label {{
	color: var(--text-secondary);
	font-size: 1.5rem;
	margin-bottom: 1.5rem;
	transition: color 0.3s ease;
	}}

	.metric-detail {{
	font-size: 1.125rem;
	line-height: 1.75;
	margin-top: 0.5rem;
	transition: color 0.3s ease;
	}}

	.metric-detail.primary {{
	color: var(--accent-color);
	}}

	.metric-detail.secondary {{
	color: var(--text-secondary);
	}}

	.actions {{
	display: flex;
	gap: 1rem;
	justify-content: center;
	margin-top: 3rem;
	}}

	.action-button {{
	display: flex;
	align-items: center;
	gap: 0.5rem;
	padding: 0.75rem 1.5rem;
	background: var(--bg-secondary);
	border: 1px solid var(--border-color);
	border-radius: 100px;
	color: var(--text-primary) !important;
	text-decoration: none !important;
	font-size: 0.95rem;
	transition: all 0.3s ease;
	}}

	.action-button:hover {{
	transform: translateY(-2px);
	border-color: var(--accent-color);
	background: var(--accent-bg);
	}}

	@media (max-width: 1024px) {{
	.title-image {{
	top: 20px;
	left: 20px;
	width: 80px;
	height: 80px;
	}}
	.title-gradient, .subtitle-white {{
	font-size: 3rem;
	}}
	}}

	@media (max-width: 620px) {{
	.title-image {{
	position: relative;
	margin-top: -30px !important;
	margin-bottom: 20px !important;
	top: 0;
	left: 0;
	}}
	}}
	</style>

	<div class="header-wrapper">
	<div class="header-content">
	<div class="title-section">
	<div class="title-gradient">LingOly-TOO</div>
	</div>

	<div class="description">
	LingOly-TOO (L2) is a challenging linguistics reasoning benchmark designed to counteract answering without reasoning (e.g. by guessing or memorizing answers).
	We accomplish this by permuting <b>Ling</b>uistics <b>Oly</b>mpiad problems with <b>T</b>emplates and <b>O</b>rthographic <b>O</b>bfuscations. By rewriting (obfuscating) parts of questions and answers, the chance of benchmark leakage in training data is minimized.
	<div class="highlight-question">
	"How do top LLMs reason on unseen linguistic questions?"
	</div>
	</div>
	</div>

	<div class="actions">
	<a href="{PAPER_LINK}" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
	<line x1="8" y1="12" x2="16" y2="12"/>
	</svg>
	Paper
	</a>
	<a href="{CODE_LINK}" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
	</svg>
	Code
	</a>
	<a href="{BLOG_LINK}" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
	<polyline points="7 10 12 15 17 10"/>
	<line x1="12" y1="15" x2="12" y2="3"/>
	</svg>
	Blog
	</a>
	<a href="{DATASET_LINK}" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
	<polyline points="7 10 12 15 17 10"/>
	<line x1="12" y1="15" x2="12" y2="3"/>
	</svg>
	Dataset
	</a>
	<a href="{ADD_MODEL_LINK}" class="action-button" target="_blank" rel="noopener noreferrer">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M19 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2z"/>
	<line x1="12" y1="8" x2="12" y2="16"/>
	<line x1="8" y1="12" x2="16" y2="12"/>
	</svg>
	Add Your Model
	</a>
	</div>
	</div>
	"""
	)

	# Static HTML for the three summary cards (models, problems, questions).
	# The figures are literal text in this string, not computed from the
	# leaderboard data, so they have to be edited by hand when the data changes.
	# The class names used here (metrics-grid, metric-card, metric-number, ...)
	# are styled by the <style> block inside HEADER_CONTENT.
	CARDS = """ <div class="metrics-grid">
	<div class="metric-card">
	<div class="metric-number metric-blue">13</div>
	<div class="metric-label">Total Models</div>
	<div class="metric-detail primary">6 Reasoning Models</div>
	<div class="metric-detail primary">4 Open Source Models</div>
	</div>

	<div class="metric-card">
	<div class="metric-number metric-purple">82</div>
	<div class="metric-label">Linguistics Problems</div>
	<div class="metric-detail primary">6 Permutations per problem</div>
	<div class="metric-detail primary">Problems from Low-resource Languages</div>
	</div>

	<div class="metric-card">
	<div class="metric-number metric-green">1.2k</div>
	<div class="metric-label">Total Questions</div>
	<div class="metric-detail primary">Includes Match-Up, Multiple Choice and Completion</div>
	</div>
	</div>"""

	# Citation section: a <style> block (its own :root colour variables plus
	# table / content / section-title rules) followed by a "Citation" heading
	# and the BibTeX entry for the paper. This is a plain (non-f) string, so the
	# CSS and BibTeX braces are literal. The heading's opening and closing tags
	# must both be <h1> so the markup stays well-formed.
	METHODOLOGY = """
	<style>
	@media (prefers-color-scheme: dark) {
	:root {
	--bg-primary: #0B0B19;
	--bg-secondary: rgba(19, 19, 37, 0.4);
	--text-primary: #ffffff;
	--text-secondary: #94A3B8;
	--border-primary: rgba(31, 41, 55, 0.5);
	--accent-blue: #60A5FA;
	--accent-purple: #A78BFA;
	--card-hover-bg: rgba(79, 70, 229, 0.1);
	--shadow-color: rgba(79, 70, 229, 0.1);
	}
	}

	@media (prefers-color-scheme: light) {
	:root {
	--bg-primary: #ffffff;
	--bg-secondary: rgba(243, 244, 246, 0.4);
	--text-primary: #111827;
	--text-secondary: #4B5563;
	--border-primary: rgba(209, 213, 219, 0.5);
	--accent-blue: #3B82F6;
	--accent-purple: #8B5CF6;
	--card-hover-bg: rgba(243, 244, 246, 0.8);
	--shadow-color: rgba(0, 0, 0, 0.1);
	}
	}

	.dataset-table {
	width: 100%;
	border-collapse: separate;
	border-spacing: 0;
	margin: 2rem 0;
	background: var(--bg-secondary);
	border-radius: 1rem;
	overflow: hidden;
	box-shadow: 0 4px 20px var(--shadow-color);
	}

	.dataset-table thead {
	background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
	}

	.dataset-table th {
	padding: 1.25rem 1rem;
	text-align: left;
	color: white;
	font-weight: 600;
	font-size: 1rem;
	}

	.dataset-table td {
	padding: 1rem;
	border-bottom: 1px solid var(--border-primary);
	color: var(--text-secondary);
	transition: all 0.2s ease;
	}

	.dataset-table tbody tr:hover td {
	background: var(--card-hover-bg);
	color: var(--text-primary);
	}

	.methodology-content {
	max-width: 1200px;
	margin: 0 auto;
	padding: 2rem;
	color: var(--text-secondary);
	line-height: 1.7;
	font-size: 1rem;
	}

	.section-title {
	font-size: 2.5rem;
	font-weight: 700;
	margin: 3rem 0 1.5rem;
	color: var(--text-primary);
	background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	letter-spacing: -0.02em;
	}
	</style>

	<div class="section-divider"></div>
	<h1 class="section-title">Citation</h1>
	<div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">
	@misc{khouja2025lingolytoodisentanglingmemorisationreasoning,
	title={LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic Templatisation and Orthographic Obfuscation},
	author={Jude Khouja and Karolina Korgul and Simi Hellsten
	and Lingyi Yang and Vlad Neacsu and Harry Mayne and Ryan Kearns and Andrew Bean and Adam Mahdi},
	year={2025},
	eprint={2503.02972},
	archivePrefix={arXiv},
	primaryClass={cs.CL},
	url={https://arxiv.org/abs/2503.02972},
	}
	</div>

	"""

	# HTML for a "Key insights" section (heading plus one summary paragraph).
	# NOTE(review): the name suggests this markup is not rendered, and nothing
	# in this file references it — confirm against the importing modules before
	# removing it.
	UNUSED = """
	<!-- Insights Section -->
	<h1 class="section-title">Key insights</h1>
	<p>
	We use orthographic templatisation on Linguistics Olympiad problems to create obfuscated variants
	that maintain the same reasoning steps. Through extensive experiments, we show that obfuscation
	reduces measurement bias from data exposure and provides reasoning estimates that correlate with
	the ability to solve linguistic reasoning problems. Additionally, we find that state-of-the-art
	models exhibit inconsistent reasoning abilities and that simple fine-tuning does not necessarily
	equip models with context-free and robust problem-solving skills. This work establishes a reasoning
	measure that is resilient to data exposure effects and supports ongoing efforts to fully understand
	response generation in advanced models.
	</p>
	"""