HuggingFaceFW/fineweb-edu
Viewer • Updated • 3.5B • 499k • 1.14k
Task: Text-Generation
Total training time: 38 hours
Inputs: text
Outputs: text
Params: 1.33M
Final Loss: 3.024
Important Benchmark Scores:
1. ARC Easy - 30.13%
2. BLiMP - 66.20%
3. HellaSwag - 27.58%
4. ArithMark-2.0 - 27.12%
Framework: PyTorch, transformers
Authors: Paul Courneya, Jonathan Ly
‘Er-Tiny’ is a 1.33M-parameter Small Language Model trained on 34.8B tokens from a nine-source dataset. Its name, “Er,” is the reverse of “Re,” the prefix of Re:Zero – Starting Life in Another World, the light novel series that inspired the organization’s name.
| Source | Bytes (GB) | Share (%) | What it is |
|---|---|---|---|
| FineWeb-edu | 35.0 | 28.2% | Educational-filtered Common Crawl |
| DCLM-Edu | 20.0 | 16.1% | Educational-filtered webtext |
| The Pile Deduped | 20.0 | 16.1% | Broad, diverse 23-source dataset |
| FineWeb-HQ | 20.0 | 16.1% | Knowledge-filtered webtext |
| FineMath | 13.0 | 10.5% | Math-filtered Common Crawl |
| Cosmopedia-v2 | 7.0 | 5.6% | Synthetic textbooks |
| Wikipedia | 5.0 | 4.0% | Wikipedia articles |
| NpSetPython-Edu | 3.5 | 2.8% | Normalized Python code |
| Misc | 0.6 | 0.5% | LessWrong + HF configs + HF dataset/model cards |
Training settings: `max-autotune-no-cudagraphs`, `(0.9, 0.95)`, `bfloat16`

| Task | Value |
|---|---|
| BLiMP | 66.20% |
| ARC Easy | 30.13% |
| HellaSwag | 27.58% |
| PiQA | 52.39% |
| SciQ | 57.50% |
| SWAG | 32.50% |
| Winogrande | 49.80% |
| ArithMark-2.0 | 27.12% |
For a comparison with other small language models like this one, go here.
Before using, distributing, selling, or modifying this software, you must read the license here.
#!/usr/bin/env python3
# Model identifier passed to AutoModelForCausalLM.from_pretrained().
MODEL_DIR = "fromziro/Er-Tiny-1.3M"
# Tokenizer source handed to load_tokenizer(); defaults to the model location.
TOKENIZER_PATH = MODEL_DIR
# Default prompt used by generate() and by the __main__ demo.
PROMPT = "Artificial intelligence is"
# Default generation settings forwarded to model.generate() by generate().
MAX_NEW_TOKENS = 256
# The next three values are only forwarded when sampling is enabled.
TEMPERATURE = 0.7
TOP_P = 0.95
TOP_K = 30
REPETITION_PENALTY = 1.2
# When False, generate() omits temperature / top_p / top_k from the call.
DO_SAMPLE = True
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast
# Pick the compute backend in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device : {device}")
def load_tokenizer(path_or_repo: str):
    """Load a fast tokenizer and make sure it has BOS/EOS/UNK/PAD tokens.

    Args:
        path_or_repo: Either a path to an existing ``.json`` tokenizer file,
            or anything ``AutoTokenizer.from_pretrained`` accepts.

    Returns:
        The tokenizer, with missing special tokens filled in and
        ``padding_side`` set to ``"left"``.
    """
    candidate = Path(path_or_repo)
    # A standalone tokenizer file is loaded directly; everything else is
    # delegated to AutoTokenizer.
    if candidate.is_file() and candidate.suffix.lower() == ".json":
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(candidate.resolve()))
    else:
        tokenizer = AutoTokenizer.from_pretrained(path_or_repo, use_fast=True)

    # Register a placeholder for each special token the tokenizer lacks.
    fallback_specials = (
        ("bos_token", "<|bos|>"),
        ("eos_token", "<|eos|>"),
        ("unk_token", "<|unk|>"),
    )
    for attribute, placeholder in fallback_specials:
        if getattr(tokenizer, attribute) is None:
            tokenizer.add_special_tokens({attribute: placeholder})

    # Reuse EOS as the padding token when none is defined.
    if tokenizer.pad_token is None:
        eos = tokenizer.eos_token
        tokenizer.pad_token = "<|pad|>" if eos is None else eos

    tokenizer.padding_side = "left"
    return tokenizer
# --- Tokenizer -------------------------------------------------------------
print("Loading tokenizer...")
tokenizer = load_tokenizer(TOKENIZER_PATH)
print(
    f" Vocab size : {len(tokenizer)}",
    f" BOS : {tokenizer.bos_token!r}",
    f" EOS : {tokenizer.eos_token!r}",
    f" PAD : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})",
    sep="\n",
)

# --- Model -----------------------------------------------------------------
print(f"\nLoading model from {MODEL_DIR} ...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    # Half precision on CUDA only; full precision everywhere else.
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()
model.to(device)

# Turn the KV cache off on both the model config and the generation config.
model.config.use_cache = False
if getattr(model, "generation_config", None) is not None:
    model.generation_config.use_cache = False

total_params = sum(param.numel() for param in model.parameters())
print(f" Parameters : {total_params:,}")
def generate(
    prompt: str = PROMPT,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    repetition_penalty: float = REPETITION_PENALTY,
    do_sample: bool = DO_SAMPLE,
) -> str:
    """Generate a continuation of ``prompt`` with the module-level model.

    The tokenizer's BOS token (if any) is prepended as text, and the prompt
    is encoded without automatic special tokens.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; forwarded only if ``do_sample``.
        top_p: Nucleus sampling threshold; forwarded only if ``do_sample``.
        top_k: Top-k sampling cutoff; forwarded only if ``do_sample``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        do_sample: Whether to sample; when False the sampling knobs are omitted.

    Returns:
        The decoded newly generated tokens only (prompt excluded), with
        special tokens skipped.
    """
    bos_prefix = tokenizer.bos_token or ""
    encoded = tokenizer(
        bos_prefix + prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(device)
    # Drop token_type_ids, if the tokenizer emitted them, before generating.
    encoded.pop("token_type_ids", None)

    sampling_options = (
        {"temperature": temperature, "top_p": top_p, "top_k": top_k}
        if do_sample
        else {}
    )
    generation_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "repetition_penalty": repetition_penalty,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        "use_cache": False,
        **sampling_options,
    }

    with torch.inference_mode():
        sequences = model.generate(**encoded, **generation_options)

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_token_count = encoded["input_ids"].shape[-1]
    continuation = sequences[0][prompt_token_count:]
    return tokenizer.decode(continuation, skip_special_tokens=True)
if __name__ == "__main__":
    # Demo run: echo the default prompt, then print the model's continuation.
    print(f"\nPrompt : {PROMPT!r}", "-" * 60, sep="\n")
    output = generate(PROMPT)
    print("Generated:", output, sep="\n")
Copyright (c) 2026 FromZero
Copyright (c) 2026 Paul Courneya
Copyright (c) 2026 Jonathan Ly
@misc{er-tiny-1.3m,
title = {Er-Tiny-1.3M},
author = {FromZero},
year = {2026},
url = {https://huggingface.co/fromziro/Er-Tiny-1.3M}
}