frankenstallm / source /scripts /prepare_3b_data.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
#!/usr/bin/env bash
# =============================================================================
# prepare_3b_data.sh — full training-data pipeline for the 3B model
#
# Usage:
#   bash scripts/prepare_3b_data.sh [--step N] [--jobs 72]
#
#   --step N starts the pipeline at step N (steps N..8 are run).
#
# Steps:
#   1 = CulturaX tokenization
#   2 = cc100 decompression + tokenization
#   3 = OSCAR tokenization
#   4 = korean_webtext tokenization
#   5 = HPLT Korean extraction + tokenization
#   6 = textbooks + finepdfs + kovast tokenization
#   7 = merge everything
#   8 = train/val split verification
# =============================================================================
set -euo pipefail

# Resolve the project root (the parent of this script's directory) and run
# from there, so every relative path below is anchored to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${PROJECT_ROOT}"

# ─── Settings ────────────────────────────────────────────────────────────
DATA_DIR="data"
EXTRA_DIR="data/korean_extra"
TOKENIZER="tokenizer/tokenizer.json"
VAL_SPLIT=0.002
SEED=42
JOBS=72       # set by --jobs; NOTE(review): not referenced in the visible part of this script
FROM_STEP=0   # set by --step; steps numbered below this value are not run
LOG_FILE="data/prepare_3b.log"

#######################################
# Parse command-line options into FROM_STEP and JOBS.
# Globals:   FROM_STEP, JOBS (written)
# Arguments: the script's arguments: [--step N] [--jobs N]
# Outputs:   a diagnostic on stderr for a bad option or value
# Returns:   0 on success; exits the script with status 1 on a usage error
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --step | --jobs)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "Invalid value for $1: $2 (expected a non-negative integer)" >&2
          exit 1
        fi
        # Force base 10 so that values such as "08" are not rejected as
        # invalid octal by the arithmetic comparisons later in the script.
        if [[ "$1" == "--step" ]]; then
          FROM_STEP=$((10#$2))
        else
          JOBS=$((10#$2))
        fi
        shift 2
        ;;
      *)
        echo "Unknown arg: $1" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"

# From here on, everything written to stdout/stderr is also appended to the log.
mkdir -p "$(dirname "$LOG_FILE")"
exec > >(tee -a "$LOG_FILE") 2>&1
# Print the arguments (joined by spaces) to stdout, prefixed with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp taken from date(1).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# ─── Tokenization helper (parquet → bin) ─────────────────────────────────
#######################################
# Tokenize one text column of a set of parquet files into flat uint16 token
# files: ${DATA_DIR}/<name>_train.bin and ${DATA_DIR}/<name>_val.bin.
#
# Documents of 50 characters or fewer are dropped. The train/val split is made
# on whole documents (shuffled with SEED), so token order inside every
# document is preserved; about VAL_SPLIT of the tokens go to the val file.
#
# Globals:   DATA_DIR, FROM_STEP, TOKENIZER, VAL_SPLIT, SEED (read)
# Arguments: $1 - dataset name (output file prefix and progress-bar label)
#            $2 - glob pattern matching the input parquet files
#            $3 - name of the text column
# Outputs:   log lines on stdout; per-file read errors on stderr
# Returns:   0 when tokenized or skipped; a failing Python step aborts the
#            script through `set -e`
#######################################
tokenize_parquet() {
  local name="$1"
  local input_pattern="$2"
  local text_col="$3"
  local output="${DATA_DIR}/${name}_train.bin"

  # An existing output is reused only on a default run (FROM_STEP is 0).
  if [[ -f "$output" && $FROM_STEP -le 0 ]]; then
    log "[SKIP] $output already exists ($(du -h "$output" | cut -f1))"
    return
  fi

  # Without this guard an unmatched pattern would produce empty .bin files,
  # which a later default run would then skip as "already exists".
  if ! compgen -G "$input_pattern" >/dev/null; then
    log "[WARN] No parquet files match $input_pattern; skipping $name"
    return 0
  fi

  log "[START] Tokenizing $name from parquet..."
  # Values are passed through the environment and the heredoc is quoted, so
  # quotes or backslashes in a value cannot corrupt the Python source.
  TOKENIZE_NAME="$name" \
  TOKENIZE_TOKENIZER="$TOKENIZER" \
  TOKENIZE_INPUT_PATTERN="$input_pattern" \
  TOKENIZE_TEXT_COL="$text_col" \
  TOKENIZE_OUTPUT="$output" \
  TOKENIZE_VAL_SPLIT="$VAL_SPLIT" \
  TOKENIZE_SEED="$SEED" \
    python3 - <<'PYEOF'
import glob
import os
import random
import sys

import numpy as np
import pyarrow.parquet as pq
from tokenizers import Tokenizer
from tqdm import tqdm

name = os.environ["TOKENIZE_NAME"]
tokenizer_path = os.environ["TOKENIZE_TOKENIZER"]
input_pattern = os.environ["TOKENIZE_INPUT_PATTERN"]
text_col = os.environ["TOKENIZE_TEXT_COL"]
output_train = os.environ["TOKENIZE_OUTPUT"]
# The train path always ends in "_train.bin"; swap only that suffix.
output_val = output_train[: -len("_train.bin")] + "_val.bin"
val_split = float(os.environ["TOKENIZE_VAL_SPLIT"])
seed = int(os.environ["TOKENIZE_SEED"])

files = sorted(glob.glob(input_pattern))
print(f"Found {len(files)} parquet files")

tokenizer = Tokenizer.from_file(tokenizer_path)
# Token ids are stored as uint16; refuse to write files that would wrap.
if tokenizer.get_vocab_size() > 2 ** 16:
    sys.exit("Tokenizer vocabulary does not fit into uint16 token files")

docs = []          # one uint16 array per kept document
total_tokens = 0
for f in tqdm(files, desc=name):
    try:
        table = pq.read_table(f, columns=[text_col])
        for text in table.column(text_col):
            t = text.as_py()
            if t and len(t) > 50:
                ids = tokenizer.encode(t).ids
                docs.append(np.asarray(ids, dtype=np.uint16))
                total_tokens += len(ids)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"Error processing {f}: {e}", file=sys.stderr)
        continue
print(f"Total: {len(docs):,} docs, {total_tokens:,} tokens")

# Split on document boundaries. Shuffling the flat token list would scramble
# every document into noise.
random.Random(seed).shuffle(docs)
n_val_target = int(total_tokens * val_split)
n_val_docs = 0
n_val_tokens = 0
# Always leave at least one document for training.
while n_val_docs < len(docs) - 1 and n_val_tokens < n_val_target:
    n_val_tokens += len(docs[n_val_docs])
    n_val_docs += 1


def write_tokens(path, arrays):
    """Write the arrays back to back as raw uint16 values."""
    with open(path, "wb") as fh:
        for arr in arrays:
            arr.tofile(fh)


write_tokens(output_train, docs[n_val_docs:])
write_tokens(output_val, docs[:n_val_docs])
print(f"Saved: {output_train} ({total_tokens - n_val_tokens:,} tokens)")
print(f"Saved: {output_val} ({n_val_tokens:,} tokens)")
PYEOF
  log "[DONE] $name β†’ $output"
}
# ─── Step 1: CulturaX ────────────────────────────────────────────────────
# Step 1: tokenize the "text" column of the CulturaX Korean parquet shards.
if (( FROM_STEP <= 1 )); then
  log "=== Step 1: CulturaX 토큰화 ==="
  tokenize_parquet "culturax" "${EXTRA_DIR}/culturax_ko/ko/*.parquet" "text"
fi
# ─── Step 2: cc100 decompression + tokenization ──────────────────────────
# Step 2: decompress the cc100 Korean dump (if not already done) and tokenize
# it into ${DATA_DIR}/cc100_train.bin and ${DATA_DIR}/cc100_val.bin.
if [[ $FROM_STEP -le 2 ]]; then
  log "=== Step 2: cc100 ν•΄μ œ + 토큰화 ==="
  CC100_XZ="${EXTRA_DIR}/cc100_ko/ko.txt.xz"
  CC100_TXT="${EXTRA_DIR}/cc100_ko/ko.txt"
  CC100_OUT="${DATA_DIR}/cc100_train.bin"
  if [[ -f "$CC100_OUT" && $FROM_STEP -le 0 ]]; then
    log "[SKIP] cc100 already tokenized"
  else
    # Decompress (-k keeps the .xz archive next to the extracted text).
    if [[ ! -f "$CC100_TXT" ]]; then
      log "Decompressing cc100 xz (14GB β†’ 54GB)..."
      xz -dk -- "$CC100_XZ"
      log "Decompression done"
    fi
    # Tokenize. The corpus is large, so tokens are streamed to disk instead
    # of being accumulated in memory. Paths and the split ratio come from the
    # shell variables above rather than being repeated inside the Python code.
    log "Tokenizing cc100 (54GB text)..."
    CC100_TOKENIZER="$TOKENIZER" \
    CC100_INPUT="$CC100_TXT" \
    CC100_OUTPUT="$CC100_OUT" \
    CC100_VAL_SPLIT="$VAL_SPLIT" \
      python3 - <<'PYEOF'
import os

import numpy as np
from tokenizers import Tokenizer
from tqdm import tqdm

tokenizer = Tokenizer.from_file(os.environ["CC100_TOKENIZER"])
input_file = os.environ["CC100_INPUT"]
output_train = os.environ["CC100_OUTPUT"]
# The train path always ends in "_train.bin"; swap only that suffix.
output_val = output_train[: -len("_train.bin")] + "_val.bin"
val_split = float(os.environ["CC100_VAL_SPLIT"])
# All tokens are first spooled here; the split is cut from it afterwards.
# This needs temporary disk space for one extra copy of the token stream.
spool_path = output_train + ".all.tmp"

FLUSH_TOKENS = 1 << 24   # write the buffer out about every 16M tokens
COPY_BYTES = 1 << 26     # chunk size used when cutting the spool file


def iter_documents(path):
    """Yield documents: runs of non-empty lines separated by blank lines."""
    lines = []
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        for line in tqdm(f, desc="cc100", unit=" lines"):
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                yield '\n'.join(lines)
                lines = []
    if lines:
        # The last document has no trailing blank line.
        yield '\n'.join(lines)


pending = []
total_tokens = 0
doc_count = 0
with open(spool_path, 'wb') as spool:
    for text in iter_documents(input_file):
        if len(text) <= 50:
            continue
        pending.extend(tokenizer.encode(text).ids)
        doc_count += 1
        if len(pending) >= FLUSH_TOKENS:
            np.asarray(pending, dtype=np.uint16).tofile(spool)
            total_tokens += len(pending)
            pending.clear()
    if pending:
        np.asarray(pending, dtype=np.uint16).tofile(spool)
        total_tokens += len(pending)
        pending.clear()
print(f"Total: {doc_count:,} docs, {total_tokens:,} tokens")


def copy_tokens(src, dst_path, n_tokens):
    """Copy the next n_tokens uint16 values from src into a new file."""
    remaining = n_tokens * 2
    with open(dst_path, 'wb') as dst:
        while remaining > 0:
            chunk = src.read(min(remaining, COPY_BYTES))
            if not chunk:
                break
            dst.write(chunk)
            remaining -= len(chunk)


# Split: the first n_val tokens become the val set, the rest the train set.
n_val = int(total_tokens * val_split)
with open(spool_path, 'rb') as src:
    copy_tokens(src, output_val, n_val)
    copy_tokens(src, output_train, total_tokens - n_val)
os.remove(spool_path)
print(f"Saved train: {total_tokens - n_val:,} tokens")
print(f"Saved val: {n_val:,} tokens")
PYEOF
    log "[DONE] cc100"
  fi
fi
# ─── Step 3: OSCAR ───────────────────────────────────────────────────────
# Step 3: tokenize the OSCAR Korean parquet shards into data/oscar_train.bin
# and data/oscar_val.bin. An existing output is reused only on a default run.
if (( FROM_STEP <= 3 )); then
  log "=== Step 3: OSCAR 토큰화 ==="
  OSCAR_OUT="${DATA_DIR}/oscar_train.bin"
  if [[ ! -f "$OSCAR_OUT" || $FROM_STEP -gt 0 ]]; then
    python3 - <<'PYEOF'
import glob

import numpy as np
import pyarrow.parquet as pq
from tokenizers import Tokenizer
from tqdm import tqdm

tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
shards = sorted(glob.glob("data/korean_extra/oscar_ko/data/kor_Hang/*.parquet"))

token_ids = []
n_docs = 0
for shard in tqdm(shards, desc="OSCAR"):
    column = pq.read_table(shard, columns=['text']).column('text')
    for cell in column:
        if cell is None:
            continue
        # Each cell is read as a list of items that carry a 'text' field;
        # the non-empty fields are joined into one document.
        segments = cell.as_py()
        if not segments:
            continue
        pieces = [seg['text'] for seg in segments if seg and seg.get('text')]
        document = '\n'.join(pieces)
        if len(document) <= 50:
            continue
        token_ids.extend(tokenizer.encode(document).ids)
        n_docs += 1

print(f"OSCAR: {n_docs:,} docs, {len(token_ids):,} tokens")

# The first 0.2% of the tokens form the val set, the remainder the train set.
val_count = int(len(token_ids) * 0.002)
val_part, train_part = token_ids[:val_count], token_ids[val_count:]
np.array(train_part, dtype=np.uint16).tofile("data/oscar_train.bin")
np.array(val_part, dtype=np.uint16).tofile("data/oscar_val.bin")
PYEOF
    log "[DONE] OSCAR"
  else
    log "[SKIP] OSCAR already tokenized"
  fi
fi
# ─── Step 4: korean_webtext ──────────────────────────────────────────────
# Step 4: tokenize the "text" column of the korean_webtext parquet shards.
if (( FROM_STEP <= 4 )); then
  log "=== Step 4: korean_webtext 토큰화 ==="
  tokenize_parquet "webtext" "${EXTRA_DIR}/korean_webtext/data/*.parquet" "text"
fi
# ─── Step 5: HPLT Korean extraction + tokenization ───────────────────────
# Step 5: pull the sentences out of the 'tgt_doc' column of the HPLT en-ko
# shards and tokenize them into data/hplt_ko_train.bin / data/hplt_ko_val.bin.
# An existing output is reused only on a default run.
if (( FROM_STEP <= 5 )); then
  log "=== Step 5: HPLT ν•œκ΅­μ–΄ μΆ”μΆœ + 토큰화 ==="
  HPLT_OUT="${DATA_DIR}/hplt_ko_train.bin"
  if [[ ! -f "$HPLT_OUT" || $FROM_STEP -gt 0 ]]; then
    python3 - <<'PYEOF'
import glob

import numpy as np
import pyarrow.parquet as pq
from tokenizers import Tokenizer
from tqdm import tqdm

tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
shards = sorted(glob.glob("data/korean_extra/hplt_ko/en-ko/*.parquet"))

token_ids = []
n_docs = 0
for shard in tqdm(shards, desc="HPLT"):
    for cell in pq.read_table(shard, columns=['tgt_doc']).column('tgt_doc'):
        record = cell.as_py()
        if not record or not record.get('sentences'):
            continue
        # One document = the record's non-empty sentences, one per line.
        document = '\n'.join(filter(None, record['sentences']))
        if len(document) <= 50:
            continue
        token_ids.extend(tokenizer.encode(document).ids)
        n_docs += 1

print(f"HPLT Korean: {n_docs:,} docs, {len(token_ids):,} tokens")

# The first 0.2% of the tokens form the val set, the remainder the train set.
val_count = int(len(token_ids) * 0.002)
val_part, train_part = token_ids[:val_count], token_ids[val_count:]
np.array(train_part, dtype=np.uint16).tofile("data/hplt_ko_train.bin")
np.array(val_part, dtype=np.uint16).tofile("data/hplt_ko_val.bin")
PYEOF
    log "[DONE] HPLT"
  else
    log "[SKIP] HPLT already tokenized"
  fi
fi
# ─── Step 6: textbooks + finepdfs + kovast ───────────────────────────────
# Step 6: tokenize the smaller sources (korean_textbooks, finepdfs_edu_ko)
# into data/extra_misc_train.bin and data/extra_misc_val.bin.
# NOTE(review): the step title also mentions kovast, but no kovast input is
# read by this step — confirm whether that source is still expected here.
if [[ $FROM_STEP -le 6 ]]; then
  log "=== Step 6: 기타 μ†ŒμŠ€ 토큰화 ==="
  EXTRA_OUT="${DATA_DIR}/extra_misc_train.bin"
  if [[ -f "$EXTRA_OUT" && $FROM_STEP -le 0 ]]; then
    log "[SKIP] extra_misc already tokenized"
  else
    python3 - <<'PYEOF'
import glob
import sys

import numpy as np
import pyarrow.parquet as pq
from tokenizers import Tokenizer
from tqdm import tqdm

tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
all_tokens = []


def tokenize_source(files, desc, candidate_cols, min_chars):
    """Append the tokens of one source to all_tokens; return the doc count.

    For every file, the first column from candidate_cols that exists is used;
    values longer than min_chars characters are tokenized.
    """
    docs = 0
    for f in tqdm(files, desc=desc):
        try:
            table = pq.read_table(f)
            for col in candidate_cols:
                if col in table.column_names:
                    for val in table.column(col):
                        t = val.as_py()
                        if t and len(t) > min_chars:
                            all_tokens.extend(tokenizer.encode(t).ids)
                            docs += 1
                    break
        except Exception as e:
            # Best effort: report the bad file instead of hiding it, then go on.
            print(f"Error processing {f}: {e}", file=sys.stderr)
            continue
    return docs


# Globs are sorted so that the token order, and therefore the train/val
# split, is reproducible between runs.
doc_count = 0
# korean_textbooks: try the common text columns.
doc_count += tokenize_source(
    sorted(glob.glob("data/korean_extra/korean_textbooks/**/*.parquet", recursive=True)),
    "textbooks",
    ['question', 'text', 'input', 'instruction'],
    20,
)
# finepdfs
doc_count += tokenize_source(
    sorted(glob.glob("data/korean_extra/finepdfs_edu_ko/*.parquet")),
    "finepdfs",
    ['text', 'content'],
    50,
)

print(f"Extra: {doc_count:,} docs, {len(all_tokens):,} tokens")
# The first 0.2% of the tokens form the val set, the remainder the train set.
n_val = int(len(all_tokens) * 0.002)
np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/extra_misc_train.bin")
np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/extra_misc_val.bin")
PYEOF
    log "[DONE] extra_misc"
  fi
fi
# ─── Step 7: 전체 병합 ──────────────────────────────────────────────────
# Step 7: merge the per-source token files into merged_3b_train.bin and
# merged_3b_val.bin using data/merge_bins.py (inputs first, output path last).
# Sources whose file is missing are left out; for the train split each
# included or missing file is logged.
if [[ $FROM_STEP -le 7 ]]; then
  log "=== Step 7: 전체 병합 ==="
  MERGE_SOURCES=(korean culturax cc100 oscar webtext hplt_ko extra_misc)
  for split in train val; do
    # An array keeps every path a single argument, whatever characters it has.
    SPLIT_BINS=()
    for src in "${MERGE_SOURCES[@]}"; do
      f="${DATA_DIR}/${src}_${split}.bin"
      if [[ -f "$f" ]]; then
        SPLIT_BINS+=("$f")
        if [[ "$split" == "train" ]]; then
          log " Including: $f ($(du -h "$f" | cut -f1))"
        fi
      elif [[ "$split" == "train" ]]; then
        log " [WARN] Missing: $f"
      fi
    done
    # Skip the merge entirely when no input of this split exists.
    if (( ${#SPLIT_BINS[@]} > 0 )); then
      python3 data/merge_bins.py "${SPLIT_BINS[@]}" "${DATA_DIR}/merged_3b_${split}.bin"
      log "[DONE] merged_3b_${split}.bin created"
    fi
  done
fi
# ─── Step 8: 검증 ────────────────────────────────────────────────────────
# Step 8: report the token count of every token file under data/ (file size
# divided by 2, i.e. one uint16 per token) and the number of epochs needed to
# reach 60B / 100B training tokens.
if [[ $FROM_STEP -le 8 ]]; then
  log "=== Step 8: μ΅œμ’… 검증 ==="
  python3 - <<'PYEOF'
import glob
import os


def token_files(split):
    """Return data/*_<split>.bin plus data/<split>.bin, sorted."""
    return sorted(glob.glob(f"data/*_{split}.bin") + glob.glob(f"data/{split}.bin"))


def is_merged(path):
    """True for the merged_* outputs of step 7, which repeat the sources."""
    return os.path.basename(path).startswith("merged_")


print("=== 토큰화 κ²°κ³Ό ===")
total_train = 0
for f in token_files("train"):
    size = os.path.getsize(f)
    n = size // 2
    note = ""
    if is_merged(f):
        # Counting the merged file as well would double the total.
        note = " [merged; not counted in total]"
    else:
        total_train += n
    print(f" {os.path.basename(f):30s}: {n:>15,} tokens ({size/1e9:.2f} GB){note}")

total_val = sum(
    os.path.getsize(f) // 2 for f in token_files("val") if not is_merged(f)
)

print(f"\n Total train: {total_train:,} tokens ({total_train/1e9:.1f}B)")
print(f" Total val: {total_val:,} tokens ({total_val/1e6:.1f}M)")
print(f"\n 3B Chinchilla minimum: 60B tokens")
if total_train > 0:
    print(f" Epochs needed for 60B: {60e9/total_train:.1f}")
    print(f" Epochs needed for 100B: {100e9/total_train:.1f}")
else:
    # Nothing tokenized yet: avoid dividing by zero.
    print(" No training tokens found; cannot compute epochs needed")
PYEOF
fi
log "=== νŒŒμ΄ν”„λΌμΈ μ™„λ£Œ ==="