Neon-tech committed on
Commit 541a74a · verified · 1 Parent(s): 1807b95

Update app.py

Files changed (1)
  1. app.py +37 -45
app.py CHANGED
@@ -4,9 +4,10 @@ import time
  import socket
  import threading
  import gc
  import multiprocessing as mp
  from pathlib import Path
- import pandas as pd
  from tokenizers import Tokenizer

  # ── Config ───────────────────────────────────────────────────────────────────
@@ -56,7 +57,7 @@ def claim_shard(state):
  return name, raw_path
  return None, None

- # ── Tokenize chunk (subprocess) ───────────────────────────────────────────────
  _worker_tokenizer = None

  def init_worker(tok_path):
@@ -64,71 +65,62 @@ def init_worker(tok_path):
  _worker_tokenizer = Tokenizer.from_file(tok_path)

  def tokenize_chunk(texts):
- results = []
- for text in texts:
- if not text or not text.strip():
- continue
- enc = _worker_tokenizer.encode(text)
- ids = enc.ids
- if len(ids) >= 2:
- results.append(ids)
- return results
-
- # ── Process shard using both cores ───────────────────────────────────────────
  def process_shard(name, raw_path, pool):
  print(f" [{WORKER_ID}] Processing: {name}")

  try:
- df = pd.read_parquet(raw_path, columns=["text"])
  except Exception as e:
  return False, f"read_failed: {e}"

- total = len(df)
- print(f" [{WORKER_ID}] {total:,} rows — splitting across 2 cores")
-
- mid = total // 2
- texts1 = df.iloc[:mid]["text"].tolist()
- texts2 = df.iloc[mid:]["text"].tolist()
- del df
- gc.collect()
-
  try:
- results = pool.map(tokenize_chunk, [texts1, texts2])
- except Exception as e:
- return False, f"tokenize_failed: {e}"

- all_ids = results[0] + results[1]
- del results, texts1, texts2
- gc.collect()

- if not all_ids:
- return False, "no_tokens_produced"

- out_name = name.replace(".parquet", ".jsonl")
- out_path = Path(OUT_DIR) / out_name
- total_tokens = 0

- try:
- with open(out_path, "w", encoding="utf-8") as f:
- for ids in all_ids:
- f.write(json.dumps({"input_ids": ids}) + "\n")
- total_tokens += len(ids)
  except Exception as e:
  return False, f"write_failed: {e}"

- del all_ids
- gc.collect()
-
  print(f" ✓ [{WORKER_ID}] {out_name} | {total_tokens:,} tokens")
  return True, None

  # ── Worker loop ───────────────────────────────────────────────────────────────
  def worker_loop():
  print(f"✓ [{WORKER_ID}] Loading tokenizer...")
  tok = Tokenizer.from_file(TOK_PATH)
  print(f"✓ [{WORKER_ID}] Tokenizer ready | vocab: {tok.get_vocab_size():,}")
  del tok
- gc.collect()

  pool = mp.Pool(processes=2, initializer=init_worker, initargs=(TOK_PATH,))
  print(f"✓ [{WORKER_ID}] Worker pool ready")
@@ -184,9 +176,9 @@ def worker_loop():
  state["shards"][name]["claimed_at"] = None
  state["shards"][name]["error"] = error
  save_state(state)
- print(f" [{WORKER_ID}] Shard failed ({error}), left on disk for retry: {name}")

- gc.collect()
  time.sleep(5)

  finally:
 
  import socket
  import threading
  import gc
+ import ctypes
  import multiprocessing as mp
  from pathlib import Path
+ import pyarrow.parquet as pq
  from tokenizers import Tokenizer

  # ── Config ───────────────────────────────────────────────────────────────────
 
  return name, raw_path
  return None, None

+ # ── Tokenizer subprocess ──────────────────────────────────────────────────────
  _worker_tokenizer = None

  def init_worker(tok_path):
 
  _worker_tokenizer = Tokenizer.from_file(tok_path)

  def tokenize_chunk(texts):
+ encs = _worker_tokenizer.encode_batch(texts)
+ return [e.ids for e in encs if len(e.ids) >= 2]
+
+ # ── Process shard ─────────────────────────────────────────────────────────────
  def process_shard(name, raw_path, pool):
  print(f" [{WORKER_ID}] Processing: {name}")

+ out_name = name.replace(".parquet", ".jsonl")
+ out_path = Path(OUT_DIR) / out_name
+ total_tokens = 0
+
  try:
+ pf = pq.ParquetFile(raw_path)
  except Exception as e:
+ raw_path.unlink(missing_ok=True)
  return False, f"read_failed: {e}"

  try:
+ with open(out_path, "w", encoding="utf-8") as f:
+ for batch in pf.iter_batches(batch_size=5_000, columns=["text"]):
+ texts = batch.column("text").to_pylist()
+ mid = len(texts) // 2

+ try:
+ results = pool.map(tokenize_chunk, [texts[:mid], texts[mid:]])
+ except Exception as e:
+ return False, f"tokenize_failed: {e}"

+ for ids in results[0] + results[1]:
+ f.write(json.dumps({"input_ids": ids}) + "\n")
+ total_tokens += len(ids)

+ del texts, results
+ gc.collect()

  except Exception as e:
  return False, f"write_failed: {e}"

  print(f" ✓ [{WORKER_ID}] {out_name} | {total_tokens:,} tokens")
  return True, None

+ # ── Force full memory flush ───────────────────────────────────────────────────
+ def flush_memory():
+ gc.collect()
+ try:
+ ctypes.CDLL("libc.so.6").malloc_trim(0)
+ except Exception:
+ pass
+
  # ── Worker loop ───────────────────────────────────────────────────────────────
  def worker_loop():
  print(f"✓ [{WORKER_ID}] Loading tokenizer...")
  tok = Tokenizer.from_file(TOK_PATH)
  print(f"✓ [{WORKER_ID}] Tokenizer ready | vocab: {tok.get_vocab_size():,}")
  del tok
+ flush_memory()

  pool = mp.Pool(processes=2, initializer=init_worker, initargs=(TOK_PATH,))
  print(f"✓ [{WORKER_ID}] Worker pool ready")
 
  state["shards"][name]["claimed_at"] = None
  state["shards"][name]["error"] = error
  save_state(state)
+ print(f" [{WORKER_ID}] Failed ({error}) — left on disk for retry: {name}")

+ flush_memory()
  time.sleep(5)

  finally:
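
For reference, a minimal standalone sketch of the streaming pipeline the new process_shard implements: read the parquet shard in record batches with pyarrow, tokenize each batch across a two-process pool with encode_batch, and append JSONL lines as it goes, so no full table is ever materialized. The helper name shard_to_jsonl, the tokenizer.json path, and the file names in the usage guard are illustrative, not part of app.py.

import json
import multiprocessing as mp
import pyarrow.parquet as pq
from tokenizers import Tokenizer

TOK_PATH = "tokenizer.json"  # assumed tokenizer location (illustrative)

_tok = None

def _init(tok_path):
    # Load the tokenizer once per worker process.
    global _tok
    _tok = Tokenizer.from_file(tok_path)

def _tokenize(texts):
    # Batch-encode and keep only sequences with at least two tokens.
    encs = _tok.encode_batch(texts)
    return [e.ids for e in encs if len(e.ids) >= 2]

def shard_to_jsonl(parquet_path, out_path, batch_size=5_000):
    pf = pq.ParquetFile(parquet_path)
    total_tokens = 0
    with mp.Pool(2, initializer=_init, initargs=(TOK_PATH,)) as pool, \
         open(out_path, "w", encoding="utf-8") as f:
        for batch in pf.iter_batches(batch_size=batch_size, columns=["text"]):
            # Drop null/empty rows; encode_batch expects strings.
            texts = [t for t in batch.column("text").to_pylist() if t]
            mid = len(texts) // 2
            for chunk in pool.map(_tokenize, [texts[:mid], texts[mid:]]):
                for ids in chunk:
                    f.write(json.dumps({"input_ids": ids}) + "\n")
                    total_tokens += len(ids)
    return total_tokens

if __name__ == "__main__":
    n = shard_to_jsonl("shard_000.parquet", "shard_000.jsonl")
    print(f"{n:,} tokens written")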
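
A note on why the commit swaps bare gc.collect() calls for flush_memory(): on glibc-based Linux, collecting garbage frees Python objects, but the C allocator often keeps the freed pages in its arenas, so the process RSS barely drops; malloc_trim(0) asks glibc to return that free memory to the OS. The sketch below illustrates the same idea; the rss_mb helper and the allocation size are illustrative, and the trim call is a best-effort no-op on non-glibc systems.

import ctypes
import gc

def flush_memory():
    gc.collect()  # free unreachable Python objects first
    try:
        # glibc only: release free heap pages back to the OS.
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except Exception:
        pass  # best effort: skip where libc.so.6 / malloc_trim is unavailable

def rss_mb():
    # Current resident set size from /proc (Linux), in MiB.
    with open("/proc/self/status") as f:
        for line in f:
            if line.startswith("VmRSS:"):
                return int(line.split()[1]) / 1024
    return 0.0

if __name__ == "__main__":
    blob = [b"x" * 1024 for _ in range(200_000)]  # roughly 200 MiB of small objects
    print(f"allocated: {rss_mb():.0f} MiB")
    del blob
    flush_memory()
    print(f"after flush_memory: {rss_mb():.0f} MiB")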