Tok4

Paused

App Files Files Community

Neon-tech committed 4 days ago

Commit

69cda21

verified ·

1 Parent(s): 541a74a

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -4

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gc
 import ctypes
 import multiprocessing as mp
 from pathlib import Path
 import pyarrow.parquet as pq
 from tokenizers import Tokenizer
@@ -72,8 +73,9 @@ def tokenize_chunk(texts):
 def process_shard(name, raw_path, pool):
     print(f"  [{WORKER_ID}] Processing: {name}")
-    out_name = name.replace(".parquet", ".jsonl")
     out_path = Path(OUT_DIR) / out_name
     total_tokens = 0
     try:
@@ -83,7 +85,7 @@ def process_shard(name, raw_path, pool):
         return False, f"read_failed: {e}"
     try:
-        with open(out_path, "w", encoding="utf-8") as f:
             for batch in pf.iter_batches(batch_size=5_000, columns=["text"]):
                 texts = batch.column("text").to_pylist()
                 mid   = len(texts) // 2
@@ -91,18 +93,22 @@ def process_shard(name, raw_path, pool):
                 try:
                     results = pool.map(tokenize_chunk, [texts[:mid], texts[mid:]])
                 except Exception as e:
                     return False, f"tokenize_failed: {e}"
                 for ids in results[0] + results[1]:
-                    f.write(json.dumps({"input_ids": ids}) + "\n")
                     total_tokens += len(ids)
                 del texts, results
                 gc.collect()
     except Exception as e:
         return False, f"write_failed: {e}"
     print(f"  ✓ [{WORKER_ID}] {out_name} | {total_tokens:,} tokens")
     return True, None
@@ -141,7 +147,7 @@ def worker_loop():
             total = len(state["shards"]) + len(state.get("queue", []))
             done  = sum(1 for v in state["shards"].values() if v["status"] == "done")
-            if done == len(state["shards"]) and not state.get("queue") and total > 0:
                 print(f"  [{WORKER_ID}] All done. Sleeping.")
                 time.sleep(300)
                 continue

 import ctypes
 import multiprocessing as mp
 from pathlib import Path
+import numpy as np
 import pyarrow.parquet as pq
 from tokenizers import Tokenizer
 def process_shard(name, raw_path, pool):
     print(f"  [{WORKER_ID}] Processing: {name}")
+    out_name = name.replace(".parquet", ".bin")
     out_path = Path(OUT_DIR) / out_name
+    tmp_path = Path(OUT_DIR) / f"{out_name}.tmp"
     total_tokens = 0
     try:
         return False, f"read_failed: {e}"
     try:
+        with open(tmp_path, "wb") as f:
             for batch in pf.iter_batches(batch_size=5_000, columns=["text"]):
                 texts = batch.column("text").to_pylist()
                 mid   = len(texts) // 2
                 try:
                     results = pool.map(tokenize_chunk, [texts[:mid], texts[mid:]])
                 except Exception as e:
+                    tmp_path.unlink(missing_ok=True)
                     return False, f"tokenize_failed: {e}"
                 for ids in results[0] + results[1]:
+                    arr = np.array(ids, dtype=np.uint16)
+                    arr.tofile(f)
                     total_tokens += len(ids)
                 del texts, results
                 gc.collect()
     except Exception as e:
+        tmp_path.unlink(missing_ok=True)
         return False, f"write_failed: {e}"
+    tmp_path.rename(out_path)  # ← atomic, only visible when complete
     print(f"  ✓ [{WORKER_ID}] {out_name} | {total_tokens:,} tokens")
     return True, None
             total = len(state["shards"]) + len(state.get("queue", []))
             done  = sum(1 for v in state["shards"].values() if v["status"] == "done")
+            if total > 0 and done == total:
                 print(f"  [{WORKER_ID}] All done. Sleeping.")
                 time.sleep(300)
                 continue