Neon-tech committed on
Commit 1807b95 · verified · 1 Parent(s): 558cde6

Update app.py

Files changed (1)
  1. app.py +82 -68
app.py CHANGED
@@ -57,28 +57,31 @@ def claim_shard(state):
     return None, None
 
 # ── Tokenize chunk (subprocess) ───────────────────────────────────────────────
-def tokenize_chunk(args):
-    texts, tok_path = args
-    tokenizer = Tokenizer.from_file(tok_path)
+_worker_tokenizer = None
+
+def init_worker(tok_path):
+    global _worker_tokenizer
+    _worker_tokenizer = Tokenizer.from_file(tok_path)
+
+def tokenize_chunk(texts):
     results = []
     for text in texts:
         if not text or not text.strip():
             continue
-        enc = tokenizer.encode(text)
+        enc = _worker_tokenizer.encode(text)
         ids = enc.ids
         if len(ids) >= 2:
             results.append(ids)
     return results
 
 # ── Process shard using both cores ───────────────────────────────────────────
-def process_shard(name, raw_path):
+def process_shard(name, raw_path, pool):
     print(f" [{WORKER_ID}] Processing: {name}")
 
     try:
         df = pd.read_parquet(raw_path, columns=["text"])
     except Exception as e:
-        print(f" ✗ Read failed: {e}")
-        return False
+        return False, f"read_failed: {e}"
 
     total = len(df)
     print(f" [{WORKER_ID}] {total:,} rows — splitting across 2 cores")
@@ -89,34 +92,35 @@ def process_shard(name, raw_path):
     del df
     gc.collect()
 
-    with mp.Pool(processes=2) as pool:
-        results = pool.map(tokenize_chunk, [
-            (texts1, TOK_PATH),
-            (texts2, TOK_PATH),
-        ])
+    try:
+        results = pool.map(tokenize_chunk, [texts1, texts2])
+    except Exception as e:
+        return False, f"tokenize_failed: {e}"
 
     all_ids = results[0] + results[1]
     del results, texts1, texts2
     gc.collect()
 
     if not all_ids:
-        print(f" ✗ No tokens produced")
-        return False
+        return False, "no_tokens_produced"
 
     out_name = name.replace(".parquet", ".jsonl")
     out_path = Path(OUT_DIR) / out_name
     total_tokens = 0
 
-    with open(out_path, "w", encoding="utf-8") as f:
-        for ids in all_ids:
-            f.write(json.dumps({"input_ids": ids}) + "\n")
-            total_tokens += len(ids)
+    try:
+        with open(out_path, "w", encoding="utf-8") as f:
+            for ids in all_ids:
+                f.write(json.dumps({"input_ids": ids}) + "\n")
+                total_tokens += len(ids)
+    except Exception as e:
+        return False, f"write_failed: {e}"
 
     del all_ids
     gc.collect()
 
     print(f" ✓ [{WORKER_ID}] {out_name} | {total_tokens:,} tokens")
-    return True
+    return True, None
 
 # ── Worker loop ───────────────────────────────────────────────────────────────
 def worker_loop():
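process_shard now reports why it failed rather than just that it failed: every exit path returns a (success, error) pair, with the error string naming the failing stage (read, tokenize, write). A small sketch of the same convention, using a hypothetical export_rows helper:

def export_rows(rows, path):
    # Mirrors the (success, error) convention process_shard uses above:
    # every exit returns a pair the caller can branch on and persist.
    if not rows:
        return False, "no_rows"
    try:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(line + "\n" for line in rows)
    except OSError as e:
        return False, f"write_failed: {e}"
    return True, None

ok, err = export_rows(["a", "b"], "out.txt")  # illustrative file name
print("done" if ok else f"failed: {err}")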
@@ -126,58 +130,68 @@ def worker_loop():
     del tok
     gc.collect()
 
-    while True:
-        if not os.path.exists(STATE_FILE):
-            print(f" [{WORKER_ID}] Waiting for state.json...")
-            time.sleep(POLL_INTERVAL)
-            continue
+    pool = mp.Pool(processes=2, initializer=init_worker, initargs=(TOK_PATH,))
+    print(f"✓ [{WORKER_ID}] Worker pool ready")
 
-        try:
-            state = load_state()
-        except Exception as e:
-            print(f" [{WORKER_ID}] State read error: {e}")
-            time.sleep(POLL_INTERVAL)
-            continue
-
-        total = len(state["shards"]) + len(state.get("queue", []))
-        done = sum(1 for v in state["shards"].values() if v["status"] == "done")
-        if done == len(state["shards"]) and not state.get("queue") and total > 0:
-            print(f" [{WORKER_ID}] All done. Sleeping.")
-            time.sleep(300)
-            continue
-
-        name, raw_path = claim_shard(state)
-
-        if not name:
-            print(f" [{WORKER_ID}] Nothing ready — polling in {POLL_INTERVAL}s")
-            time.sleep(POLL_INTERVAL)
-            continue
-
-        print(f" [{WORKER_ID}] Claimed: {name}")
-        success = process_shard(name, raw_path)
-
-        try:
-            state = load_state()
-        except Exception:
-            pass
-
-        if success:
-            state["shards"][name]["status"] = "done"
-        else:
-            state["shards"][name]["status"] = "pending"
-            state["shards"][name]["worker"] = None
-            state["shards"][name]["claimed_at"] = None
-
-        save_state(state)
+    try:
+        while True:
+            if not os.path.exists(STATE_FILE):
+                print(f" [{WORKER_ID}] Waiting for state.json...")
+                time.sleep(POLL_INTERVAL)
+                continue
+
+            try:
+                state = load_state()
+            except Exception as e:
+                print(f" [{WORKER_ID}] State read error: {e}")
+                time.sleep(POLL_INTERVAL)
+                continue
+
+            total = len(state["shards"]) + len(state.get("queue", []))
+            done = sum(1 for v in state["shards"].values() if v["status"] == "done")
+            if done == len(state["shards"]) and not state.get("queue") and total > 0:
+                print(f" [{WORKER_ID}] All done. Sleeping.")
+                time.sleep(300)
+                continue
+
+            name, raw_path = claim_shard(state)
+
+            if not name:
+                print(f" [{WORKER_ID}] Nothing ready — polling in {POLL_INTERVAL}s")
+                time.sleep(POLL_INTERVAL)
+                continue
+
+            print(f" [{WORKER_ID}] Claimed: {name}")
+            success, error = process_shard(name, raw_path, pool)
+
+            try:
+                state = load_state()
+            except Exception:
+                pass
+
+            if success:
+                state["shards"][name]["status"] = "done"
+                state["shards"][name]["error"] = None
+                save_state(state)
+                try:
+                    raw_path.unlink()
+                    print(f" [{WORKER_ID}] Deleted: {raw_path.name}")
+                except Exception as e:
+                    print(f" [{WORKER_ID}] Delete failed: {e}")
+            else:
+                state["shards"][name]["status"] = "pending"
+                state["shards"][name]["worker"] = None
+                state["shards"][name]["claimed_at"] = None
+                state["shards"][name]["error"] = error
+                save_state(state)
+                print(f" [{WORKER_ID}] Shard failed ({error}), left on disk for retry: {name}")
 
-        try:
-            raw_path.unlink()
-            print(f" [{WORKER_ID}] Deleted: {raw_path.name}")
-        except Exception as e:
-            print(f" [{WORKER_ID}] Delete failed: {e}")
+            gc.collect()
+            time.sleep(5)
 
-        gc.collect()
-        time.sleep(5)
+    finally:
+        pool.terminate()
+        pool.join()
 
 # ── Entry point ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
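The other structural change in this hunk: the pool is created once, up front, in worker_loop (with the initializer from the first hunk) and torn down in a finally block, instead of a new pool being spawned inside process_shard for every shard. The lifecycle, reduced to a skeleton with the polling body elided:

import multiprocessing as mp

def worker_loop_skeleton():
    # Hypothetical reduction of worker_loop above: one long-lived pool,
    # shut down even if the loop raises or is interrupted.
    pool = mp.Pool(processes=2)
    try:
        for task in range(3):  # stands in for the `while True:` polling loop
            print(pool.map(abs, [-task, task]))
    finally:
        pool.terminate()  # stop worker processes immediately
        pool.join()       # then wait for them to exit

if __name__ == "__main__":
    worker_loop_skeleton()

terminate() rather than close() is consistent with an intentionally infinite loop: there is no natural drain point, so on exit the workers are killed rather than allowed to finish queued tasks.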
 