Neon-tech committed
Commit 675be9e (verified)
1 Parent(s): 1340c9c

Update app.py

Files changed (1)
  1. app.py +74 -124
app.py CHANGED
@@ -3,10 +3,8 @@ import json
  import time
  import socket
  import threading
- import re
  import requests
  import pyarrow.parquet as pq
- import pyarrow as pa
  import gc
  from pathlib import Path
  from huggingface_hub import HfApi
@@ -16,25 +14,20 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
  RAW_DIR = "/data/raw"
  STATE_FILE = "/data/state.json"
  WORKER_TIMEOUT = 600
- MAX_BUFFERED = 50
- ROWS_PER_CHUNK = 50_000
+ MAX_BUFFERED = 999999
 
  os.makedirs(RAW_DIR, exist_ok=True)
- api = HfApi(token=HF_TOKEN)
-
+ api = HfApi(token=HF_TOKEN)
  AUTH_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
 
  # ── Sources ───────────────────────────────────────────────────────────────────
- # Each source: (name, type, urls_or_config)
- # Types: parquet_list, hf_list
- # For hf_list: uses HF API to discover files
  SOURCES = [
      {
          "name" : "fineweb",
          "type" : "hf_list",
          "repo" : "HuggingFaceFW/fineweb-edu",
          "prefix" : "data/CC-MAIN-2025-26",
-         "skip" : 0,  # already in state from prev run — coordinator skips done ones
+         "skip" : 5,
          "take" : 10,
          "text_col": "text",
      },
@@ -43,7 +36,7 @@ SOURCES = [
          "type" : "hf_list",
          "repo" : "wikimedia/wikipedia",
          "prefix" : "20231101.en/train-",
-         "skip" : 2,  # first 2 already done in 50M
+         "skip" : 2,
          "take" : 18,
          "text_col": "text",
      },
@@ -56,31 +49,16 @@ SOURCES = [
          "take" : 6,
          "text_col": "text",
      },
-     {
-         "name" : "phi",
-         "type" : "url_list",
-         "urls" : [
-             "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00000-of-00004-ea05c5cb63b570a8.parquet?download=true",
-             "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00001-of-00004-d99cbe052bab0d4e.parquet?download=true",
-             "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00002-of-00004-2c25f0e11d537eaf.parquet?download=true",
-             "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00003-of-00004-faa8dbb07e5f02e8.parquet?download=true",
-         ],
-         "text_col": "markdown",
-     },
      {
          "name" : "code",
          "type" : "url_list",
-         "urls" : [
-             # 12 new languages × 2 shards = 24 files
-             # Base: https://huggingface.co/datasets/Neon-tech/Dataset-arranger/resolve/main/by-language
-             *[
-                 f"https://huggingface.co/datasets/Neon-tech/Dataset-arranger/resolve/main/by-language/{lang}/shard_{str(i).zfill(6)}.jsonl?download=true"
-                 for lang in ["C", "C++", "Java", "Go", "Rust", "Ruby", "PHP", "SQL", "C#", "Scala", "Lua", "Perl"]
-                 for i in range(2)
-             ],
-         ],
          "text_col": "text",
          "fmt" : "jsonl",
+         "urls" : [
+             f"https://huggingface.co/datasets/Neon-tech/Dataset-arranger/resolve/main/by-language/{lang}/shard_{str(i).zfill(6)}.jsonl?download=true"
+             for lang in ["C", "C++", "Java", "Go", "Rust", "Ruby", "PHP", "SQL", "C#", "Scala", "Lua", "Perl"]
+             for i in range(2)
+         ],
      },
  ]
 
@@ -96,12 +74,6 @@ def serve():
          conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
          conn.close()
 
- # ── Friendly name ─────────────────────────────────────────────────────────────
- def friendly_name(source_name, url_or_path):
-     # Strip query string
-     base = url_or_path.split("?")[0].split("/")[-1]
-     return f"{source_name}__{base}"
-
  # ── State ─────────────────────────────────────────────────────────────────────
  def load_state():
      if os.path.exists(STATE_FILE):
@@ -124,46 +96,50 @@ def save_state(state):
          json.dump(state, f, indent=2)
      os.replace(tmp, STATE_FILE)
 
- # ── Discover all sources ──────────────────────────────────────────────────────
+ # ── Discover ──────────────────────────────────────────────────────────────────
  def discover_all(state):
-     known = {v["url"] for v in state["shards"].values()} | {e["url"] for e in state.get("queue", [])}
-     new_count = 0
+     known_urls = {v["url"] for v in state["shards"].values()} | {e["url"] for e in state.get("queue", [])}
+     new_count = 0
 
      for src in SOURCES:
          name = src["name"]
         print(f"\nDiscovering: {name}")
 
          if src["type"] == "hf_list":
-             files = list(api.list_repo_files(src["repo"], repo_type="dataset"))
-             files = [f for f in files if f.startswith(src["prefix"]) and f.endswith(".parquet")]
-             files = sorted(files)
-             files = files[src["skip"]: src["skip"] + src["take"]]
-             base = f"https://huggingface.co/datasets/{src['repo']}/resolve/main/"
-             urls = [base + f for f in files]
-             fmt = "parquet"
+             all_files = sorted([
+                 f for f in api.list_repo_files(src["repo"], repo_type="dataset")
+                 if f.startswith(src["prefix"]) and f.endswith(".parquet")
+             ])
+             selected = all_files[src["skip"]: src["skip"] + src["take"]]
+             base_url = f"https://huggingface.co/datasets/{src['repo']}/resolve/main/"
+             urls = [base_url + f for f in selected]
+             fmt = "parquet"
          else:
              urls = src["urls"]
              fmt = src.get("fmt", "parquet")
 
+         added = 0
          for url in urls:
-             if url not in known:
+             if url not in known_urls:
                  state["queue"].append({
                      "url" : url,
                      "source" : name,
                      "text_col" : src["text_col"],
                      "fmt" : fmt,
                  })
-                 known.add(url)
+                 known_urls.add(url)
                  new_count += 1
+                 added += 1
 
-         print(f" {name}: {len(urls)} files | {new_count} new queued")
+         print(f" {name}: {len(urls)} files | {added} new added to queue")
 
      save_state(state)
      print(f"\nTotal queued: {len(state['queue'])} | In state: {len(state['shards'])}")
 
  # ── Reclaim stale ─────────────────────────────────────────────────────────────
  def reclaim_stale(state):
-     now = time.time()
+     now = time.time()
+     reclaimed = 0
      for name, info in state["shards"].items():
          if info["status"] == "claimed" and info.get("claimed_at"):
              if now - info["claimed_at"] > WORKER_TIMEOUT:
@@ -171,50 +147,29 @@ def reclaim_stale(state):
                  info["status"] = "pending"
                  info["worker"] = None
                  info["claimed_at"] = None
-     save_state(state)
-
- # ── Split parquet into chunks ─────────────────────────────────────────────────
- def split_parquet(src_path, name, text_col):
-     pf = pq.ParquetFile(src_path)
-     chunk_paths = []
-     chunk_idx = 0
-     current = []
-
-     for batch in pf.iter_batches(batch_size=10_000, columns=[text_col]):
-         current.append(batch)
-         if sum(len(b) for b in current) >= ROWS_PER_CHUNK:
-             chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
-             chunk_path = Path(RAW_DIR) / chunk_name
-             table = pa.Table.from_batches(current)
-             pq.write_table(table, chunk_path)
-             print(f" ✓ {chunk_name} ({len(table):,} rows)")
-             chunk_paths.append((chunk_name, text_col, "parquet"))
-             chunk_idx += 1
-             current = []
-             del table; gc.collect()
-
-     if current:
-         chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
-         chunk_path = Path(RAW_DIR) / chunk_name
-         table = pa.Table.from_batches(current)
-         pq.write_table(table, chunk_path)
-         print(f" ✓ {chunk_name} ({len(table):,} rows)")
-         chunk_paths.append((chunk_name, text_col, "parquet"))
-         del table; gc.collect()
-
-     return chunk_paths
+                 reclaimed += 1
+     if reclaimed:
+         save_state(state)
 
- def copy_jsonl(src_path, name):
-     """JSONL files are small enough to use directly — just copy to raw dir."""
-     import shutil
-     dst = Path(RAW_DIR) / name
-     shutil.copy2(src_path, dst)
-     return [(name, "text", "jsonl")]
+ # ── Parquet → JSONL ───────────────────────────────────────────────────────────
+ def parquet_to_jsonl(parquet_path, jsonl_path, text_col):
+     """Stream parquet batch by batch → write one JSON line per doc. No full load."""
+     pf = pq.ParquetFile(parquet_path)
+     n_written = 0
+     with open(jsonl_path, "w", encoding="utf-8") as out:
+         for batch in pf.iter_batches(batch_size=1_000, columns=[text_col]):
+             texts = batch.column(text_col).to_pylist()
+             for text in texts:
+                 if text and isinstance(text, str) and text.strip():
+                     out.write(json.dumps({"text": text.strip()}, ensure_ascii=False) + "\n")
+                     n_written += 1
+             del texts
+             gc.collect()
+     return n_written
 
  # ── Download loop ─────────────────────────────────────────────────────────────
  def download_loop(state):
      while True:
-         # Reload state
          try:
              with open(STATE_FILE) as f:
                  fresh = json.load(f)
@@ -245,55 +200,51 @@ def download_loop(state):
          source = entry["source"]
          text_col = entry["text_col"]
          fmt = entry.get("fmt", "parquet")
-         ext = ".jsonl" if fmt == "jsonl" else ".parquet"
-         name = friendly_name(source, url)
-         if not name.endswith(ext):
-             name = name.split(".")[0] + ext
-         raw_path = Path(RAW_DIR) / name
-         tmp_path = Path(RAW_DIR) / f"{name}.tmp"
 
-         print(f" Downloading: {source} | {url.split('/')[-1].split('?')[0]}")
+         base_name = url.split("?")[0].split("/")[-1].replace(".parquet", "").replace(".jsonl", "")
+         shard_name = f"{source}__{base_name}.jsonl"
+         jsonl_path = Path(RAW_DIR) / shard_name
+         tmp_path = Path(RAW_DIR) / f"{shard_name}.tmp"
+
+         print(f" Downloading: {source} | {base_name}")
          try:
              resp = requests.get(url, headers=AUTH_HEADERS, timeout=300, stream=True)
              resp.raise_for_status()
              with open(tmp_path, "wb") as f:
                  for chunk in resp.iter_content(chunk_size=8 * 1024 * 1024):
                      f.write(chunk)
-             tmp_path.rename(raw_path)
          except Exception as e:
              print(f" ✗ Download failed: {e} — retrying in 30s")
              tmp_path.unlink(missing_ok=True)
              time.sleep(30)
              continue
 
-         print(f" Processing: {name}")
-         try:
-             if fmt == "parquet":
-                 chunks = split_parquet(raw_path, name, text_col)
-             else:
-                 chunks = copy_jsonl(raw_path, name)
-         except Exception as e:
-             print(f" ✗ Processing failed: {e}")
-             raw_path.unlink(missing_ok=True)
-             time.sleep(30)
-             continue
+         if fmt == "parquet":
+             print(f" Converting → jsonl: {shard_name}")
+             try:
+                 n = parquet_to_jsonl(tmp_path, jsonl_path, text_col)
+                 tmp_path.unlink(missing_ok=True)
+                 print(f" ✓ {n:,} docs")
+             except Exception as e:
+                 print(f" ✗ Convert failed: {e}")
+                 tmp_path.unlink(missing_ok=True)
+                 jsonl_path.unlink(missing_ok=True)
+                 time.sleep(30)
+                 continue
+         else:
+             tmp_path.rename(jsonl_path)
 
-         raw_path.unlink(missing_ok=True)
          state["queue"].pop(0)
-
-         for chunk_name, col, chunk_fmt in chunks:
-             state["shards"][chunk_name] = {
-                 "status" : "pending",
-                 "url" : url,
-                 "source" : source,
-                 "text_col" : col,
-                 "fmt" : chunk_fmt,
-                 "worker" : None,
-                 "claimed_at": None,
-                 "error" : None,
-             }
+         state["shards"][shard_name] = {
+             "status" : "pending",
+             "url" : url,
+             "source" : source,
+             "worker" : None,
+             "claimed_at": None,
+             "error" : None,
+         }
          save_state(state)
-         print(f" ✓ {len(chunks)} chunks ready from {name}")
+         print(f" ✓ Ready: {shard_name}")
          time.sleep(3)
 
  # ── Monitor ───────────────────────────────────────────────────────────────────
@@ -311,7 +262,6 @@ def monitor_loop():
          total = len(shards) + len(queue)
          pct = (done / total * 100) if total else 0
 
-         # Per-source breakdown
          src_done = {}
          for v in shards.values():
              src = v.get("source", "?")
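
For reference, a rough sketch of the /data/state.json layout this version of app.py produces. The field names match what discover_all() and download_loop() write; the URLs, file names and the shard key below are made up purely for illustration.

# Hypothetical snapshot of /data/state.json, shown as a Python literal.
# Note that shard entries no longer carry "text_col"/"fmt": every shard on disk
# is already JSONL with a single "text" field after parquet_to_jsonl().
example_state = {
    "queue": [
        {   # discovered by discover_all(), not yet downloaded (illustrative URL)
            "url": "https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00005-of-00041.parquet",
            "source": "wikipedia",
            "text_col": "text",
            "fmt": "parquet",
        },
    ],
    "shards": {
        # key = f"{source}__{base_name}.jsonl", built in download_loop()
        "wikipedia__train-00002-of-00041.jsonl": {
            "status": "pending",   # reclaim_stale() resets stale "claimed" entries
            "url": "https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00002-of-00041.parquet",
            "source": "wikipedia",
            "worker": None,
            "claimed_at": None,
            "error": None,
        },
    },
}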
 
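
The worker side is not part of this file, but reclaim_stale() and the shard fields above imply a simple claim protocol: a worker flips a pending shard to claimed with a timestamp, and the coordinator flips it back to pending if the claim is older than WORKER_TIMEOUT (600 s). A minimal sketch of that claim step, assuming a single worker id and ignoring cross-process locking:

# Hypothetical worker-side claim (not in this commit). Field names and the
# tmp-then-os.replace save mirror app.py; worker_id and the absence of any
# file locking are assumptions made for this sketch.
import json, os, time

STATE_FILE = "/data/state.json"

def claim_next_shard(worker_id):
    with open(STATE_FILE) as f:
        state = json.load(f)
    for shard_name, info in state["shards"].items():
        if info["status"] == "pending":
            info["status"] = "claimed"
            info["worker"] = worker_id
            info["claimed_at"] = time.time()
            tmp = STATE_FILE + ".tmp"
            with open(tmp, "w") as f:
                json.dump(state, f, indent=2)
            os.replace(tmp, STATE_FILE)
            return shard_name   # caller then reads the JSONL shard under /data/raw
    return None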