Neon-tech committed on
Commit 9b6bcf4 · verified · 1 Parent(s): 7fa217b

Update app.py

Files changed (1)
  1. app.py +116 -3
app.py CHANGED
@@ -1,5 +1,118 @@
+ # process.py
+ from huggingface_hub import snapshot_download, HfApi
+ from tokenizers import Tokenizer
+ import os
+ import json
+ import threading
  import requests
+ import pandas as pd
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor
  
- r = requests.get("https://storage.googleapis.com/kagglesdsdata/datasets/10104711/15766657/tokenizer.json?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20260426%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260426T150318Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=a2e4065f0b4e848ce90bf17823fc5740a0b2679dc1206f2f6e5dcb9344c173b00b01fdc1387b72469ae6a380afd200761d3dc4c7b412cb690c08deed4df18dd0329636cd1a76f4e2ce29b525878fcb6e33b26bf83d38caec23da8bdd717d20f7dd7659038bda8f4a4d5e707bbe6864cbd5a7bad5e3b16fdc7960d25e89fe6e36c033ab185e4ebea700de8b4e2ed2202340fa78cc741aa8a5d3aa51e02a2391329d4e15a27f270fed8a8efa3e6312b5a6363dc2250f80397bca6fd330d884de96a9fce60e848c427af37ff3df417c6a6ffbbd1ad15f796097f9a940da80c05720be29d8be055130a99b5e97f9a1f8ff9968a1970a931ca591978686bc3a2df3c9")
- with open("/data/tokenizer.json", "wb") as f:
-     f.write(r.content)
+ # ── Config ──
+ HF_USERNAME = "Neon-coding"
+ DATASET_NAME = "github-code-raw"
+ RAW_DIR = "/data/codeparrot-raw"
+ OUT_DIR = "/data/by-language"
+ STATE_FILE = "/data/progress_state.json"
+ TOK_PATH = "/data/tokenizer.json"
+
+ os.makedirs(OUT_DIR, exist_ok=True)
+ os.makedirs(RAW_DIR, exist_ok=True)
+
+ # ── Load tokenizer (already in bucket) ──
+ print("✓ Loading tokenizer from bucket...")
+ tokenizer = Tokenizer.from_file(TOK_PATH)
+ SEP_TOKEN = tokenizer.token_to_id("<eos>")
+ print(f"✓ Tokenizer loaded | vocab: {tokenizer.get_vocab_size():,}")
+
+ # ── Load state ──
+ if os.path.exists(STATE_FILE):
+     with open(STATE_FILE) as f:
+         state = json.load(f)
+     print(f"Resuming - {len(state['processed_files'])} files already done")
+ else:
+     state = {"processed_files": [], "lang_tokens": {}}
+     print("Starting fresh")
+
+ lock = threading.Lock()
+
+ def save_state():
+     with open(STATE_FILE, "w") as f:
+         json.dump(state, f, indent=2)
+
+ # ── Download codeparrot ──
+ print("\nDownloading codeparrot/github-code-clean...")
+ local_dir = snapshot_download(
+     repo_id="codeparrot/github-code-clean",
+     repo_type="dataset",
+     local_dir=RAW_DIR,
+ )
+
+ parquet_files = sorted(Path(local_dir).rglob("*.parquet"))
+ print(f"Found {len(parquet_files)} parquet files")
+
+ # ── Process each file ──
+ def process_file(path):
+     fname = str(path)
+
+     if fname in state["processed_files"]:
+         print(f" SKIP {path.name}")
+         return
+
+     try:
+         df = pd.read_parquet(path)
+
+         for lang, group in df.groupby("language"):
+             lang_dir = os.path.join(OUT_DIR, lang)
+             os.makedirs(lang_dir, exist_ok=True)
+             out = os.path.join(lang_dir, f"{path.stem}.jsonl")
+
+             if os.path.exists(out):
+                 continue
+
+             texts = group["code"].dropna().tolist()
+             encoded = tokenizer.encode_batch(texts)
+             tok_count = sum(len(e.ids) for e in encoded)
+
+             group[["code"]].rename(columns={"code": "text"}).to_json(
+                 out, orient="records", lines=True
+             )
+
+             with lock:
+                 state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_count
+
+         with lock:
+             state["processed_files"].append(fname)
+             save_state()
+
+         print(f" ✓ {path.name} | langs: {list(df['language'].unique())}")
+
+     except Exception as e:
+         print(f" ✗ {path.name} ERROR: {e}")
+
+
+ with ThreadPoolExecutor(max_workers=8) as ex:
+     list(ex.map(process_file, parquet_files))
+
+ # ── Save per-language meta ──
+ print("\nSaving per-language meta.json...")
+ for lang, total_tokens in state["lang_tokens"].items():
+     lang_dir = os.path.join(OUT_DIR, lang)
+     os.makedirs(lang_dir, exist_ok=True)
+     with open(os.path.join(lang_dir, "meta.json"), "w") as f:
+         json.dump({"language": lang, "total_tokens": total_tokens}, f, indent=2)
+     print(f" {lang}: {total_tokens:,}")
+
+ # ── Push to HF ──
+ print("\nPushing to HuggingFace...")
+ api = HfApi()
+ api.upload_folder(
+     folder_path=OUT_DIR,
+     repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+     repo_type="dataset",
+ )
+
+ print("\nDone!")
+ for l, t in sorted(state["lang_tokens"].items(), key=lambda x: -x[1]):
+     print(f" {l}: {t:,}")
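
After the push, one quick sanity check is to pull a per-language meta.json back from the Hub and compare its token count against the script's final printout. The sketch below is illustrative and not part of this commit: the repo id is the one assembled from the HF_USERNAME/DATASET_NAME config above, and "Python/meta.json" is a hypothetical path standing in for whatever language directories the run actually produced.

import json
from huggingface_hub import hf_hub_download

# Hedged sketch: fetch one language's meta.json from the uploaded dataset.
# "Python" is an assumed language directory - substitute one the run wrote.
meta_path = hf_hub_download(
    repo_id="Neon-coding/github-code-raw",
    repo_type="dataset",
    filename="Python/meta.json",
)

with open(meta_path) as f:
    meta = json.load(f)

print(f"{meta['language']}: {meta['total_tokens']:,} tokens")

The JSONL shards can be fetched the same way and read with pandas.read_json(path, lines=True), since each record carries a single "text" field.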