broadfield-dev committed
Commit 09375cc · verified · 1 Parent(s): 4b5c8eb

Update dataset_gen.py

Files changed (1): dataset_gen.py (+50 -13)
dataset_gen.py CHANGED
@@ -2,37 +2,74 @@ import json
 import os
 from parser import parse_source_to_graph
 from datetime import datetime
+from huggingface_hub import HfApi
 
 OUTPUT_FILE = "pystructure_dataset.jsonl"
 
 def create_dataset_entry(code):
-    """
-    Parses code and appends a training example to the JSONL file.
-    """
     graph_data = parse_source_to_graph(code)
 
     if "error" in graph_data:
         return {"status": "error", "message": graph_data["error"]}
 
-    vectors = [n['vector'] for n in graph_data['nodes']]
+    vectors = [n['vec'] for n in graph_data['nodes']]
 
     entry = {
         "id": f"sample_{int(datetime.now().timestamp())}",
         "timestamp": datetime.now().isoformat(),
-        "source_code": code,
-        "graph_structure": {
-            "nodes": [n['id'] for n in graph_data['nodes']],
-            "edges": graph_data['connections']
-        },
-        "structural_vectors": vectors,
+        "source_code": code,  # We keep full source for training
         "meta": {
             "node_count": len(graph_data['nodes']),
-            "max_depth": max([n['level'] for n in graph_data['nodes']]) if graph_data['nodes'] else 0
+            "max_depth": max([n['lvl'] for n in graph_data['nodes']]) if graph_data['nodes'] else 0,
+            "snippet": code[:50].replace('\n', ' ') + "..."  # For UI preview
+        },
+        # Store compact structure for training
+        "structure": {
+            "vectors": vectors,
+            "edges": graph_data['connections']
         }
     }
 
-    # Append to JSONL file
     with open(OUTPUT_FILE, 'a') as f:
         f.write(json.dumps(entry) + '\n')
 
-    return {"status": "success", "file": OUTPUT_FILE, "entry_id": entry['id']}
+    return {"status": "success", "id": entry['id']}
+
+def get_dataset_stats():
+    """Reads metadata from the JSONL file without loading heavy source code."""
+    entries = []
+    if not os.path.exists(OUTPUT_FILE):
+        return []
+
+    with open(OUTPUT_FILE, 'r') as f:
+        for line in f:
+            try:
+                data = json.loads(line)
+                # Only return lightweight info for the UI table
+                entries.append({
+                    "id": data['id'],
+                    "timestamp": data['timestamp'],
+                    "node_count": data['meta']['node_count'],
+                    "snippet": data['meta']['snippet']
+                })
+            except (json.JSONDecodeError, KeyError):
+                continue
+    return entries[::-1]  # Newest first
+
+def upload_to_hub(token, repo_id):
+    """Pushes the local JSONL file to Hugging Face."""
+    if not os.path.exists(OUTPUT_FILE):
+        return {"status": "error", "message": "No dataset found."}
+
+    try:
+        api = HfApi(token=token)
+        # Upload the specific file
+        api.upload_file(
+            path_or_fileobj=OUTPUT_FILE,
+            path_in_repo="dataset.jsonl",
+            repo_id=repo_id,
+            repo_type="dataset"
+        )
+        return {"status": "success", "message": f"Uploaded to https://huggingface.co/datasets/{repo_id}"}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}