researchpilot-api / fix_categories.py
Subhadip007's picture
feat: vector database indexing complete
daafb32
raw
history blame contribute delete
585 Bytes
import json
from config.settings import RAW_DIR
from pathlib import Path
# Backfill "primary_category" in every raw metadata JSON file under RAW_DIR
# where that key is missing or empty, then report how many files were rewritten.
fixed = 0
for f in RAW_DIR.glob("*.json"):
    # paper_index.json lives in the same directory but is skipped by name.
    if f.name == "paper_index.json":
        continue

    with open(f, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    # Files that already carry a non-empty primary category are left untouched
    # (not rewritten, not counted).
    if data.get("primary_category"):
        continue

    cats = data.get("categories") or []
    if isinstance(cats, str):
        # A string-valued "categories" (e.g. "cs.LG stat.ML") must be split
        # into tokens first; indexing the raw string would yield only its
        # first character.
        cats = cats.replace(",", " ").split()

    # First listed category wins; "cs.LG" is the fallback when none is listed.
    data["primary_category"] = cats[0] if cats else "cs.LG"

    with open(f, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent=2, ensure_ascii=False)
    fixed += 1

print(f"Fixed {fixed} raw metadata files")