Spaces:
Running
Running
| import pandas as pd | |
| import json | |
| import re | |
# ---------------------------------------------------------------
# Configuration -- these two paths are the only values to edit.
# ---------------------------------------------------------------
# CSV that main() loads with pandas.
INPUT_CSV: str = "raw_data.csv"
# CSV that main() writes the flattened rows to.
OUTPUT_CSV: str = "cleaned_data.csv"
def extract_video_id(filename: str):
    """Return the run of digits that starts the file name, or None.

    The argument is coerced with ``str()`` and stripped of surrounding
    whitespace before matching. The digits come back as a string, so
    leading zeros survive: ``'0111.mp4' -> '0111'``. A name that does not
    begin with a digit yields ``None``.
    """
    trimmed = str(filename).strip()
    leading_digits = re.match(r"(\d+)", trimmed)
    if leading_digits is None:
        return None
    return leading_digits.group(1)
def parse_video_labels(field):
    """
    Parse a Label Studio `videoLabels` JSON field.

    Args:
        field: Raw cell value. Expected to be a JSON string holding either a
            single object or a list of objects, each with a
            "timelinelabels" (or "timelineLabels") entry and a "ranges" list
            of {"start": ..., "end": ...} objects. Anything that is not text
            (None, NaN, numbers, ...) is treated as "no labels".

    Returns:
        List of dicts: [{"label": str, "start": int, "end": int}, ...],
        one per (range, label) pair, in input order. Malformed input never
        raises; the offending entry or range is skipped instead.
    """
    # Only text can be decoded as JSON; missing cells arrive as NaN/None.
    if not isinstance(field, (str, bytes, bytearray)):
        return []
    try:
        items = json.loads(field)
    except (ValueError, RecursionError):
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return []

    # A single object is handled like a one-element list.
    if not isinstance(items, list):
        items = [items]

    out = []
    for entry in items:
        # Valid JSON such as `null`, `5` or `["a"]` has no keys to read.
        if not isinstance(entry, dict):
            continue

        labels = entry.get("timelinelabels") or entry.get("timelineLabels") or []
        # normalize labels: a bare string is one label; other shapes are malformed
        if isinstance(labels, str):
            labels = [labels]
        elif not isinstance(labels, list):
            labels = []
        labels = [text for text in (str(x).strip() for x in labels) if text]

        ranges = entry.get("ranges") or []
        if not isinstance(ranges, list):
            ranges = []

        for rr in ranges:
            if not isinstance(rr, dict):
                continue
            try:
                start = int(rr.get("start"))
                end = int(rr.get("end"))
            except (TypeError, ValueError, OverflowError):
                # Missing, non-numeric or non-finite bound: skip this range.
                continue
            # Tolerate reversed ranges by ordering the bounds.
            if start > end:
                start, end = end, start
            out.extend({"label": lab, "start": start, "end": end} for lab in labels)
    return out
def main():
    """Flatten the Label Studio export in INPUT_CSV into OUTPUT_CSV.

    Reads INPUT_CSV, and for every row with a usable "filename" writes one
    output row per parsed label range from the "videoLabels" column
    (columns: filename, video_id, video_path, label, start, end). A video
    without any labels is kept as a single row whose label/start/end are
    empty. Rows whose filename is missing or blank are skipped.
    """
    df = pd.read_csv(INPUT_CSV)

    records = []
    for _, row in df.iterrows():
        raw_name = row.get("filename", "")
        # pandas reads an empty cell as NaN, and str(NaN) is the truthy
        # string "nan", so missing values must be rejected before str().
        if pd.isna(raw_name):
            continue
        filename = str(raw_name).strip()
        if not filename:
            continue

        base = {
            "filename": filename,
            "video_id": extract_video_id(filename),
            "video_path": f"videos/{filename}",
        }
        labels = parse_video_labels(row.get("videoLabels", "[]"))

        # If you want to DROP videos with no labels, replace this block with: `if not labels: continue`
        if not labels:
            records.append({**base, "label": None, "start": None, "end": None})
            continue

        for lab in labels:
            records.append(
                {
                    **base,
                    "label": lab["label"],
                    "start": lab["start"],
                    "end": lab["end"],
                }
            )

    cleaned = pd.DataFrame(
        records,
        columns=["filename", "video_id", "video_path", "label", "start", "end"],
    )
    # A None next to integers makes pandas store the column as float64, which
    # would be written as "12.0". The nullable Int64 dtype keeps whole
    # numbers and still writes missing values as empty cells.
    cleaned = cleaned.astype({"start": "Int64", "end": "Int64"})

    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Wrote {len(cleaned)} rows -> {OUTPUT_CSV}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()