yfan07 committed
Commit c4c6335 · verified
1 Parent(s): 209e7b5

Add files using upload-large-folder tool

Files changed (50)
  1. .gitattributes +1 -0
  2. ChatUniVi/model/multimodal_encoder/__pycache__/processor.cpython-310.pyc +0 -0
  3. configs/__pycache__/__init__.cpython-310.pyc +0 -0
  4. configs/config.py +1 -0
  5. datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  6. models/__pycache__/__init__.cpython-310.pyc +0 -0
  7. models/__pycache__/avs_model.cpython-310.pyc +0 -0
  8. models/llava/__pycache__/__init__.cpython-310.pyc +0 -0
  9. models/llava/__pycache__/conversation.cpython-310.pyc +0 -0
  10. models/llava/model/__pycache__/__init__.cpython-310.pyc +0 -0
  11. models/llava/model/__pycache__/llava_arch.cpython-310.pyc +0 -0
  12. models/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc +0 -0
  13. models/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc +0 -0
  14. models/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc +0 -0
  15. models/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc +0 -0
  16. models/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc +0 -0
  17. models/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc +0 -0
  18. models/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc +0 -0
  19. models/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc +0 -0
  20. models/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc +0 -0
  21. models/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc +0 -0
  22. models/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc +0 -0
  23. models/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc +0 -0
  24. models/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc +0 -0
  25. models/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  26. models/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
  27. models/segment_anything/__pycache__/__init__.cpython-310.pyc +0 -0
  28. models/segment_anything/__pycache__/automatic_mask_generator.cpython-310.pyc +0 -0
  29. models/segment_anything/__pycache__/build_sam.cpython-310.pyc +0 -0
  30. models/segment_anything/__pycache__/predictor.cpython-310.pyc +0 -0
  31. models/segment_anything/modeling/__pycache__/__init__.cpython-310.pyc +0 -0
  32. models/segment_anything/modeling/__pycache__/common.cpython-310.pyc +0 -0
  33. models/segment_anything/modeling/__pycache__/image_encoder.cpython-310.pyc +0 -0
  34. models/segment_anything/modeling/__pycache__/mask_decoder.cpython-310.pyc +0 -0
  35. models/segment_anything/modeling/__pycache__/prompt_encoder.cpython-310.pyc +0 -0
  36. models/segment_anything/modeling/__pycache__/sam.cpython-310.pyc +0 -0
  37. models/segment_anything/modeling/__pycache__/transformer.cpython-310.pyc +0 -0
  38. models/segment_anything/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  39. models/segment_anything/utils/__pycache__/amg.cpython-310.pyc +0 -0
  40. models/tf/__pycache__/modeling_outputs.cpython-310.pyc +0 -0
  41. runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz +3 -0
  42. runs/tubetoken_phase_minus1/audit_full.log +47 -0
  43. runs/tubetoken_phase_minus1/audit_full/audit_report.md +34 -0
  44. runs/tubetoken_phase_minus1/audit_full/audit_samples.csv +0 -0
  45. tools/audit_refavs.py +371 -0
  46. tools/tubetoken/__pycache__/evaluate_oracle_refine_sam2.cpython-312.pyc +0 -0
  47. tools/tubetoken/evaluate_oracle_refine_sam2.py +203 -0
  48. utils/__pycache__/__init__.cpython-310.pyc +0 -0
  49. utils/metric/__pycache__/pyutils.cpython-310.pyc +0 -0
  50. utils/metric/__pycache__/utility.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -6,3 +6,4 @@
*.bin filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
ChatUniVi/eval/questions/scienceqa/problems.json filter=lfs diff=lfs merge=lfs -text
+ runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz filter=lfs diff=lfs merge=lfs -text
ChatUniVi/model/multimodal_encoder/__pycache__/processor.cpython-310.pyc ADDED
Binary file (2.38 kB). View file
 
configs/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (167 Bytes). View file
 
configs/config.py CHANGED
@@ -53,6 +53,7 @@ parser.add_argument("--log_root",type=str,default='log', help="where to save log
parser.add_argument("--checkpoint_root",type=str,default='checkpoints', help="where to save trained checkpoints during training")

parser.add_argument("--visualization_root",type=str,default='visualization', help="where to save visualization result during test")
+ parser.add_argument("--eval_splits",type=str,default='test_s,test_u,test_n', help="comma-separated eval splits for load_model.py: test_s,test_u,test_n")

datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (178 Bytes). View file
 
models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (131 Bytes). View file
 
models/__pycache__/avs_model.cpython-310.pyc ADDED
Binary file (11.4 kB). View file
 
models/llava/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (188 Bytes). View file
 
models/llava/__pycache__/conversation.cpython-310.pyc ADDED
Binary file (10.4 kB). View file
 
models/llava/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (330 Bytes). View file
 
models/llava/model/__pycache__/llava_arch.cpython-310.pyc ADDED
Binary file (8.2 kB). View file
 
models/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc ADDED
Binary file (3.6 kB). View file
 
models/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc ADDED
Binary file (4.85 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc ADDED
Binary file (2.24 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc ADDED
Binary file (12.3 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc ADDED
Binary file (2.9 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc ADDED
Binary file (8.86 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc ADDED
Binary file (757 Bytes). View file
 
models/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc ADDED
Binary file (21.4 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc ADDED
Binary file (19.8 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc ADDED
Binary file (3.83 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc ADDED
Binary file (15.7 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc ADDED
Binary file (3 kB). View file
 
models/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc ADDED
Binary file (9.31 kB). View file
 
models/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (571 Bytes). View file
 
models/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc ADDED
Binary file (3.03 kB). View file
 
models/segment_anything/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (407 Bytes). View file
 
models/segment_anything/__pycache__/automatic_mask_generator.cpython-310.pyc ADDED
Binary file (11.4 kB). View file
 
models/segment_anything/__pycache__/build_sam.cpython-310.pyc ADDED
Binary file (2.17 kB). View file
 
models/segment_anything/__pycache__/predictor.cpython-310.pyc ADDED
Binary file (9.98 kB). View file
 
models/segment_anything/modeling/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (394 Bytes). View file
 
models/segment_anything/modeling/__pycache__/common.cpython-310.pyc ADDED
Binary file (1.75 kB). View file
 
models/segment_anything/modeling/__pycache__/image_encoder.cpython-310.pyc ADDED
Binary file (12.9 kB). View file
 
models/segment_anything/modeling/__pycache__/mask_decoder.cpython-310.pyc ADDED
Binary file (6.26 kB). View file
 
models/segment_anything/modeling/__pycache__/prompt_encoder.cpython-310.pyc ADDED
Binary file (7.85 kB). View file
 
models/segment_anything/modeling/__pycache__/sam.cpython-310.pyc ADDED
Binary file (6.7 kB). View file
 
models/segment_anything/modeling/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (6.61 kB). View file
 
models/segment_anything/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (154 Bytes). View file
 
models/segment_anything/utils/__pycache__/amg.cpython-310.pyc ADDED
Binary file (12.1 kB). View file
 
models/tf/__pycache__/modeling_outputs.cpython-310.pyc ADDED
Binary file (2.94 kB). View file
 
runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f09a2efd79256574c47624d11d5ae5ccff4e267abb961dc63375d862c8db958
size 1418418
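The `.npz` itself is stored through Git LFS, so only the pointer appears in the diff. Based on how `tools/tubetoken/evaluate_oracle_refine_sam2.py` (added below) reads proposal files, the archive is expected to expose a boolean `masks` array of proposal tubes. A minimal inspection sketch; the local path and the exact array layout are assumptions:

```python
import numpy as np

# Inspect one phase-0 proposal file; the path is taken from this commit's file list.
path = "runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz"
with np.load(path) as data:
    masks = data["masks"].astype(bool)  # evaluate_oracle_refine_sam2.py indexes this key
    # Assumed layout: (num_proposals, num_frames, H, W); the exact shape depends on the generator.
    print(masks.shape, masks.dtype)
```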
runs/tubetoken_phase_minus1/audit_full.log ADDED
@@ -0,0 +1,47 @@
{
  "area_unstable_expressions": 41,
  "audio_keyword_expressions": 15890,
  "audio_keyword_percent": 77.66753018231586,
  "data_dir": "/workspace/SimToken/data",
  "expressions_per_object": {
    "ge2": 5836,
    "ge3": 4206,
    "max": 10,
    "mean": 2.742125720412813,
    "median": 3
  },
  "expressions_per_video": {
    "ge2": 3521,
    "ge3": 3381,
    "max": 26,
    "mean": 5.7243984331281474,
    "median": 6.0
  },
  "h3_candidate_expressions": 18614,
  "h3_candidate_objects": 5781,
  "late_target_expressions": 0,
  "mask_rows_audited": 20459,
  "multi_expression_objects": 5836,
  "multi_expression_videos": 3521,
  "null_split_expressions": 1028,
  "null_split_percent": 5.0246835133682,
  "num_expressions": 20459,
  "num_objects_vid_fid": 7461,
  "num_videos": 3574,
  "partial_target_expressions": 33,
  "same_category_distractor_heuristic_expressions": 2563,
  "same_category_distractor_heuristic_percent": 12.527494012415074,
  "small_target_expressions": 10037,
  "spatial_keyword_expressions": 5924,
  "spatial_keyword_percent": 28.955471919448655,
  "splits": {
    "TODO": 25,
    "test_n": 1028,
    "test_s": 2288,
    "test_u": 1656,
    "train": 14113,
    "val": 1349
  }
}

Wrote audit files to: /workspace/SimToken/runs/tubetoken_phase_minus1/audit_full
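The log ends with the same summary dict that the audit script also writes to `audit_summary.json`, so downstream checks should read the JSON file rather than parse the log. A small sketch that re-derives one of the reported percentages from the raw counts; the file path is assumed from the log's last line:

```python
import json

# audit_summary.json carries the same dict printed in the log above.
with open("runs/tubetoken_phase_minus1/audit_full/audit_summary.json") as f:
    summary = json.load(f)

# null_split_percent is 100 * null_split_expressions / num_expressions
pct = 100.0 * summary["null_split_expressions"] / summary["num_expressions"]
print(pct)  # 100 * 1028 / 20459 ≈ 5.02, matching "null_split_percent"
```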
runs/tubetoken_phase_minus1/audit_full/audit_report.md ADDED
@@ -0,0 +1,34 @@
# TubeToken Phase -1 Audit

- Expressions: 20459
- Videos: 3574
- Objects `(vid, fid)`: 7461
- Splits: `{'val': 1349, 'train': 14113, 'test_s': 2288, 'TODO': 25, 'test_u': 1656, 'test_n': 1028}`

## Multi-expression

- Expressions/video mean: 5.724
- Expressions/video median: 6.0
- Videos with >=2 expressions: 3521
- Expressions/object mean: 2.742
- Objects with >=2 expressions: 5836
- H3 candidate objects: 5781
- H3 candidate expressions: 18614

## Diagnostic Subsets

- Null split expressions: 1028 (5.02%)
- Audio-keyword expressions: 15890 (77.67%)
- Spatial-keyword expressions: 5924 (28.96%)
- Same-category distractor heuristic expressions: 2563 (12.53%)
- Mask rows audited: 20459
- Late-target expressions: 0
- Small-target expressions: 10037
- Partial-target expressions: 33
- Area-unstable expressions: 41

## Phase -1 H3 Decision Hint

H3 can stay as a direct validation target: the data has multi-expression structure.

Generated files: `audit_summary.json`, `audit_samples.csv`, `h3_candidates.csv`.
runs/tubetoken_phase_minus1/audit_full/audit_samples.csv ADDED
The diff for this file is too large to render. See raw diff
 
tools/audit_refavs.py ADDED
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""Audit Ref-AVS style metadata for the TubeToken experiment plan.

This script intentionally depends only on the dataset files. It does not import
the training code, so it can run before model dependencies are fully settled.
"""

from __future__ import annotations

import argparse
import csv
import json
import math
import os
from collections import Counter, defaultdict
from pathlib import Path
from statistics import mean, median
from typing import Dict, Iterable, List, Optional, Tuple

try:
    from PIL import Image
except Exception:  # pragma: no cover - only used as an environment fallback
    Image = None


AUDIO_KEYWORDS = (
    "sound",
    "sounding",
    "making sound",
    "longest sound",
    "intermittent sound",
    "silent",
    "audio",
    "heard",
    "emitting",
    "playing instrument",
    "voice",
    "speaking",
    "talking",
    "singing",
    "barking",
    "meowing",
    "hitting",
)

SPATIAL_KEYWORDS = (
    "left",
    "right",
    "top",
    "bottom",
    "front",
    "back",
    "behind",
    "next to",
    "near",
    "far",
    "middle",
    "center",
    "between",
    "above",
    "below",
    "under",
)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Audit Ref-AVS data for TubeToken Phase -1.")
    parser.add_argument("--data_dir", type=Path, default=Path("data"))
    parser.add_argument("--out_dir", type=Path, default=Path("runs/tubetoken_phase_minus1/audit"))
    parser.add_argument("--frames", type=int, default=10)
    parser.add_argument("--small_area", type=float, default=0.05)
    parser.add_argument("--mask_sample_limit", type=int, default=0, help="0 means audit every row.")
    return parser.parse_args()


def read_metadata(path: Path) -> List[dict]:
    with path.open("r", newline="") as f:
        return list(csv.DictReader(f))


def video_id(row: dict) -> str:
    return row.get("vid") or row["uid"].rsplit("_", 2)[0]


def fid_value(row: dict) -> str:
    return str(row.get("fid", "")).strip()


def object_key(row: dict) -> Tuple[str, str]:
    return video_id(row), fid_value(row)


def category_from_uid(row: dict) -> str:
    vid = video_id(row)
    uid = row.get("uid", "")
    suffix = uid[len(vid) + 1 :] if uid.startswith(vid + "_") else uid.rsplit("_", 2)[-2]
    if "_" in suffix:
        return suffix.rsplit("_", 1)[0]
    return suffix


def has_any(text: str, keywords: Iterable[str]) -> bool:
    text = text.lower()
    return any(k in text for k in keywords)


def mask_path(data_dir: Path, vid: str, fid: str, t: int) -> Path:
    return data_dir / "gt_mask" / vid / f"fid_{fid}" / f"0000{t}.png"


def read_binary_mask_stats(path: Path) -> Optional[Tuple[int, int, int]]:
    if Image is None or not path.exists():
        return None
    with Image.open(path) as img:
        gray = img.convert("L")
        width, height = gray.size
        hist = gray.histogram()
        positive = sum(hist[1:])
    return positive, width, height


def row_mask_stats(data_dir: Path, row: dict, frames: int, small_area: float) -> dict:
    vid = video_id(row)
    fid = fid_value(row)
    positives: List[int] = []
    areas: List[float] = []
    missing = 0
    width = height = None

    for t in range(frames):
        stats = read_binary_mask_stats(mask_path(data_dir, vid, fid, t))
        if stats is None:
            missing += 1
            positives.append(0)
            areas.append(0.0)
            continue
        pos, width, height = stats
        positives.append(pos)
        denom = max(width * height, 1)
        areas.append(pos / denom)

    visible = [i for i, pos in enumerate(positives) if pos > 0]
    visible_areas = [areas[i] for i in visible]
    first_visible = min(visible) if visible else None
    mean_visible_area = mean(visible_areas) if visible_areas else 0.0
    mean_all_area = mean(areas) if areas else 0.0
    area_cv = 0.0
    if len(visible_areas) > 1 and mean_visible_area > 0:
        var = sum((x - mean_visible_area) ** 2 for x in visible_areas) / len(visible_areas)
        area_cv = math.sqrt(var) / mean_visible_area

    return {
        "visible_frames": len(visible),
        "visible_ratio": len(visible) / frames,
        "first_visible": first_visible,
        "late_target": first_visible is not None and first_visible > 0.5 * frames,
        "mean_visible_area": mean_visible_area,
        "mean_all_area": mean_all_area,
        "small_target": mean_visible_area > 0 and mean_visible_area < small_area,
        "partial_target": 0 < len(visible) < 0.5 * frames,
        "area_cv": area_cv,
        "area_unstable": area_cv >= 1.0,
        "missing_masks": missing,
        "width": width,
        "height": height,
    }


def pct(num: int, den: int) -> float:
    return 0.0 if den == 0 else 100.0 * num / den


def summarize_counts(values: List[int]) -> dict:
    if not values:
        return {"mean": 0, "median": 0, "max": 0, "ge2": 0, "ge3": 0}
    return {
        "mean": mean(values),
        "median": median(values),
        "max": max(values),
        "ge2": sum(v >= 2 for v in values),
        "ge3": sum(v >= 3 for v in values),
    }


def write_csv(path: Path, rows: List[dict], fieldnames: List[str]) -> None:
    with path.open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow({k: row.get(k, "") for k in fieldnames})


def main() -> None:
    args = parse_args()
    data_dir = args.data_dir
    out_dir = args.out_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = read_metadata(data_dir / "metadata.csv")
    if args.mask_sample_limit > 0:
        mask_rows = rows[: args.mask_sample_limit]
    else:
        mask_rows = rows

    by_split = Counter(row["split"] for row in rows)
    by_video: Dict[str, List[dict]] = defaultdict(list)
    by_object: Dict[Tuple[str, str], List[dict]] = defaultdict(list)
    by_video_category: Dict[Tuple[str, str], set] = defaultdict(set)

    enriched: List[dict] = []
    for row in rows:
        vid = video_id(row)
        fid = fid_value(row)
        category = category_from_uid(row)
        expr = row.get("exp", "")
        row2 = dict(row)
        row2["vid"] = vid
        row2["fid"] = fid
        row2["category"] = category
        row2["is_null_split"] = row.get("split") == "test_n"
        row2["is_audio_keyword"] = has_any(expr, AUDIO_KEYWORDS)
        row2["is_spatial_keyword"] = has_any(expr, SPATIAL_KEYWORDS)
        by_video[vid].append(row2)
        by_object[(vid, fid)].append(row2)
        by_video_category[(vid, category)].add(fid)
        enriched.append(row2)

    mask_stats_by_uid: Dict[str, dict] = {}
    for row in mask_rows:
        uid = row["uid"]
        mask_stats_by_uid[uid] = row_mask_stats(data_dir, row, args.frames, args.small_area)

    sample_rows: List[dict] = []
    for row in enriched:
        stats = mask_stats_by_uid.get(row["uid"], {})
        same_cat_fids = by_video_category[(row["vid"], row["category"])]
        row2 = dict(row)
        row2.update(stats)
        row2["same_category_distractor_heuristic"] = len(same_cat_fids) >= 2
        row2["multi_expr_video"] = len(by_video[row["vid"]]) >= 2
        row2["multi_expr_object"] = len(by_object[(row["vid"], row["fid"])]) >= 2
        row2["h3_candidate"] = row2["multi_expr_object"] and not row2["is_null_split"]
        sample_rows.append(row2)

    video_expr_counts = [len(v) for v in by_video.values()]
    object_expr_counts = [len(v) for v in by_object.values()]
    h3_objects = [k for k, v in by_object.items() if len(v) >= 2 and v[0]["split"] != "test_n"]
    null_rows = [r for r in enriched if r["is_null_split"]]
    audio_rows = [r for r in enriched if r["is_audio_keyword"]]
    spatial_rows = [r for r in enriched if r["is_spatial_keyword"]]
    same_cat_rows = [r for r in sample_rows if r.get("same_category_distractor_heuristic")]

    audited_mask_rows = [r for r in sample_rows if "visible_ratio" in r]
    late_rows = [r for r in audited_mask_rows if r.get("late_target")]
    small_rows = [r for r in audited_mask_rows if r.get("small_target")]
    partial_rows = [r for r in audited_mask_rows if r.get("partial_target")]
    unstable_rows = [r for r in audited_mask_rows if r.get("area_unstable")]

    summary = {
        "data_dir": str(data_dir),
        "num_expressions": len(rows),
        "num_videos": len(by_video),
        "num_objects_vid_fid": len(by_object),
        "splits": dict(by_split),
        "expressions_per_video": summarize_counts(video_expr_counts),
        "expressions_per_object": summarize_counts(object_expr_counts),
        "multi_expression_videos": sum(c >= 2 for c in video_expr_counts),
        "multi_expression_objects": sum(c >= 2 for c in object_expr_counts),
        "h3_candidate_objects": len(h3_objects),
        "h3_candidate_expressions": sum(len(by_object[k]) for k in h3_objects),
        "null_split_expressions": len(null_rows),
        "null_split_percent": pct(len(null_rows), len(rows)),
        "audio_keyword_expressions": len(audio_rows),
        "audio_keyword_percent": pct(len(audio_rows), len(rows)),
        "spatial_keyword_expressions": len(spatial_rows),
        "spatial_keyword_percent": pct(len(spatial_rows), len(rows)),
        "same_category_distractor_heuristic_expressions": len(same_cat_rows),
        "same_category_distractor_heuristic_percent": pct(len(same_cat_rows), len(rows)),
        "mask_rows_audited": len(audited_mask_rows),
        "late_target_expressions": len(late_rows),
        "small_target_expressions": len(small_rows),
        "partial_target_expressions": len(partial_rows),
        "area_unstable_expressions": len(unstable_rows),
    }

    with (out_dir / "audit_summary.json").open("w") as f:
        json.dump(summary, f, indent=2, sort_keys=True)

    fields = [
        "uid",
        "vid",
        "split",
        "fid",
        "category",
        "exp",
        "is_null_split",
        "is_audio_keyword",
        "is_spatial_keyword",
        "multi_expr_video",
        "multi_expr_object",
        "h3_candidate",
        "same_category_distractor_heuristic",
        "visible_frames",
        "visible_ratio",
        "first_visible",
        "late_target",
        "mean_visible_area",
        "mean_all_area",
        "small_target",
        "partial_target",
        "area_cv",
        "area_unstable",
        "missing_masks",
        "width",
        "height",
    ]
    write_csv(out_dir / "audit_samples.csv", sample_rows, fields)

    h3_rows = [r for r in sample_rows if r.get("h3_candidate")]
    write_csv(out_dir / "h3_candidates.csv", h3_rows, fields)

    md = [
        "# TubeToken Phase -1 Audit",
        "",
        f"- Expressions: {summary['num_expressions']}",
        f"- Videos: {summary['num_videos']}",
        f"- Objects `(vid, fid)`: {summary['num_objects_vid_fid']}",
        f"- Splits: `{dict(by_split)}`",
        "",
        "## Multi-expression",
        "",
        f"- Expressions/video mean: {summary['expressions_per_video']['mean']:.3f}",
        f"- Expressions/video median: {summary['expressions_per_video']['median']}",
        f"- Videos with >=2 expressions: {summary['multi_expression_videos']}",
        f"- Expressions/object mean: {summary['expressions_per_object']['mean']:.3f}",
        f"- Objects with >=2 expressions: {summary['multi_expression_objects']}",
        f"- H3 candidate objects: {summary['h3_candidate_objects']}",
        f"- H3 candidate expressions: {summary['h3_candidate_expressions']}",
        "",
        "## Diagnostic Subsets",
        "",
        f"- Null split expressions: {summary['null_split_expressions']} ({summary['null_split_percent']:.2f}%)",
        f"- Audio-keyword expressions: {summary['audio_keyword_expressions']} ({summary['audio_keyword_percent']:.2f}%)",
        f"- Spatial-keyword expressions: {summary['spatial_keyword_expressions']} ({summary['spatial_keyword_percent']:.2f}%)",
        f"- Same-category distractor heuristic expressions: {summary['same_category_distractor_heuristic_expressions']} ({summary['same_category_distractor_heuristic_percent']:.2f}%)",
        f"- Mask rows audited: {summary['mask_rows_audited']}",
        f"- Late-target expressions: {summary['late_target_expressions']}",
        f"- Small-target expressions: {summary['small_target_expressions']}",
        f"- Partial-target expressions: {summary['partial_target_expressions']}",
        f"- Area-unstable expressions: {summary['area_unstable_expressions']}",
        "",
        "## Phase -1 H3 Decision Hint",
        "",
    ]
    epv = summary["expressions_per_video"]["mean"]
    if epv > 1.5 and summary["h3_candidate_objects"] > 0:
        md.append("H3 can stay as a direct validation target: the data has multi-expression structure.")
    elif summary["h3_candidate_objects"] > 0:
        md.append("H3 should be treated as diagnostic: multi-expression objects exist, but average expressions/video is limited.")
    else:
        md.append("H3 should be downgraded: this audit did not find same-object multi-expression candidates.")
    md.append("")
    md.append("Generated files: `audit_summary.json`, `audit_samples.csv`, `h3_candidates.csv`.")

    (out_dir / "audit_report.md").write_text("\n".join(md) + "\n")
    print(json.dumps(summary, indent=2, sort_keys=True))
    print(f"\nWrote audit files to: {out_dir}")


if __name__ == "__main__":
    main()
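The audit writes `audit_samples.csv` with per-expression boolean flags (`h3_candidate`, `small_target`, and so on), which the SAM2 evaluation script below consumes through `load_audit_rows`/`bool_field` from `phase0_common` (not part of this commit). A hedged sketch of filtering that CSV directly with the standard library, independent of those helpers:

```python
import csv

# Select the H3 candidate expressions from the audit output written above.
# csv.DictWriter serializes Python booleans as the strings "True"/"False".
with open("runs/tubetoken_phase_minus1/audit_full/audit_samples.csv", newline="") as f:
    h3_rows = [row for row in csv.DictReader(f) if row["h3_candidate"] == "True"]

# Compare against h3_candidate_expressions (18614) in audit_summary.json;
# small differences are possible if splits vary within one (vid, fid) object.
print(len(h3_rows))
```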
tools/tubetoken/__pycache__/evaluate_oracle_refine_sam2.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
tools/tubetoken/evaluate_oracle_refine_sam2.py ADDED
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""Evaluate bbox-only SAM2 refinement for oracle proposal tubes."""

from __future__ import annotations

import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

from phase0_common import (
    bbox_from_mask,
    bool_field,
    evaluate_tube_jf,
    load_audit_rows,
    load_gt_tube,
    read_metadata,
    fid_value,
    video_id,
    write_json,
)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Evaluate oracle proposal bbox-only SAM2 refinement.")
    parser.add_argument("--data_dir", type=Path, required=True)
    parser.add_argument("--proposal_dir", type=Path, required=True)
    parser.add_argument("--phase0_samples", type=Path, required=True)
    parser.add_argument("--out_dir", type=Path, required=True)
    parser.add_argument("--audit_csv", type=Path, default=None)
    parser.add_argument("--splits", type=str, default="test_s,test_u")
    parser.add_argument("--sam2_repo", type=Path, default=None)
    parser.add_argument("--model_cfg", type=str, default="configs/sam2.1/sam2.1_hiera_l.yaml")
    parser.add_argument("--checkpoint", type=Path, required=True)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--frames", type=int, default=10)
    parser.add_argument("--limit_samples", type=int, default=0)
    return parser.parse_args()


def import_sam2(repo: Optional[Path]):
    if repo is not None:
        sys.path.insert(0, str(repo))
    from sam2.build_sam import build_sam2
    from sam2.sam2_image_predictor import SAM2ImagePredictor

    return build_sam2, SAM2ImagePredictor


def load_rgb(path: Path) -> np.ndarray:
    with Image.open(path) as img:
        return np.array(img.convert("RGB"))


def load_phase0_samples(path: Path) -> Dict[str, dict]:
    with path.open("r", newline="") as f:
        return {row["uid"]: row for row in csv.DictReader(f)}


def nearest_box(tube: np.ndarray, t: int) -> Optional[np.ndarray]:
    boxes = []
    for idx in range(tube.shape[0]):
        box = bbox_from_mask(tube[idx])
        if box is not None:
            boxes.append((idx, np.array(box, dtype=np.float32)))
    if not boxes:
        return None
    _, box = min(boxes, key=lambda item: abs(item[0] - t))
    return box


def predict_box_mask(predictor, image: np.ndarray, box: np.ndarray) -> np.ndarray:
    predictor.set_image(image)
    masks, scores, _ = predictor.predict(box=box, multimask_output=False)
    masks = np.asarray(masks)
    if masks.ndim == 4:
        masks = masks[0]
    if masks.ndim == 3:
        masks = masks[0]
    return masks > 0


def sample_subsets(row: dict, audit: Dict[str, dict]) -> List[str]:
    out = ["all", row["split"]]
    audit_row = audit.get(row["uid"])
    for key, name in [
        ("small_target", "small"),
        ("partial_target", "partial"),
        ("area_unstable", "area_unstable"),
        ("late_target", "late_target"),
        ("is_audio_keyword", "audio_keyword"),
        ("is_spatial_keyword", "spatial_keyword"),
        ("same_category_distractor_heuristic", "same_category"),
        ("h3_candidate", "h3_candidate"),
    ]:
        if bool_field(audit_row, key):
            out.append(name)
    return out


def empty_bucket() -> dict:
    return {"count": 0, "refined_j": 0.0, "refined_f": 0.0, "refined_jf": 0.0}


def add_bucket(bucket: dict, sample: dict) -> None:
    bucket["count"] += 1
    bucket["refined_j"] += sample["refined_j"]
    bucket["refined_f"] += sample["refined_f"]
    bucket["refined_jf"] += sample["refined_jf"]


def finalize(bucket: dict) -> dict:
    out = dict(bucket)
    if bucket["count"]:
        for key in ["refined_j", "refined_f", "refined_jf"]:
            out[key] = bucket[key] / bucket["count"]
    return out


def main() -> None:
    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    build_sam2, SAM2ImagePredictor = import_sam2(args.sam2_repo)
    model = build_sam2(args.model_cfg, str(args.checkpoint), device=args.device)
    predictor = SAM2ImagePredictor(model)

    splits = [s.strip() for s in args.splits.split(",") if s.strip()]
    rows = read_metadata(args.data_dir, splits)
    if args.limit_samples:
        rows = rows[: args.limit_samples]
    phase0_samples = load_phase0_samples(args.phase0_samples)
    audit = load_audit_rows(args.audit_csv) if args.audit_csv else {}

    out_rows: List[dict] = []
    summary = defaultdict(empty_bucket)

    for row in tqdm(rows, desc="Oracle bbox-only SAM2 refinement"):
        phase0 = phase0_samples[row["uid"]]
        best_idx = int(phase0["best_idx"])
        if best_idx < 0:
            continue
        vid = video_id(row)
        proposals = np.load(args.proposal_dir / f"{vid}.npz")["masks"].astype(bool)
        if best_idx >= proposals.shape[0]:
            continue
        oracle_tube = proposals[best_idx]
        refined_masks = []

        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16, enabled=args.device.startswith("cuda")):
            for t in range(args.frames):
                box = bbox_from_mask(oracle_tube[t])
                if box is None:
                    box = nearest_box(oracle_tube, t)
                if box is None:
                    refined_masks.append(np.zeros_like(oracle_tube[t], dtype=bool))
                    continue
                image = load_rgb(args.data_dir / "media" / vid / "frames" / f"{t}.jpg")
                refined_masks.append(predict_box_mask(predictor, image, np.asarray(box, dtype=np.float32)))

        refined_tube = np.stack(refined_masks, axis=0)
        gt = load_gt_tube(args.data_dir, vid, fid_value(row), args.frames)
        j, f, jf = evaluate_tube_jf(refined_tube, gt)
        sample = {
            "uid": row["uid"],
            "vid": vid,
            "split": row["split"],
            "fid": fid_value(row),
            "best_idx": best_idx,
            "refined_j": j,
            "refined_f": f,
            "refined_jf": jf,
        }
        out_rows.append(sample)
        for subset in sample_subsets(row, audit):
            add_bucket(summary[subset], sample)

    with (args.out_dir / "sample_metrics.csv").open("w", newline="") as f:
        fieldnames = list(out_rows[0].keys()) if out_rows else []
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(out_rows)

    final_summary = {name: finalize(bucket) for name, bucket in sorted(summary.items())}
    write_json(args.out_dir / "summary.json", final_summary)

    md = ["# TubeToken Phase 0 Oracle Refined Evaluation", ""]
    for name, metrics in final_summary.items():
        if metrics["count"] == 0:
            continue
        md.append(f"- {name}: n={metrics['count']}, Refined J&F={metrics['refined_jf']:.4f}")
    (args.out_dir / "report.md").write_text("\n".join(md) + "\n")
    print("\n".join(md))


if __name__ == "__main__":
    main()
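The script leans on `bbox_from_mask` from `phase0_common`, which this commit does not include. For reference, a minimal sketch of what such a helper is assumed to do: return an XYXY pixel box for a boolean mask, or None when the mask is empty, matching how the box is later passed to `SAM2ImagePredictor.predict(box=...)` and how `nearest_box` is used as the empty-frame fallback.

```python
from typing import List, Optional

import numpy as np

def bbox_from_mask(mask: np.ndarray) -> Optional[List[float]]:
    """Assumed behaviour of phase0_common.bbox_from_mask: XYXY box of a boolean mask."""
    ys, xs = np.where(mask)
    if ys.size == 0:
        return None  # empty mask -> no box; the caller falls back to nearest_box()
    return [float(xs.min()), float(ys.min()), float(xs.max()), float(ys.max())]
```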
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes). View file
 
utils/metric/__pycache__/pyutils.cpython-310.pyc ADDED
Binary file (5.4 kB). View file
 
utils/metric/__pycache__/utility.cpython-310.pyc ADDED
Binary file (2.94 kB). View file