yfan07 commited on
Commit
209e7b5
·
verified ·
1 Parent(s): 0f55e72

Add files using upload-large-folder tool

Browse files
Files changed (33) hide show
  1. ChatUniVi/__pycache__/__init__.cpython-310.pyc +0 -0
  2. ChatUniVi/__pycache__/constants.cpython-310.pyc +0 -0
  3. ChatUniVi/model/__pycache__/__init__.cpython-310.pyc +0 -0
  4. ChatUniVi/model/__pycache__/arch.cpython-310.pyc +0 -0
  5. ChatUniVi/model/__pycache__/cluster.cpython-310.pyc +0 -0
  6. ChatUniVi/model/language_model/__pycache__/llama.cpython-310.pyc +0 -0
  7. ChatUniVi/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  8. ChatUniVi/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
  9. ChatUniVi/model/multimodal_encoder/__pycache__/eva_encoder.cpython-310.pyc +0 -0
  10. ChatUniVi/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc +0 -0
  11. ChatUniVi/model/multimodal_encoder/__pycache__/utils.cpython-310.pyc +0 -0
  12. ChatUniVi/model/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
  13. configs/__pycache__/config.cpython-310.pyc +0 -0
  14. configs/__pycache__/config.cpython-312.pyc +0 -0
  15. datasets/__pycache__/dataset_refavs.cpython-310.pyc +0 -0
  16. models/segment_anything/utils/__pycache__/transforms.cpython-310.pyc +0 -0
  17. runs/tubetoken_phase0/eval_stride8_n64_bidir.log +13 -0
  18. runs/tubetoken_phase0/miss_videos_r64.txt +164 -0
  19. runs/tubetoken_phase0/proposals_stride8_n64_bidir.log +1 -0
  20. runs/tubetoken_phase_minus1/audit_full/audit_summary.json +45 -0
  21. runs/tubetoken_phase_minus1/audit_full/h3_candidates.csv +0 -0
  22. runs/tubetoken_phase_minus1/simtoken_eval/eval.log +0 -0
  23. runs/tubetoken_phase_minus1/simtoken_eval/eval_null.log +11 -0
  24. tools/__pycache__/audit_refavs.cpython-312.pyc +0 -0
  25. tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-310.pyc +0 -0
  26. tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-312.pyc +0 -0
  27. tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-310.pyc +0 -0
  28. tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-312.pyc +0 -0
  29. tools/tubetoken/__pycache__/phase0_common.cpython-310.pyc +0 -0
  30. tools/tubetoken/__pycache__/phase0_common.cpython-312.pyc +0 -0
  31. tools/tubetoken/evaluate_phase0_proposals.py +234 -0
  32. tools/tubetoken/generate_sam2_proposals.py +356 -0
  33. tools/tubetoken/phase0_common.py +214 -0
ChatUniVi/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (189 Bytes). View file
 
ChatUniVi/__pycache__/constants.cpython-310.pyc ADDED
Binary file (726 Bytes). View file
 
ChatUniVi/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (236 Bytes). View file
 
ChatUniVi/model/__pycache__/arch.cpython-310.pyc ADDED
Binary file (14.2 kB). View file
 
ChatUniVi/model/__pycache__/cluster.cpython-310.pyc ADDED
Binary file (9.06 kB). View file
 
ChatUniVi/model/language_model/__pycache__/llama.cpython-310.pyc ADDED
Binary file (3.67 kB). View file
 
ChatUniVi/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (487 Bytes). View file
 
ChatUniVi/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc ADDED
Binary file (3.05 kB). View file
 
ChatUniVi/model/multimodal_encoder/__pycache__/eva_encoder.cpython-310.pyc ADDED
Binary file (3.07 kB). View file
 
ChatUniVi/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc ADDED
Binary file (14.1 kB). View file
 
ChatUniVi/model/multimodal_encoder/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.77 kB). View file
 
ChatUniVi/model/multimodal_projector/__pycache__/builder.cpython-310.pyc ADDED
Binary file (2.05 kB). View file
 
configs/__pycache__/config.cpython-310.pyc ADDED
Binary file (2.22 kB). View file
 
configs/__pycache__/config.cpython-312.pyc ADDED
Binary file (3.61 kB). View file
 
datasets/__pycache__/dataset_refavs.cpython-310.pyc ADDED
Binary file (5.04 kB). View file
 
models/segment_anything/utils/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (3.99 kB). View file
 
runs/tubetoken_phase0/eval_stride8_n64_bidir.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # TubeToken Phase 0 Proposal Evaluation
3
+
4
+ - all: n=3944, R@16=0.469, R@32=0.597, R@64=0.754, Oracle J&F=0.7491, miss=24.62%
5
+ - area_unstable: n=18, R@16=0.556, R@32=0.556, R@64=0.889, Oracle J&F=0.7114, miss=11.11%
6
+ - audio_keyword: n=2844, R@16=0.475, R@32=0.610, R@64=0.766, Oracle J&F=0.7569, miss=23.42%
7
+ - h3_candidate: n=3932, R@16=0.469, R@32=0.597, R@64=0.754, Oracle J&F=0.7488, miss=24.64%
8
+ - partial: n=8, R@16=0.250, R@32=0.250, R@64=1.000, Oracle J&F=0.8123, miss=0.00%
9
+ - same_category: n=330, R@16=0.482, R@32=0.588, R@64=0.709, Oracle J&F=0.7261, miss=29.09%
10
+ - small: n=1631, R@16=0.237, R@32=0.392, R@64=0.633, Oracle J&F=0.6367, miss=36.73%
11
+ - spatial_keyword: n=965, R@16=0.331, R@32=0.476, R@64=0.658, Oracle J&F=0.6714, miss=34.20%
12
+ - test_s: n=2288, R@16=0.326, R@32=0.483, R@64=0.657, Oracle J&F=0.6674, miss=34.27%
13
+ - test_u: n=1656, R@16=0.665, R@32=0.755, R@64=0.887, Oracle J&F=0.8618, miss=11.29%
runs/tubetoken_phase0/miss_videos_r64.txt ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -wb3JWo4qTg_380000_390000
2
+ 0BsmPd44Bic_60000_70000
3
+ 2VqnoV67t0g_290000_300000
4
+ 2wmxck5D9Vw_102000_112000
5
+ 39xq5AATMp4_42000_52000
6
+ 39xq5AATMp4_60000_70000
7
+ 45oaGY3mzlQ_5000_15000
8
+ 4DCxlVTWN8g_3000_13000
9
+ 4GUrjmIZcIQ_82000_92000
10
+ 4YLjVRZHoZI_1750000_1760000
11
+ 4ZPJ867OBo4_75000_85000
12
+ 4k63MOSjOBw_21000_31000
13
+ 4rerQA8KMzM_10000_20000
14
+ 4tASCYb3ySA_1000_11000
15
+ 50Z4O0mzAXg_150000_160000
16
+ 5FYG_Wtet7U_120000_130000
17
+ 5YLQapaxA8o_31000_41000
18
+ 5YLQapaxA8o_80000_90000
19
+ 6KT2mbCNxO4_228000_238000
20
+ 6KT2mbCNxO4_434000_444000
21
+ 6WUjWMuoEhM_10000_20000
22
+ 6tTLxIKVtfE_26000_36000
23
+ 73QQbJIeB3Y_265000_275000
24
+ 73QQbJIeB3Y_353000_363000
25
+ 73QQbJIeB3Y_95000_105000
26
+ 79T0FclnIDw_0_10000
27
+ 79w-8fTYazw_15000_25000
28
+ 7PGfd8pg86w_15000_25000
29
+ 7pNykt6zACc_275000_285000
30
+ 7pWa_kcAoMg_49000_59000
31
+ 872Lwp3MOro_6000_16000
32
+ 8hr42lVW_gk_60000_70000
33
+ 9ZQ-T83pfWk_65000_75000
34
+ 9xp46AwF9BY_38000_48000
35
+ A-e3dr7fTIs_0_10000
36
+ A-e3dr7fTIs_318000_328000
37
+ A-e3dr7fTIs_80000_90000
38
+ AxjZti5_t94_90000_100000
39
+ BRAQLY85x5U_75000_85000
40
+ BSkcM4ScyEs_150000_160000
41
+ CCFZT2_TJr0_15000_25000
42
+ CPiOGMl59L4_30000_40000
43
+ CVA8LpBW3Sc_76000_86000
44
+ CjQmcO_Q5d8_42000_52000
45
+ De5YhqX0jdI_0_10000
46
+ DzXVMjsZloE_0_10000
47
+ EKM1wu2vXy4_26000_36000
48
+ EQWvv8WbU04_22000_32000
49
+ ETmo71vP7tA_20000_30000
50
+ EuU2PWB1t2g_20000_30000
51
+ EzPcuNoSHMM_0_10000
52
+ F1x1ck3OySg_10000_20000
53
+ F3KWGQfJ2HM_26000_36000
54
+ GG-PF_JxeW4_1640000_1650000
55
+ HuFzVoyayj8_521000_531000
56
+ JgkUToh3HeY_49000_59000
57
+ KXvmc3dLn3E_720000_730000
58
+ LOFX2UVozf8_80000_90000
59
+ LitaFzObEEk_5000_15000
60
+ MDyjY3uiWp0_273000_283000
61
+ Maa21OL-40Q_20000_30000
62
+ Mivqxr0RS8w_18000_28000
63
+ OFgcrlxku9g_160000_170000
64
+ OGCyzmaM_kE_10000_20000
65
+ OMYuLiqSUxE_180000_190000
66
+ OMYuLiqSUxE_30000_40000
67
+ OPVEPq_r-vk_211000_221000
68
+ OX6T2z4P9fA_23000_33000
69
+ Ow9uE_v2AEg_28000_38000
70
+ PSoKYh3ea1o_60000_70000
71
+ PcdKAvd51l0_41000_51000
72
+ PdHpl04tQV8_40000_50000
73
+ Pe1LuVFTczE_106000_116000
74
+ Pe1LuVFTczE_358000_368000
75
+ QHcG-FDM75Q_113000_123000
76
+ QNiHU290owU_55000_65000
77
+ QTe-i0Pcn4s_37000_47000
78
+ RHy5nC-gRV8_668000_678000
79
+ RMF6sp6tWHM_100000_110000
80
+ Ru7m8PyMlVM_120000_130000
81
+ SFVZ2OklsVM_2715000_2725000
82
+ SFVZ2OklsVM_2765000_2775000
83
+ T9K1uy-G5qA_110000_120000
84
+ TCcD-vOUtNc_99000_109000
85
+ U1dZX1ReD88_48000_58000
86
+ URZyjoh9lbc_500000_510000
87
+ URZyjoh9lbc_560000_570000
88
+ UYUH7Jmfp3g_13000_23000
89
+ UYUH7Jmfp3g_76000_86000
90
+ UlYU9z7Y8jY_68000_78000
91
+ V9KZ5FCtG9A_15000_25000
92
+ VbEvfbj_IxU_117000_127000
93
+ VlPdfLr1FSo_7000_17000
94
+ W5yveLPTD04_211000_221000
95
+ WSy8ay1avew_60000_70000
96
+ Y735cxoG5-4_270000_280000
97
+ YAW2vMKV9pw_50000_60000
98
+ ZPYqUww_x6k_293000_303000
99
+ ZR3vnlhJuSE_101000_111000
100
+ ZU0JSxWk1Po_16000_26000
101
+ _-apT0tfo6U_16000_26000
102
+ a1nWlW629TU_15000_25000
103
+ aBmzZJZ_M8Y_9000_19000
104
+ bELyeHxF7eA_42000_52000
105
+ bcGfmy0X-CQ_30000_40000
106
+ cIbVu0ixSAo_302000_312000
107
+ dRa7aBGnStU_82000_92000
108
+ dUcOkRkz6bA_387000_397000
109
+ dVa49WwXzr8_1023000_1033000
110
+ dk_xhLkWyDo_30000_40000
111
+ ds3RKnNB-cY_8000_18000
112
+ ehlPuuiNEd8_376000_386000
113
+ f0jKjIOFzAY_243000_253000
114
+ fTobKZBbMos_2000_12000
115
+ fWvQqgSDUPU_10000_20000
116
+ hYt2Qf438l8_40000_50000
117
+ hornh-NQBHY_262000_272000
118
+ iSBtK1T10Ew_415000_425000
119
+ jj1UxRTBaNw_210000_220000
120
+ k35blcO8Z7k_700000_710000
121
+ kF2y7RIC7-Y_258000_268000
122
+ kF2y7RIC7-Y_385000_395000
123
+ lcOP60uXMeI_20000_30000
124
+ lcOP60uXMeI_396000_406000
125
+ lzzMHoi3r2w_50000_60000
126
+ mUN93MlvX64_42000_52000
127
+ nT0PHpAlvys_123000_133000
128
+ nT0PHpAlvys_313000_323000
129
+ nT0PHpAlvys_435000_445000
130
+ oRSCL3149fI_146000_156000
131
+ oRSCL3149fI_87000_97000
132
+ oSMvY0tErC4_100000_110000
133
+ oSMvY0tErC4_50000_60000
134
+ oVK2QsKq8ak_186000_196000
135
+ oYeir4FWq_8_6000_16000
136
+ oaYIWYXFMqY_70000_80000
137
+ oyi5pKPwz9Q_3000_13000
138
+ p2u4OJKqMxE_130000_140000
139
+ p5uRwtw7S3E_243000_253000
140
+ pNV8MKNqOkI_20000_30000
141
+ pf6ZpxTFL1Y_51000_61000
142
+ puugfzdXYz4_30000_40000
143
+ q3YnhPgt-rM_118000_128000
144
+ qYtrnr4chfU_2000_12000
145
+ r4NdM595K5c_40000_50000
146
+ rozFJYWrLj0_120000_130000
147
+ rzKjN2en0H8_830000_840000
148
+ sHyhvtLTCbo_10000_20000
149
+ sMsrz5VqchQ_130000_140000
150
+ sfv_msSOYTo_10000_20000
151
+ tMorLZku6Pc_194000_204000
152
+ tZbh1cwwfv0_222000_232000
153
+ vVJ-Zhj2HvU_22000_32000
154
+ vifFbeL5rOo_9000_19000
155
+ w7vA5f0vPvQ_15000_25000
156
+ wfkdedUW-dk_50000_60000
157
+ xUQ9rDswHdw_140000_150000
158
+ xnx3u5YaNuc_402000_412000
159
+ z3Q_mZgKLrM_20000_30000
160
+ zM7QopQ3MgI_319000_329000
161
+ zM7QopQ3MgI_410000_420000
162
+ zM7QopQ3MgI_499000_509000
163
+ zPMUL7f4OOU_420000_430000
164
+ zstao4nIPmU_250000_260000
runs/tubetoken_phase0/proposals_stride8_n64_bidir.log ADDED
@@ -0,0 +1 @@
 
 
1
+
runs/tubetoken_phase_minus1/audit_full/audit_summary.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "area_unstable_expressions": 41,
3
+ "audio_keyword_expressions": 15890,
4
+ "audio_keyword_percent": 77.66753018231586,
5
+ "data_dir": "/workspace/SimToken/data",
6
+ "expressions_per_object": {
7
+ "ge2": 5836,
8
+ "ge3": 4206,
9
+ "max": 10,
10
+ "mean": 2.742125720412813,
11
+ "median": 3
12
+ },
13
+ "expressions_per_video": {
14
+ "ge2": 3521,
15
+ "ge3": 3381,
16
+ "max": 26,
17
+ "mean": 5.7243984331281474,
18
+ "median": 6.0
19
+ },
20
+ "h3_candidate_expressions": 18614,
21
+ "h3_candidate_objects": 5781,
22
+ "late_target_expressions": 0,
23
+ "mask_rows_audited": 20459,
24
+ "multi_expression_objects": 5836,
25
+ "multi_expression_videos": 3521,
26
+ "null_split_expressions": 1028,
27
+ "null_split_percent": 5.0246835133682,
28
+ "num_expressions": 20459,
29
+ "num_objects_vid_fid": 7461,
30
+ "num_videos": 3574,
31
+ "partial_target_expressions": 33,
32
+ "same_category_distractor_heuristic_expressions": 2563,
33
+ "same_category_distractor_heuristic_percent": 12.527494012415074,
34
+ "small_target_expressions": 10037,
35
+ "spatial_keyword_expressions": 5924,
36
+ "spatial_keyword_percent": 28.955471919448655,
37
+ "splits": {
38
+ "TODO": 25,
39
+ "test_n": 1028,
40
+ "test_s": 2288,
41
+ "test_u": 1656,
42
+ "train": 14113,
43
+ "val": 1349
44
+ }
45
+ }
runs/tubetoken_phase_minus1/audit_full/h3_candidates.csv ADDED
The diff for this file is too large to render. See raw diff
 
runs/tubetoken_phase_minus1/simtoken_eval/eval.log ADDED
The diff for this file is too large to render. See raw diff
 
runs/tubetoken_phase_minus1/simtoken_eval/eval_null.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seg_token_idx: 32000
2
+
3
+
4
+ model loaded
5
+
6
+ Lora deployed
7
+ trainable params: 4194304 || all params: 7709437232 || trainable%: 0.0544048012037826
8
+ saved model loaded
9
+
10
+
11
+ valuate on test_n_refer, metric: 0.011791757307946682
tools/__pycache__/audit_refavs.cpython-312.pyc ADDED
Binary file (19.6 kB). View file
 
tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-310.pyc ADDED
Binary file (8.83 kB). View file
 
tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-312.pyc ADDED
Binary file (10.4 kB). View file
 
tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-310.pyc ADDED
Binary file (13.9 kB). View file
 
tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-312.pyc ADDED
Binary file (16 kB). View file
 
tools/tubetoken/__pycache__/phase0_common.cpython-310.pyc ADDED
Binary file (8.49 kB). View file
 
tools/tubetoken/__pycache__/phase0_common.cpython-312.pyc ADDED
Binary file (12.4 kB). View file
 
tools/tubetoken/evaluate_phase0_proposals.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate proposal recall and oracle tube J/F for TubeToken Phase 0."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import csv
8
+ from collections import defaultdict
9
+ from pathlib import Path
10
+ from typing import Dict, List
11
+
12
+ import numpy as np
13
+ from tqdm import tqdm
14
+
15
+ from phase0_common import (
16
+ bool_field,
17
+ evaluate_tube_jf,
18
+ load_audit_rows,
19
+ load_gt_tube,
20
+ read_metadata,
21
+ rows_by_video,
22
+ rows_by_video,
23
+ tube_iou_all,
24
+ tube_iou_visible,
25
+ video_id,
26
+ fid_value,
27
+ write_json,
28
+ )
29
+
30
+
31
+ def parse_args() -> argparse.Namespace:
32
+ parser = argparse.ArgumentParser(description="Evaluate Phase 0 proposal cache.")
33
+ parser.add_argument("--data_dir", type=Path, required=True)
34
+ parser.add_argument("--proposal_dir", type=Path, required=True)
35
+ parser.add_argument("--out_dir", type=Path, required=True)
36
+ parser.add_argument("--audit_csv", type=Path, default=None)
37
+ parser.add_argument("--splits", type=str, default="test_s,test_u")
38
+ parser.add_argument("--frames", type=int, default=10)
39
+ parser.add_argument("--recall_ns", type=str, default="16,32,64,128")
40
+ parser.add_argument("--match_iou", type=float, default=0.5)
41
+ parser.add_argument("--limit_videos", type=int, default=0)
42
+ parser.add_argument("--video_list", type=Path, default=None)
43
+ parser.add_argument("--shard_id", type=int, default=0)
44
+ parser.add_argument("--num_shards", type=int, default=1)
45
+ parser.add_argument("--only_existing_proposals", action="store_true")
46
+ parser.add_argument("--skip_oracle_jf", action="store_true", help="Only compute Recall@N and tube IoU, useful for fast early checks.")
47
+ return parser.parse_args()
48
+
49
+
50
+ def load_proposals(path: Path) -> np.ndarray:
51
+ data = np.load(path)
52
+ return data["masks"].astype(bool)
53
+
54
+
55
+ def load_video_list(path: Path | None) -> List[str] | None:
56
+ if path is None:
57
+ return None
58
+ vids = []
59
+ for line in path.read_text().splitlines():
60
+ line = line.strip()
61
+ if line and not line.startswith("#"):
62
+ vids.append(line)
63
+ return vids
64
+
65
+
66
+ def sample_subsets(row: dict, audit: Dict[str, dict]) -> List[str]:
67
+ out = ["all", row["split"]]
68
+ audit_row = audit.get(row["uid"])
69
+ if bool_field(audit_row, "small_target"):
70
+ out.append("small")
71
+ if bool_field(audit_row, "partial_target"):
72
+ out.append("partial")
73
+ if bool_field(audit_row, "area_unstable"):
74
+ out.append("area_unstable")
75
+ if bool_field(audit_row, "late_target"):
76
+ out.append("late_target")
77
+ if bool_field(audit_row, "is_audio_keyword"):
78
+ out.append("audio_keyword")
79
+ if bool_field(audit_row, "is_spatial_keyword"):
80
+ out.append("spatial_keyword")
81
+ if bool_field(audit_row, "same_category_distractor_heuristic"):
82
+ out.append("same_category")
83
+ if bool_field(audit_row, "h3_candidate"):
84
+ out.append("h3_candidate")
85
+ return out
86
+
87
+
88
+ def empty_metrics(recall_ns: List[int]) -> dict:
89
+ return {
90
+ "count": 0,
91
+ "proposal_miss": 0,
92
+ "oracle_j": 0.0,
93
+ "oracle_f": 0.0,
94
+ "oracle_jf": 0.0,
95
+ "oracle_iou_visible": 0.0,
96
+ "oracle_iou_all": 0.0,
97
+ **{f"recall@{n}": 0 for n in recall_ns},
98
+ }
99
+
100
+
101
+ def add_metrics(bucket: dict, sample: dict, recall_ns: List[int]) -> None:
102
+ bucket["count"] += 1
103
+ bucket["proposal_miss"] += int(not sample["covered"])
104
+ bucket["oracle_j"] += sample["oracle_j"]
105
+ bucket["oracle_f"] += sample["oracle_f"]
106
+ bucket["oracle_jf"] += sample["oracle_jf"]
107
+ bucket["oracle_iou_visible"] += sample["oracle_iou_visible"]
108
+ bucket["oracle_iou_all"] += sample["oracle_iou_all"]
109
+ for n in recall_ns:
110
+ bucket[f"recall@{n}"] += int(sample[f"recall@{n}"])
111
+
112
+
113
+ def finalize(bucket: dict, recall_ns: List[int]) -> dict:
114
+ count = bucket["count"]
115
+ if count == 0:
116
+ return dict(bucket)
117
+ out = dict(bucket)
118
+ out["proposal_miss_percent"] = 100.0 * bucket["proposal_miss"] / count
119
+ for key in ["oracle_j", "oracle_f", "oracle_jf", "oracle_iou_visible", "oracle_iou_all"]:
120
+ out[key] = bucket[key] / count
121
+ for n in recall_ns:
122
+ out[f"recall@{n}"] = bucket[f"recall@{n}"] / count
123
+ return out
124
+
125
+
126
+ def main() -> None:
127
+ args = parse_args()
128
+ args.out_dir.mkdir(parents=True, exist_ok=True)
129
+ splits = [s.strip() for s in args.splits.split(",") if s.strip()]
130
+ recall_ns = [int(x) for x in args.recall_ns.split(",") if x.strip()]
131
+ audit = load_audit_rows(args.audit_csv) if args.audit_csv else {}
132
+ rows = read_metadata(args.data_dir, splits)
133
+ selected_vids = load_video_list(args.video_list)
134
+ if selected_vids is not None:
135
+ selected = set(selected_vids)
136
+ rows = [row for row in rows if video_id(row) in selected]
137
+ if args.num_shards < 1:
138
+ raise ValueError("--num_shards must be >= 1")
139
+ if args.shard_id < 0 or args.shard_id >= args.num_shards:
140
+ raise ValueError("--shard_id must be in [0, num_shards)")
141
+ if args.num_shards > 1:
142
+ vids = sorted(rows_by_video(rows).keys())
143
+ selected = {vid for idx, vid in enumerate(vids) if idx % args.num_shards == args.shard_id}
144
+ rows = [row for row in rows if video_id(row) in selected]
145
+ if args.limit_videos:
146
+ vids = sorted(rows_by_video(rows).keys())[: args.limit_videos]
147
+ rows = [row for row in rows if video_id(row) in set(vids)]
148
+ if args.only_existing_proposals:
149
+ rows = [row for row in rows if (args.proposal_dir / f"{video_id(row)}.npz").exists()]
150
+
151
+ sample_rows: List[dict] = []
152
+ summary = defaultdict(lambda: empty_metrics(recall_ns))
153
+
154
+ video_groups = rows_by_video(rows)
155
+ total_objects = sum(len({fid_value(row) for row in group}) for group in video_groups.values())
156
+
157
+ with tqdm(total=total_objects, desc="Evaluating proposal objects") as pbar:
158
+ for vid, video_rows in video_groups.items():
159
+ prop_path = args.proposal_dir / f"{vid}.npz"
160
+ if not prop_path.exists():
161
+ raise FileNotFoundError(f"Missing proposal cache: {prop_path}")
162
+ proposals = load_proposals(prop_path)
163
+ object_cache = {}
164
+
165
+ for row in video_rows:
166
+ key = fid_value(row)
167
+ if key in object_cache:
168
+ base_sample = object_cache[key]
169
+ else:
170
+ gt = load_gt_tube(args.data_dir, vid, key, args.frames)
171
+ visible_ious = np.array([tube_iou_visible(tube, gt) for tube in proposals], dtype=np.float32)
172
+ all_ious = np.array([tube_iou_all(tube, gt) for tube in proposals], dtype=np.float32)
173
+ if len(visible_ious) == 0:
174
+ best_idx = -1
175
+ best_visible = 0.0
176
+ best_all = 0.0
177
+ oracle_j = oracle_f = oracle_jf = 0.0
178
+ else:
179
+ best_idx = int(visible_ious.argmax())
180
+ best_visible = float(visible_ious[best_idx])
181
+ best_all = float(all_ious[best_idx])
182
+ if args.skip_oracle_jf:
183
+ oracle_j = oracle_f = oracle_jf = 0.0
184
+ else:
185
+ oracle_j, oracle_f, oracle_jf = evaluate_tube_jf(proposals[best_idx], gt)
186
+
187
+ base_sample = {
188
+ "vid": vid,
189
+ "fid": key,
190
+ "best_idx": best_idx,
191
+ "num_tubes": int(proposals.shape[0]),
192
+ "covered": best_visible >= args.match_iou,
193
+ "oracle_iou_visible": best_visible,
194
+ "oracle_iou_all": best_all,
195
+ "oracle_j": oracle_j,
196
+ "oracle_f": oracle_f,
197
+ "oracle_jf": oracle_jf,
198
+ }
199
+ for n in recall_ns:
200
+ top = visible_ious[: min(n, len(visible_ious))]
201
+ base_sample[f"recall@{n}"] = bool(len(top) and float(top.max()) >= args.match_iou)
202
+ object_cache[key] = base_sample
203
+ pbar.update(1)
204
+
205
+ sample = dict(base_sample)
206
+ sample.update({"uid": row["uid"], "split": row["split"]})
207
+ sample_rows.append(sample)
208
+ for subset in sample_subsets(row, audit):
209
+ add_metrics(summary[subset], sample, recall_ns)
210
+
211
+ with (args.out_dir / "sample_metrics.csv").open("w", newline="") as f:
212
+ fieldnames = list(sample_rows[0].keys()) if sample_rows else []
213
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
214
+ writer.writeheader()
215
+ writer.writerows(sample_rows)
216
+
217
+ final_summary = {name: finalize(bucket, recall_ns) for name, bucket in sorted(summary.items())}
218
+ write_json(args.out_dir / "summary.json", final_summary)
219
+
220
+ md = ["# TubeToken Phase 0 Proposal Evaluation", ""]
221
+ for name, metrics in final_summary.items():
222
+ if metrics["count"] == 0:
223
+ continue
224
+ recall_text = ", ".join(f"R@{n}={metrics[f'recall@{n}']:.3f}" for n in recall_ns)
225
+ md.append(
226
+ f"- {name}: n={metrics['count']}, {recall_text}, "
227
+ f"Oracle J&F={metrics['oracle_jf']:.4f}, miss={metrics['proposal_miss_percent']:.2f}%"
228
+ )
229
+ (args.out_dir / "report.md").write_text("\n".join(md) + "\n")
230
+ print("\n".join(md))
231
+
232
+
233
+ if __name__ == "__main__":
234
+ main()
tools/tubetoken/generate_sam2_proposals.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Generate SAM2 proposal tubes for TubeToken Phase 0.
3
+
4
+ The cache format is one NPZ per video:
5
+ masks: uint8 [N, T, H, W]
6
+ scores: float32 [N]
7
+ keyframes: int64 [N]
8
+ boxes_xyxy: float32 [N, 4]
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ from contextlib import nullcontext
15
+ import os
16
+ import sys
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Dict, List, Tuple
20
+
21
+ import numpy as np
22
+ import torch
23
+ from PIL import Image
24
+ from tqdm import tqdm
25
+
26
+ from phase0_common import read_metadata, rows_by_video, video_id
27
+
28
+
29
+ def parse_args() -> argparse.Namespace:
30
+ parser = argparse.ArgumentParser(description="Generate SAM2 proposal tubes.")
31
+ parser.add_argument("--data_dir", type=Path, required=True)
32
+ parser.add_argument("--out_dir", type=Path, required=True)
33
+ parser.add_argument("--splits", type=str, default="test_s,test_u")
34
+ parser.add_argument("--sam2_repo", type=Path, default=None, help="Path to a local facebookresearch/sam2 clone.")
35
+ parser.add_argument("--model_cfg", type=str, default="configs/sam2.1/sam2.1_hiera_l.yaml")
36
+ parser.add_argument("--checkpoint", type=Path, required=True)
37
+ parser.add_argument("--seed_proposal_dir", type=Path, default=None, help="Reuse boxes/keyframes/scores from an existing proposal cache and only rerun propagation.")
38
+ parser.add_argument("--device", type=str, default="cuda")
39
+ parser.add_argument("--amp_dtype", type=str, default="bf16", choices=["none", "bf16", "fp16"])
40
+ parser.add_argument("--frames", type=int, default=10)
41
+ parser.add_argument("--stride", type=int, default=8)
42
+ parser.add_argument("--max_tubes", type=int, default=128)
43
+ parser.add_argument("--amg_points_per_side", type=int, default=32)
44
+ parser.add_argument("--amg_pred_iou_thresh", type=float, default=0.80)
45
+ parser.add_argument("--amg_stability_score_thresh", type=float, default=0.88)
46
+ parser.add_argument("--min_mask_area", type=int, default=64)
47
+ parser.add_argument("--limit_videos", type=int, default=0)
48
+ parser.add_argument("--video_list", type=Path, default=None)
49
+ parser.add_argument("--shard_id", type=int, default=0)
50
+ parser.add_argument("--num_shards", type=int, default=1)
51
+ parser.add_argument("--quiet_sam2", action="store_true")
52
+ parser.add_argument("--bidirectional", action="store_true", default=True)
53
+ parser.set_defaults(group_by_keyframe=False)
54
+ parser.add_argument("--group_by_keyframe", dest="group_by_keyframe", action="store_true")
55
+ parser.add_argument("--no_group_by_keyframe", dest="group_by_keyframe", action="store_false")
56
+ parser.add_argument("--overwrite", action="store_true")
57
+ return parser.parse_args()
58
+
59
+
60
+ def import_sam2(repo: Path | None):
61
+ if repo is not None:
62
+ sys.path.insert(0, str(repo))
63
+ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
64
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
65
+
66
+ return SAM2AutomaticMaskGenerator, build_sam2, build_sam2_video_predictor
67
+
68
+
69
+ def keyframes_for_stride(frames: int, stride: int) -> List[int]:
70
+ if stride <= 0:
71
+ raise ValueError("--stride must be positive")
72
+ keyframes = list(range(0, frames, stride))
73
+ mid = frames // 2
74
+ if mid not in keyframes:
75
+ keyframes.append(mid)
76
+ return sorted(set(k for k in keyframes if 0 <= k < frames))
77
+
78
+
79
+ def load_rgb(path: Path) -> np.ndarray:
80
+ with Image.open(path) as img:
81
+ return np.array(img.convert("RGB"))
82
+
83
+
84
+ def proposal_score(item: dict) -> float:
85
+ pred_iou = float(item.get("predicted_iou", 0.0))
86
+ stability = float(item.get("stability_score", 0.0))
87
+ area = float(item.get("area", 0.0))
88
+ return pred_iou + 0.1 * stability + min(area / 1_000_000.0, 0.01)
89
+
90
+
91
+ def xywh_to_xyxy(box: List[float]) -> np.ndarray:
92
+ x, y, w, h = box
93
+ return np.array([x, y, x + w, y + h], dtype=np.float32)
94
+
95
+
96
+ def amp_context(device: str, amp_dtype: str):
97
+ if not device.startswith("cuda") or amp_dtype == "none":
98
+ return nullcontext()
99
+ dtype = torch.bfloat16 if amp_dtype == "bf16" else torch.float16
100
+ return torch.autocast("cuda", dtype=dtype)
101
+
102
+
103
+ def collect_keyframe_masks(mask_generator, data_dir: Path, vid: str, keyframes: List[int], min_area: int, device: str, amp_dtype: str) -> List[dict]:
104
+ proposals: List[dict] = []
105
+ for kf in keyframes:
106
+ image = load_rgb(data_dir / "media" / vid / "frames" / f"{kf}.jpg")
107
+ with torch.inference_mode(), amp_context(device, amp_dtype):
108
+ masks = mask_generator.generate(image)
109
+ for m in masks:
110
+ area = int(m.get("area", np.asarray(m["segmentation"]).sum()))
111
+ if area < min_area:
112
+ continue
113
+ bbox = xywh_to_xyxy(m["bbox"])
114
+ proposals.append(
115
+ {
116
+ "keyframe": kf,
117
+ "box": bbox,
118
+ "score": proposal_score(m),
119
+ "area": area,
120
+ }
121
+ )
122
+ proposals.sort(key=lambda x: x["score"], reverse=True)
123
+ return proposals
124
+
125
+
126
+ def load_seed_proposals(seed_dir: Path, vid: str) -> List[dict] | None:
127
+ path = seed_dir / f"{vid}.npz"
128
+ if not path.exists():
129
+ return None
130
+ data = np.load(path)
131
+ boxes = data["boxes_xyxy"]
132
+ keyframes = data["keyframes"]
133
+ scores = data["scores"]
134
+ proposals = []
135
+ for box, keyframe, score in zip(boxes, keyframes, scores):
136
+ proposals.append(
137
+ {
138
+ "keyframe": int(keyframe),
139
+ "box": box.astype(np.float32),
140
+ "score": float(score),
141
+ "area": 0,
142
+ }
143
+ )
144
+ proposals.sort(key=lambda x: x["score"], reverse=True)
145
+ return proposals
146
+
147
+
148
+ def add_box_prompt(predictor, state, frame_idx: int, obj_id: int, box: np.ndarray):
149
+ try:
150
+ return predictor.add_new_points_or_box(
151
+ inference_state=state,
152
+ frame_idx=frame_idx,
153
+ obj_id=obj_id,
154
+ box=box,
155
+ )
156
+ except TypeError:
157
+ return predictor.add_new_points_or_box(state, frame_idx, obj_id, box=box)
158
+
159
+
160
+ def logits_to_mask(logits) -> np.ndarray:
161
+ if hasattr(logits, "detach"):
162
+ logits = logits.detach().cpu().numpy()
163
+ arr = np.asarray(logits)
164
+ while arr.ndim > 2:
165
+ arr = arr[0]
166
+ return arr > 0
167
+
168
+
169
+ def run_propagation(state, predictor, masks_by_obj: Dict[int, List[np.ndarray]], frames: int, reverse: bool, start_frame_idx: int | None = None) -> None:
170
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
171
+ state,
172
+ start_frame_idx=start_frame_idx,
173
+ reverse=reverse,
174
+ ):
175
+ if hasattr(out_obj_ids, "detach"):
176
+ out_obj_ids = out_obj_ids.detach().cpu().tolist()
177
+ for pos, obj_id in enumerate(list(out_obj_ids)):
178
+ if 0 <= int(out_frame_idx) < frames:
179
+ masks_by_obj[int(obj_id)][int(out_frame_idx)] = logits_to_mask(out_mask_logits[pos])
180
+
181
+
182
def propagate_proposal_group(predictor, video_dir: Path, proposals: List[dict], frames: int, device: str, amp_dtype: str, bidirectional: bool) -> np.ndarray:
    """Propagate one group of box proposals through a video with SAM2.

    Each proposal is prompted as a box on its keyframe, then propagated forward
    (and backward when ``bidirectional``) across all frames. Returns a uint8
    array of shape (num_tubes, frames, H, W); proposals that never produced a
    mask on any frame are dropped from the output.
    """
    with torch.inference_mode(), amp_context(device, amp_dtype):
        state = predictor.init_state(video_path=str(video_dir))
        # Prompt every proposal before propagating so SAM2 tracks them jointly.
        for obj_id, proposal in enumerate(proposals):
            add_box_prompt(predictor, state, int(proposal["keyframe"]), obj_id, proposal["box"])

        # One slot per frame per object; None marks "no mask produced yet".
        masks_by_obj: Dict[int, List[np.ndarray]] = {
            obj_id: [None for _ in range(frames)] for obj_id in range(len(proposals))
        }
        run_propagation(state, predictor, masks_by_obj, frames, reverse=False)
        if bidirectional:
            # NOTE(review): the reverse pass starts from the latest keyframe —
            # presumably so it covers frames before the prompts; confirm
            # against the SAM2 propagate_in_video semantics.
            start_frame_idx = max(int(p["keyframe"]) for p in proposals)
            run_propagation(state, predictor, masks_by_obj, frames, reverse=True, start_frame_idx=start_frame_idx)

    # Some SAM2 versions only propagate forward. Fill missing frames with the
    # nearest available mask so Phase 0 can still score temporal purity.
    tube_masks = []
    for obj_id in range(len(proposals)):
        masks = masks_by_obj[obj_id]
        known = [i for i, m in enumerate(masks) if m is not None]
        if not known:
            continue
        for t in range(frames):
            if masks[t] is None:
                nearest = min(known, key=lambda k: abs(k - t))
                masks[t] = masks[nearest]
        tube_masks.append(np.stack(masks, axis=0))
    if not tube_masks:
        # No tube survived: return an empty array with a placeholder 1x1 frame.
        return np.zeros((0, frames, 1, 1), dtype=np.uint8)
    return np.stack(tube_masks, axis=0).astype(np.uint8)
212
+
213
+
214
def propagate_boxes(
    predictor,
    video_dir: Path,
    proposals: List[dict],
    frames: int,
    device: str,
    amp_dtype: str,
    bidirectional: bool,
    group_by_keyframe: bool,
) -> np.ndarray:
    """Propagate box proposals into mask tubes, optionally per keyframe group.

    Returns an array of shape (num_proposals, frames, H, W); proposals whose
    group produced no tube are filled with all-zero masks.
    """
    if not group_by_keyframe:
        return propagate_proposal_group(predictor, video_dir, proposals, frames, device, amp_dtype, bidirectional)

    # Bucket proposals by keyframe, remembering each proposal's original slot
    # so the output keeps the caller's ordering.
    grouped: Dict[int, List[Tuple[int, dict]]] = {}
    for slot, proposal in enumerate(proposals):
        grouped.setdefault(int(proposal["keyframe"]), []).append((slot, proposal))

    ordered_masks: List[np.ndarray | None] = [None] * len(proposals)
    for _keyframe, members in sorted(grouped.items()):
        slots = [slot for slot, _ in members]
        props = [prop for _, prop in members]
        group_masks = propagate_proposal_group(predictor, video_dir, props, frames, device, amp_dtype, bidirectional)
        # The propagator may return fewer tubes than requested; zip truncates.
        for tube, slot in zip(group_masks, slots):
            ordered_masks[slot] = tube

    produced = [mask for mask in ordered_masks if mask is not None]
    if not produced:
        return np.zeros((0, frames, 1, 1), dtype=np.uint8)
    h, w = produced[0].shape[-2:]
    zero_tube = np.zeros((frames, h, w), dtype=np.uint8)
    final = [mask if mask is not None else zero_tube for mask in ordered_masks]
    return np.stack(final, axis=0).astype(np.uint8)
245
+
246
+
247
+ def load_video_list(path: Path | None) -> List[str] | None:
248
+ if path is None:
249
+ return None
250
+ vids = []
251
+ for line in path.read_text().splitlines():
252
+ line = line.strip()
253
+ if line and not line.startswith("#"):
254
+ vids.append(line)
255
+ return vids
256
+
257
+
258
def _propagate_or_blank(video_predictor, args, vid: str, proposals: List[dict]) -> np.ndarray:
    """Propagate proposals into tubes, or return an empty (0, T, H, W) array."""
    frames_dir = args.data_dir / "media" / vid / "frames"
    if proposals:
        return propagate_boxes(
            video_predictor,
            frames_dir,
            proposals,
            args.frames,
            args.device,
            args.amp_dtype,
            args.bidirectional,
            args.group_by_keyframe,
        )
    # No proposals: read a single frame just to recover the video resolution.
    first = load_rgb(frames_dir / "0.jpg")
    h, w = first.shape[:2]
    return np.zeros((0, args.frames, h, w), dtype=np.uint8)


def main() -> None:
    """Generate SAM2 tube-mask proposals per selected video and write a manifest."""
    import io
    import json
    from contextlib import redirect_stderr, redirect_stdout

    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    SAM2AutomaticMaskGenerator, build_sam2, build_sam2_video_predictor = import_sam2(args.sam2_repo)

    # The automatic mask generator is only needed when proposals are not
    # loaded pre-computed from --seed_proposal_dir.
    mask_generator = None
    if args.seed_proposal_dir is None:
        image_model = build_sam2(args.model_cfg, str(args.checkpoint), device=args.device)
        mask_generator = SAM2AutomaticMaskGenerator(
            image_model,
            points_per_side=args.amg_points_per_side,
            pred_iou_thresh=args.amg_pred_iou_thresh,
            stability_score_thresh=args.amg_stability_score_thresh,
        )
    video_predictor = build_sam2_video_predictor(args.model_cfg, str(args.checkpoint), device=args.device)

    splits = [s.strip() for s in args.splits.split(",") if s.strip()]
    rows = read_metadata(args.data_dir, splits)
    vids = sorted(rows_by_video(rows).keys())

    # Optional allow-list, then deterministic sharding, then an optional cap.
    selected_vids = load_video_list(args.video_list)
    if selected_vids is not None:
        selected = set(selected_vids)
        vids = [vid for vid in vids if vid in selected]
    if args.num_shards < 1:
        raise ValueError("--num_shards must be >= 1")
    if args.shard_id < 0 or args.shard_id >= args.num_shards:
        raise ValueError("--shard_id must be in [0, num_shards)")
    if args.num_shards > 1:
        vids = [vid for idx, vid in enumerate(vids) if idx % args.num_shards == args.shard_id]
    if args.limit_videos:
        vids = vids[: args.limit_videos]
    keyframes = keyframes_for_stride(args.frames, args.stride)

    manifest = {
        "data_dir": str(args.data_dir),
        "splits": splits,
        "model_cfg": args.model_cfg,
        "checkpoint": str(args.checkpoint),
        "stride": args.stride,
        "keyframes": keyframes,
        "max_tubes": args.max_tubes,
        "videos": len(vids),
        "items": [],
    }

    for vid in tqdm(vids, desc="Generating SAM2 proposals"):
        out_path = args.out_dir / f"{vid}.npz"
        if out_path.exists() and not args.overwrite:
            manifest["items"].append({"vid": vid, "path": str(out_path), "skipped": True})
            continue

        start = time.perf_counter()
        proposals = load_seed_proposals(args.seed_proposal_dir, vid) if args.seed_proposal_dir is not None else None
        if proposals is None:
            proposals = collect_keyframe_masks(mask_generator, args.data_dir, vid, keyframes, args.min_mask_area, args.device, args.amp_dtype)
            proposals = proposals[: args.max_tubes]

        # The propagation step was previously duplicated verbatim across the
        # quiet and verbose branches; run it once, redirecting SAM2's console
        # output only when --quiet_sam2 is set.
        if args.quiet_sam2:
            sink = io.StringIO()
            with redirect_stdout(sink), redirect_stderr(sink):
                masks = _propagate_or_blank(video_predictor, args, vid, proposals)
        else:
            masks = _propagate_or_blank(video_predictor, args, vid, proposals)

        # Keep proposals and tubes aligned in case propagation dropped some.
        n = min(len(proposals), masks.shape[0])
        proposals = proposals[:n]
        masks = masks[:n]
        scores = np.array([p["score"] for p in proposals], dtype=np.float32)
        boxes = np.stack([p["box"] for p in proposals], axis=0).astype(np.float32) if proposals else np.zeros((0, 4), dtype=np.float32)
        proposal_keyframes = np.array([p["keyframe"] for p in proposals], dtype=np.int64)
        np.savez_compressed(
            out_path,
            masks=masks,
            scores=scores,
            keyframes=proposal_keyframes,
            boxes_xyxy=boxes,
        )
        elapsed = time.perf_counter() - start
        manifest["items"].append({"vid": vid, "path": str(out_path), "tubes": int(n), "seconds": elapsed})

    with (args.out_dir / "manifest.json").open("w") as f:
        json.dump(manifest, f, indent=2)
353
+
354
+
355
# Script entry point.
if __name__ == "__main__":
    main()
tools/tubetoken/phase0_common.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared utilities for TubeToken Phase 0 experiments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import json
7
+ from collections import defaultdict
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
10
+
11
+ import numpy as np
12
+ from PIL import Image
13
+
14
# OpenCV is optional: the boundary metric falls back to the pure-NumPy path
# (mask_to_boundary / binary_dilate) when cv2 is unavailable.
try:
    import cv2
except Exception:  # pragma: no cover
    cv2 = None


# Splits used for evaluation vs. every split present in metadata.csv.
VALID_EVAL_SPLITS = {"test_s", "test_u"}
VALID_DATA_SPLITS = {"train", "val", "test_s", "test_u", "test_n"}
22
+
23
+
24
def read_metadata(data_dir: Path, splits: Optional[Iterable[str]] = None) -> List[dict]:
    """Load ``metadata.csv`` rows, optionally filtered to the given splits.

    A None or empty ``splits`` means "keep everything".
    """
    csv_path = data_dir / "metadata.csv"
    with csv_path.open("r", newline="") as handle:
        all_rows = list(csv.DictReader(handle))
    if not splits:
        return all_rows
    wanted = set(splits)
    return [row for row in all_rows if row["split"] in wanted]
31
+
32
+
33
def video_id(row: dict) -> str:
    """Return the row's video id, deriving it from ``uid`` when ``vid`` is absent."""
    vid = row.get("vid")
    if vid:
        return vid
    # uid layout: "<vid>_<category>_<index>" — drop the last two fields.
    return row["uid"].rsplit("_", 2)[0]
35
+
36
+
37
def fid_value(row: dict) -> str:
    """Return the row's ``fid`` as a stripped string ("" when missing)."""
    raw = row.get("fid", "")
    return str(raw).strip()
39
+
40
+
41
def category_from_uid(row: dict) -> str:
    """Extract the category field from ``uid`` (layout "<vid>_<category>_<index>")."""
    vid = video_id(row)
    uid = row.get("uid", "")
    if uid.startswith(vid + "_"):
        suffix = uid[len(vid) + 1 :]
    else:
        suffix = uid.rsplit("_", 2)[-2]
    if "_" not in suffix:
        return suffix
    # Strip the trailing "_<index>" part.
    return suffix.rsplit("_", 1)[0]
46
+
47
+
48
def rows_by_video(rows: Sequence[dict]) -> Dict[str, List[dict]]:
    """Group metadata rows by their video id, preserving row order per video."""
    grouped: Dict[str, List[dict]] = defaultdict(list)
    for row in rows:
        grouped[video_id(row)].append(row)
    return grouped
53
+
54
+
55
def load_mask(path: Path) -> np.ndarray:
    """Load an image mask as a boolean array (any non-zero pixel is True)."""
    with Image.open(path) as img:
        grayscale = np.array(img.convert("L"))
    return grayscale > 0
58
+
59
+
60
def load_gt_tube(data_dir: Path, vid: str, fid: str, frames: int = 10) -> np.ndarray:
    """Stack ground-truth masks for one (video, fid) into a (frames, H, W) array.

    NOTE(review): the "0000{t}" file name equals 5-digit zero padding only for
    t < 10 — confirm if ``frames`` can ever exceed 10.
    """
    mask_dir = data_dir / "gt_mask" / vid / f"fid_{fid}"
    masks = [load_mask(mask_dir / f"0000{t}.png") for t in range(frames)]
    return np.stack(masks, axis=0)
66
+
67
+
68
def mask_iou(pred: np.ndarray, gt: np.ndarray) -> float:
    """IoU of two binary masks; two empty masks count as a perfect match (1.0)."""
    a = np.asarray(pred, dtype=bool)
    b = np.asarray(gt, dtype=bool)
    union = (a | b).sum()
    if union == 0:
        return 1.0
    return float((a & b).sum() / union)
76
+
77
+
78
def tube_iou_visible(pred_tube: np.ndarray, gt_tube: np.ndarray) -> float:
    """Mean per-frame IoU over frames where the GT mask is non-empty.

    Returns 0.0 when the GT object is never visible.
    """
    frame_area = gt_tube.reshape(gt_tube.shape[0], -1).sum(axis=1)
    visible_frames = np.where(frame_area > 0)[0]
    if visible_frames.size == 0:
        return 0.0
    ious = [mask_iou(pred_tube[t], gt_tube[t]) for t in visible_frames]
    return float(np.mean(ious))
84
+
85
+
86
def tube_iou_all(pred_tube: np.ndarray, gt_tube: np.ndarray) -> float:
    """Mean per-frame IoU over every frame of the tube (zero-length tube -> 0.0)."""
    num_frames = gt_tube.shape[0]
    if num_frames == 0:
        return 0.0
    ious = [mask_iou(pred_tube[t], gt_tube[t]) for t in range(num_frames)]
    return float(np.mean(ious))
89
+
90
+
91
def db_eval_iou(annotation: np.ndarray, segmentation: np.ndarray) -> float:
    """DAVIS-style region similarity (J): IoU, with both-empty scored 1.0."""
    gt = annotation.astype(bool)
    pred = segmentation.astype(bool)
    gt_empty = not gt.any()
    pred_empty = not pred.any()
    if gt_empty and pred_empty:
        return 1.0
    if gt_empty or pred_empty:
        return 0.0
    union = (gt | pred).sum()
    if union == 0:
        return 0.0
    return float((gt & pred).sum() / union)
101
+
102
+
103
def db_eval_boundary(annotation: np.ndarray, segmentation: np.ndarray, bound_th: float = 0.008) -> float:
    """DAVIS-style boundary measure (F): F-score between dilated mask boundaries.

    ``bound_th`` scales the matching tolerance with the image diagonal.
    Both-empty masks score 1.0; exactly one empty mask scores 0.0.
    Uses OpenCV when available, otherwise a pure-NumPy fallback.

    Fix: the precision/recall/F computation was duplicated verbatim in the
    cv2 and NumPy branches; only boundary extraction differs, so the scoring
    now lives in one shared helper.
    """
    annotation = annotation.astype(bool)
    segmentation = segmentation.astype(bool)
    if annotation.sum() == 0 and segmentation.sum() == 0:
        return 1.0
    if annotation.sum() == 0 or segmentation.sum() == 0:
        return 0.0

    # Tolerance in pixels, proportional to the image diagonal.
    bound_pix = max(1, int(round(bound_th * np.linalg.norm(annotation.shape))))

    if cv2 is not None:
        fg_boundary = mask_to_boundary_cv2(annotation, bound_pix)
        seg_boundary = mask_to_boundary_cv2(segmentation, bound_pix)
        kernel = np.ones((2 * bound_pix + 1, 2 * bound_pix + 1), dtype=np.uint8)
        fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), kernel, iterations=1).astype(bool)
        seg_dil = cv2.dilate(seg_boundary.astype(np.uint8), kernel, iterations=1).astype(bool)
    else:
        fg_boundary = mask_to_boundary(annotation, bound_pix)
        seg_boundary = mask_to_boundary(segmentation, bound_pix)
        fg_dil = binary_dilate(fg_boundary, bound_pix)
        seg_dil = binary_dilate(seg_boundary, bound_pix)

    return _boundary_f_measure(fg_boundary, seg_boundary, fg_dil, seg_dil)


def _boundary_f_measure(fg_boundary: np.ndarray, seg_boundary: np.ndarray, fg_dil: np.ndarray, seg_dil: np.ndarray) -> float:
    """F-score of boundary precision/recall given the dilated boundary bands."""
    gt_match = np.logical_and(fg_boundary, seg_dil).sum()
    pred_match = np.logical_and(seg_boundary, fg_dil).sum()
    n_fg = fg_boundary.sum()
    n_pred = seg_boundary.sum()
    if n_fg == 0 and n_pred == 0:
        return 1.0
    if n_fg == 0 or n_pred == 0:
        return 0.0
    precision = pred_match / n_pred
    recall = gt_match / n_fg
    if precision + recall == 0:
        return 0.0
    return float(2 * precision * recall / (precision + recall))
150
+
151
+
152
def mask_to_boundary_cv2(mask: np.ndarray, dilation: int) -> np.ndarray:
    """Boundary band of ``mask``: the pixels removed by a square erosion."""
    size = 2 * dilation + 1
    kernel = np.ones((size, size), dtype=np.uint8)
    eroded = cv2.erode(mask.astype(np.uint8), kernel, iterations=1)
    return np.logical_xor(mask.astype(bool), eroded.astype(bool))
156
+
157
+
158
def mask_to_boundary(mask: np.ndarray, dilation: int) -> np.ndarray:
    """Pure-NumPy counterpart of ``mask_to_boundary_cv2``."""
    inner = binary_erode(mask, dilation)
    return np.logical_xor(mask, inner)
161
+
162
+
163
def binary_erode(mask: np.ndarray, radius: int) -> np.ndarray:
    """Binary erosion with a (2r+1)x(2r+1) square element, zero-padded borders."""
    src = np.pad(mask.astype(bool), radius, mode="constant", constant_values=False)
    h, w = mask.shape
    result = np.ones((h, w), dtype=bool)
    window = 2 * radius + 1
    # AND together every shifted view of the padded mask.
    for dy in range(window):
        for dx in range(window):
            result &= src[dy : dy + h, dx : dx + w]
    return result
171
+
172
+
173
def binary_dilate(mask: np.ndarray, radius: int) -> np.ndarray:
    """Binary dilation with a (2r+1)x(2r+1) square element, zero-padded borders."""
    src = np.pad(mask.astype(bool), radius, mode="constant", constant_values=False)
    h, w = mask.shape
    result = np.zeros((h, w), dtype=bool)
    window = 2 * radius + 1
    # OR together every shifted view of the padded mask.
    for dy in range(window):
        for dx in range(window):
            result |= src[dy : dy + h, dx : dx + w]
    return result
181
+
182
+
183
def evaluate_tube_jf(pred_tube: np.ndarray, gt_tube: np.ndarray) -> Tuple[float, float, float]:
    """Return (J, F, (J+F)/2) averaged over every frame of the tube."""
    js: List[float] = []
    fs: List[float] = []
    for t in range(gt_tube.shape[0]):
        js.append(db_eval_iou(gt_tube[t], pred_tube[t]))
        fs.append(db_eval_boundary(gt_tube[t], pred_tube[t]))
    j = float(np.mean(js)) if js else 0.0
    f = float(np.mean(fs)) if fs else 0.0
    return j, f, (j + f) / 2
189
+
190
+
191
def bbox_from_mask(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
    """Tight (x_min, y_min, x_max, y_max) box of a binary mask, or None if empty."""
    ys, xs = np.nonzero(np.asarray(mask, dtype=bool))
    if xs.size == 0:
        return None
    return int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())
196
+
197
+
198
def load_audit_rows(audit_csv: Path) -> Dict[str, dict]:
    """Index audit CSV rows by ``uid``; a missing file yields an empty mapping."""
    if not audit_csv.exists():
        return {}
    with audit_csv.open("r", newline="") as handle:
        return {row["uid"]: row for row in csv.DictReader(handle)}
203
+
204
+
205
def bool_field(row: Optional[dict], key: str) -> bool:
    """Interpret ``row[key]`` as a flag: "1"/"true"/"yes" (case-insensitive)."""
    if not row:
        return False
    value = str(row.get(key, "")).lower()
    return value in {"1", "true", "yes"}
209
+
210
+
211
def write_json(path: Path, obj: dict) -> None:
    """Write ``obj`` as pretty-printed, key-sorted JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        json.dump(obj, handle, indent=2, sort_keys=True)