Add files using upload-large-folder tool
Browse files- ChatUniVi/__pycache__/__init__.cpython-310.pyc +0 -0
- ChatUniVi/__pycache__/constants.cpython-310.pyc +0 -0
- ChatUniVi/model/__pycache__/__init__.cpython-310.pyc +0 -0
- ChatUniVi/model/__pycache__/arch.cpython-310.pyc +0 -0
- ChatUniVi/model/__pycache__/cluster.cpython-310.pyc +0 -0
- ChatUniVi/model/language_model/__pycache__/llama.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/eva_encoder.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/utils.cpython-310.pyc +0 -0
- ChatUniVi/model/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
- configs/__pycache__/config.cpython-310.pyc +0 -0
- configs/__pycache__/config.cpython-312.pyc +0 -0
- datasets/__pycache__/dataset_refavs.cpython-310.pyc +0 -0
- models/segment_anything/utils/__pycache__/transforms.cpython-310.pyc +0 -0
- runs/tubetoken_phase0/eval_stride8_n64_bidir.log +13 -0
- runs/tubetoken_phase0/miss_videos_r64.txt +164 -0
- runs/tubetoken_phase0/proposals_stride8_n64_bidir.log +1 -0
- runs/tubetoken_phase_minus1/audit_full/audit_summary.json +45 -0
- runs/tubetoken_phase_minus1/audit_full/h3_candidates.csv +0 -0
- runs/tubetoken_phase_minus1/simtoken_eval/eval.log +0 -0
- runs/tubetoken_phase_minus1/simtoken_eval/eval_null.log +11 -0
- tools/__pycache__/audit_refavs.cpython-312.pyc +0 -0
- tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-310.pyc +0 -0
- tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-312.pyc +0 -0
- tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-310.pyc +0 -0
- tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-312.pyc +0 -0
- tools/tubetoken/__pycache__/phase0_common.cpython-310.pyc +0 -0
- tools/tubetoken/__pycache__/phase0_common.cpython-312.pyc +0 -0
- tools/tubetoken/evaluate_phase0_proposals.py +234 -0
- tools/tubetoken/generate_sam2_proposals.py +356 -0
- tools/tubetoken/phase0_common.py +214 -0
ChatUniVi/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (189 Bytes). View file
|
|
|
ChatUniVi/__pycache__/constants.cpython-310.pyc
ADDED
|
Binary file (726 Bytes). View file
|
|
|
ChatUniVi/model/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (236 Bytes). View file
|
|
|
ChatUniVi/model/__pycache__/arch.cpython-310.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
ChatUniVi/model/__pycache__/cluster.cpython-310.pyc
ADDED
|
Binary file (9.06 kB). View file
|
|
|
ChatUniVi/model/language_model/__pycache__/llama.cpython-310.pyc
ADDED
|
Binary file (3.67 kB). View file
|
|
|
ChatUniVi/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc
ADDED
|
Binary file (487 Bytes). View file
|
|
|
ChatUniVi/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc
ADDED
|
Binary file (3.05 kB). View file
|
|
|
ChatUniVi/model/multimodal_encoder/__pycache__/eva_encoder.cpython-310.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
ChatUniVi/model/multimodal_encoder/__pycache__/eva_vit.cpython-310.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
ChatUniVi/model/multimodal_encoder/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (3.77 kB). View file
|
|
|
ChatUniVi/model/multimodal_projector/__pycache__/builder.cpython-310.pyc
ADDED
|
Binary file (2.05 kB). View file
|
|
|
configs/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (2.22 kB). View file
|
|
|
configs/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (3.61 kB). View file
|
|
|
datasets/__pycache__/dataset_refavs.cpython-310.pyc
ADDED
|
Binary file (5.04 kB). View file
|
|
|
models/segment_anything/utils/__pycache__/transforms.cpython-310.pyc
ADDED
|
Binary file (3.99 kB). View file
|
|
|
runs/tubetoken_phase0/eval_stride8_n64_bidir.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# TubeToken Phase 0 Proposal Evaluation
|
| 3 |
+
|
| 4 |
+
- all: n=3944, R@16=0.469, R@32=0.597, R@64=0.754, Oracle J&F=0.7491, miss=24.62%
|
| 5 |
+
- area_unstable: n=18, R@16=0.556, R@32=0.556, R@64=0.889, Oracle J&F=0.7114, miss=11.11%
|
| 6 |
+
- audio_keyword: n=2844, R@16=0.475, R@32=0.610, R@64=0.766, Oracle J&F=0.7569, miss=23.42%
|
| 7 |
+
- h3_candidate: n=3932, R@16=0.469, R@32=0.597, R@64=0.754, Oracle J&F=0.7488, miss=24.64%
|
| 8 |
+
- partial: n=8, R@16=0.250, R@32=0.250, R@64=1.000, Oracle J&F=0.8123, miss=0.00%
|
| 9 |
+
- same_category: n=330, R@16=0.482, R@32=0.588, R@64=0.709, Oracle J&F=0.7261, miss=29.09%
|
| 10 |
+
- small: n=1631, R@16=0.237, R@32=0.392, R@64=0.633, Oracle J&F=0.6367, miss=36.73%
|
| 11 |
+
- spatial_keyword: n=965, R@16=0.331, R@32=0.476, R@64=0.658, Oracle J&F=0.6714, miss=34.20%
|
| 12 |
+
- test_s: n=2288, R@16=0.326, R@32=0.483, R@64=0.657, Oracle J&F=0.6674, miss=34.27%
|
| 13 |
+
- test_u: n=1656, R@16=0.665, R@32=0.755, R@64=0.887, Oracle J&F=0.8618, miss=11.29%
|
runs/tubetoken_phase0/miss_videos_r64.txt
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-wb3JWo4qTg_380000_390000
|
| 2 |
+
0BsmPd44Bic_60000_70000
|
| 3 |
+
2VqnoV67t0g_290000_300000
|
| 4 |
+
2wmxck5D9Vw_102000_112000
|
| 5 |
+
39xq5AATMp4_42000_52000
|
| 6 |
+
39xq5AATMp4_60000_70000
|
| 7 |
+
45oaGY3mzlQ_5000_15000
|
| 8 |
+
4DCxlVTWN8g_3000_13000
|
| 9 |
+
4GUrjmIZcIQ_82000_92000
|
| 10 |
+
4YLjVRZHoZI_1750000_1760000
|
| 11 |
+
4ZPJ867OBo4_75000_85000
|
| 12 |
+
4k63MOSjOBw_21000_31000
|
| 13 |
+
4rerQA8KMzM_10000_20000
|
| 14 |
+
4tASCYb3ySA_1000_11000
|
| 15 |
+
50Z4O0mzAXg_150000_160000
|
| 16 |
+
5FYG_Wtet7U_120000_130000
|
| 17 |
+
5YLQapaxA8o_31000_41000
|
| 18 |
+
5YLQapaxA8o_80000_90000
|
| 19 |
+
6KT2mbCNxO4_228000_238000
|
| 20 |
+
6KT2mbCNxO4_434000_444000
|
| 21 |
+
6WUjWMuoEhM_10000_20000
|
| 22 |
+
6tTLxIKVtfE_26000_36000
|
| 23 |
+
73QQbJIeB3Y_265000_275000
|
| 24 |
+
73QQbJIeB3Y_353000_363000
|
| 25 |
+
73QQbJIeB3Y_95000_105000
|
| 26 |
+
79T0FclnIDw_0_10000
|
| 27 |
+
79w-8fTYazw_15000_25000
|
| 28 |
+
7PGfd8pg86w_15000_25000
|
| 29 |
+
7pNykt6zACc_275000_285000
|
| 30 |
+
7pWa_kcAoMg_49000_59000
|
| 31 |
+
872Lwp3MOro_6000_16000
|
| 32 |
+
8hr42lVW_gk_60000_70000
|
| 33 |
+
9ZQ-T83pfWk_65000_75000
|
| 34 |
+
9xp46AwF9BY_38000_48000
|
| 35 |
+
A-e3dr7fTIs_0_10000
|
| 36 |
+
A-e3dr7fTIs_318000_328000
|
| 37 |
+
A-e3dr7fTIs_80000_90000
|
| 38 |
+
AxjZti5_t94_90000_100000
|
| 39 |
+
BRAQLY85x5U_75000_85000
|
| 40 |
+
BSkcM4ScyEs_150000_160000
|
| 41 |
+
CCFZT2_TJr0_15000_25000
|
| 42 |
+
CPiOGMl59L4_30000_40000
|
| 43 |
+
CVA8LpBW3Sc_76000_86000
|
| 44 |
+
CjQmcO_Q5d8_42000_52000
|
| 45 |
+
De5YhqX0jdI_0_10000
|
| 46 |
+
DzXVMjsZloE_0_10000
|
| 47 |
+
EKM1wu2vXy4_26000_36000
|
| 48 |
+
EQWvv8WbU04_22000_32000
|
| 49 |
+
ETmo71vP7tA_20000_30000
|
| 50 |
+
EuU2PWB1t2g_20000_30000
|
| 51 |
+
EzPcuNoSHMM_0_10000
|
| 52 |
+
F1x1ck3OySg_10000_20000
|
| 53 |
+
F3KWGQfJ2HM_26000_36000
|
| 54 |
+
GG-PF_JxeW4_1640000_1650000
|
| 55 |
+
HuFzVoyayj8_521000_531000
|
| 56 |
+
JgkUToh3HeY_49000_59000
|
| 57 |
+
KXvmc3dLn3E_720000_730000
|
| 58 |
+
LOFX2UVozf8_80000_90000
|
| 59 |
+
LitaFzObEEk_5000_15000
|
| 60 |
+
MDyjY3uiWp0_273000_283000
|
| 61 |
+
Maa21OL-40Q_20000_30000
|
| 62 |
+
Mivqxr0RS8w_18000_28000
|
| 63 |
+
OFgcrlxku9g_160000_170000
|
| 64 |
+
OGCyzmaM_kE_10000_20000
|
| 65 |
+
OMYuLiqSUxE_180000_190000
|
| 66 |
+
OMYuLiqSUxE_30000_40000
|
| 67 |
+
OPVEPq_r-vk_211000_221000
|
| 68 |
+
OX6T2z4P9fA_23000_33000
|
| 69 |
+
Ow9uE_v2AEg_28000_38000
|
| 70 |
+
PSoKYh3ea1o_60000_70000
|
| 71 |
+
PcdKAvd51l0_41000_51000
|
| 72 |
+
PdHpl04tQV8_40000_50000
|
| 73 |
+
Pe1LuVFTczE_106000_116000
|
| 74 |
+
Pe1LuVFTczE_358000_368000
|
| 75 |
+
QHcG-FDM75Q_113000_123000
|
| 76 |
+
QNiHU290owU_55000_65000
|
| 77 |
+
QTe-i0Pcn4s_37000_47000
|
| 78 |
+
RHy5nC-gRV8_668000_678000
|
| 79 |
+
RMF6sp6tWHM_100000_110000
|
| 80 |
+
Ru7m8PyMlVM_120000_130000
|
| 81 |
+
SFVZ2OklsVM_2715000_2725000
|
| 82 |
+
SFVZ2OklsVM_2765000_2775000
|
| 83 |
+
T9K1uy-G5qA_110000_120000
|
| 84 |
+
TCcD-vOUtNc_99000_109000
|
| 85 |
+
U1dZX1ReD88_48000_58000
|
| 86 |
+
URZyjoh9lbc_500000_510000
|
| 87 |
+
URZyjoh9lbc_560000_570000
|
| 88 |
+
UYUH7Jmfp3g_13000_23000
|
| 89 |
+
UYUH7Jmfp3g_76000_86000
|
| 90 |
+
UlYU9z7Y8jY_68000_78000
|
| 91 |
+
V9KZ5FCtG9A_15000_25000
|
| 92 |
+
VbEvfbj_IxU_117000_127000
|
| 93 |
+
VlPdfLr1FSo_7000_17000
|
| 94 |
+
W5yveLPTD04_211000_221000
|
| 95 |
+
WSy8ay1avew_60000_70000
|
| 96 |
+
Y735cxoG5-4_270000_280000
|
| 97 |
+
YAW2vMKV9pw_50000_60000
|
| 98 |
+
ZPYqUww_x6k_293000_303000
|
| 99 |
+
ZR3vnlhJuSE_101000_111000
|
| 100 |
+
ZU0JSxWk1Po_16000_26000
|
| 101 |
+
_-apT0tfo6U_16000_26000
|
| 102 |
+
a1nWlW629TU_15000_25000
|
| 103 |
+
aBmzZJZ_M8Y_9000_19000
|
| 104 |
+
bELyeHxF7eA_42000_52000
|
| 105 |
+
bcGfmy0X-CQ_30000_40000
|
| 106 |
+
cIbVu0ixSAo_302000_312000
|
| 107 |
+
dRa7aBGnStU_82000_92000
|
| 108 |
+
dUcOkRkz6bA_387000_397000
|
| 109 |
+
dVa49WwXzr8_1023000_1033000
|
| 110 |
+
dk_xhLkWyDo_30000_40000
|
| 111 |
+
ds3RKnNB-cY_8000_18000
|
| 112 |
+
ehlPuuiNEd8_376000_386000
|
| 113 |
+
f0jKjIOFzAY_243000_253000
|
| 114 |
+
fTobKZBbMos_2000_12000
|
| 115 |
+
fWvQqgSDUPU_10000_20000
|
| 116 |
+
hYt2Qf438l8_40000_50000
|
| 117 |
+
hornh-NQBHY_262000_272000
|
| 118 |
+
iSBtK1T10Ew_415000_425000
|
| 119 |
+
jj1UxRTBaNw_210000_220000
|
| 120 |
+
k35blcO8Z7k_700000_710000
|
| 121 |
+
kF2y7RIC7-Y_258000_268000
|
| 122 |
+
kF2y7RIC7-Y_385000_395000
|
| 123 |
+
lcOP60uXMeI_20000_30000
|
| 124 |
+
lcOP60uXMeI_396000_406000
|
| 125 |
+
lzzMHoi3r2w_50000_60000
|
| 126 |
+
mUN93MlvX64_42000_52000
|
| 127 |
+
nT0PHpAlvys_123000_133000
|
| 128 |
+
nT0PHpAlvys_313000_323000
|
| 129 |
+
nT0PHpAlvys_435000_445000
|
| 130 |
+
oRSCL3149fI_146000_156000
|
| 131 |
+
oRSCL3149fI_87000_97000
|
| 132 |
+
oSMvY0tErC4_100000_110000
|
| 133 |
+
oSMvY0tErC4_50000_60000
|
| 134 |
+
oVK2QsKq8ak_186000_196000
|
| 135 |
+
oYeir4FWq_8_6000_16000
|
| 136 |
+
oaYIWYXFMqY_70000_80000
|
| 137 |
+
oyi5pKPwz9Q_3000_13000
|
| 138 |
+
p2u4OJKqMxE_130000_140000
|
| 139 |
+
p5uRwtw7S3E_243000_253000
|
| 140 |
+
pNV8MKNqOkI_20000_30000
|
| 141 |
+
pf6ZpxTFL1Y_51000_61000
|
| 142 |
+
puugfzdXYz4_30000_40000
|
| 143 |
+
q3YnhPgt-rM_118000_128000
|
| 144 |
+
qYtrnr4chfU_2000_12000
|
| 145 |
+
r4NdM595K5c_40000_50000
|
| 146 |
+
rozFJYWrLj0_120000_130000
|
| 147 |
+
rzKjN2en0H8_830000_840000
|
| 148 |
+
sHyhvtLTCbo_10000_20000
|
| 149 |
+
sMsrz5VqchQ_130000_140000
|
| 150 |
+
sfv_msSOYTo_10000_20000
|
| 151 |
+
tMorLZku6Pc_194000_204000
|
| 152 |
+
tZbh1cwwfv0_222000_232000
|
| 153 |
+
vVJ-Zhj2HvU_22000_32000
|
| 154 |
+
vifFbeL5rOo_9000_19000
|
| 155 |
+
w7vA5f0vPvQ_15000_25000
|
| 156 |
+
wfkdedUW-dk_50000_60000
|
| 157 |
+
xUQ9rDswHdw_140000_150000
|
| 158 |
+
xnx3u5YaNuc_402000_412000
|
| 159 |
+
z3Q_mZgKLrM_20000_30000
|
| 160 |
+
zM7QopQ3MgI_319000_329000
|
| 161 |
+
zM7QopQ3MgI_410000_420000
|
| 162 |
+
zM7QopQ3MgI_499000_509000
|
| 163 |
+
zPMUL7f4OOU_420000_430000
|
| 164 |
+
zstao4nIPmU_250000_260000
|
runs/tubetoken_phase0/proposals_stride8_n64_bidir.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
runs/tubetoken_phase_minus1/audit_full/audit_summary.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"area_unstable_expressions": 41,
|
| 3 |
+
"audio_keyword_expressions": 15890,
|
| 4 |
+
"audio_keyword_percent": 77.66753018231586,
|
| 5 |
+
"data_dir": "/workspace/SimToken/data",
|
| 6 |
+
"expressions_per_object": {
|
| 7 |
+
"ge2": 5836,
|
| 8 |
+
"ge3": 4206,
|
| 9 |
+
"max": 10,
|
| 10 |
+
"mean": 2.742125720412813,
|
| 11 |
+
"median": 3
|
| 12 |
+
},
|
| 13 |
+
"expressions_per_video": {
|
| 14 |
+
"ge2": 3521,
|
| 15 |
+
"ge3": 3381,
|
| 16 |
+
"max": 26,
|
| 17 |
+
"mean": 5.7243984331281474,
|
| 18 |
+
"median": 6.0
|
| 19 |
+
},
|
| 20 |
+
"h3_candidate_expressions": 18614,
|
| 21 |
+
"h3_candidate_objects": 5781,
|
| 22 |
+
"late_target_expressions": 0,
|
| 23 |
+
"mask_rows_audited": 20459,
|
| 24 |
+
"multi_expression_objects": 5836,
|
| 25 |
+
"multi_expression_videos": 3521,
|
| 26 |
+
"null_split_expressions": 1028,
|
| 27 |
+
"null_split_percent": 5.0246835133682,
|
| 28 |
+
"num_expressions": 20459,
|
| 29 |
+
"num_objects_vid_fid": 7461,
|
| 30 |
+
"num_videos": 3574,
|
| 31 |
+
"partial_target_expressions": 33,
|
| 32 |
+
"same_category_distractor_heuristic_expressions": 2563,
|
| 33 |
+
"same_category_distractor_heuristic_percent": 12.527494012415074,
|
| 34 |
+
"small_target_expressions": 10037,
|
| 35 |
+
"spatial_keyword_expressions": 5924,
|
| 36 |
+
"spatial_keyword_percent": 28.955471919448655,
|
| 37 |
+
"splits": {
|
| 38 |
+
"TODO": 25,
|
| 39 |
+
"test_n": 1028,
|
| 40 |
+
"test_s": 2288,
|
| 41 |
+
"test_u": 1656,
|
| 42 |
+
"train": 14113,
|
| 43 |
+
"val": 1349
|
| 44 |
+
}
|
| 45 |
+
}
|
runs/tubetoken_phase_minus1/audit_full/h3_candidates.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
runs/tubetoken_phase_minus1/simtoken_eval/eval.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
runs/tubetoken_phase_minus1/simtoken_eval/eval_null.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
seg_token_idx: 32000
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
model loaded
|
| 5 |
+
|
| 6 |
+
Lora deployed
|
| 7 |
+
trainable params: 4194304 || all params: 7709437232 || trainable%: 0.0544048012037826
|
| 8 |
+
saved model loaded
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
valuate on test_n_refer, metric: 0.011791757307946682
|
tools/__pycache__/audit_refavs.cpython-312.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-310.pyc
ADDED
|
Binary file (8.83 kB). View file
|
|
|
tools/tubetoken/__pycache__/evaluate_phase0_proposals.cpython-312.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-310.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
tools/tubetoken/__pycache__/generate_sam2_proposals.cpython-312.pyc
ADDED
|
Binary file (16 kB). View file
|
|
|
tools/tubetoken/__pycache__/phase0_common.cpython-310.pyc
ADDED
|
Binary file (8.49 kB). View file
|
|
|
tools/tubetoken/__pycache__/phase0_common.cpython-312.pyc
ADDED
|
Binary file (12.4 kB). View file
|
|
|
tools/tubetoken/evaluate_phase0_proposals.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate proposal recall and oracle tube J/F for TubeToken Phase 0."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import csv
|
| 8 |
+
from collections import defaultdict
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
from phase0_common import (
|
| 16 |
+
bool_field,
|
| 17 |
+
evaluate_tube_jf,
|
| 18 |
+
load_audit_rows,
|
| 19 |
+
load_gt_tube,
|
| 20 |
+
read_metadata,
|
| 21 |
+
rows_by_video,
|
| 22 |
+
rows_by_video,
|
| 23 |
+
tube_iou_all,
|
| 24 |
+
tube_iou_visible,
|
| 25 |
+
video_id,
|
| 26 |
+
fid_value,
|
| 27 |
+
write_json,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def parse_args() -> argparse.Namespace:
|
| 32 |
+
parser = argparse.ArgumentParser(description="Evaluate Phase 0 proposal cache.")
|
| 33 |
+
parser.add_argument("--data_dir", type=Path, required=True)
|
| 34 |
+
parser.add_argument("--proposal_dir", type=Path, required=True)
|
| 35 |
+
parser.add_argument("--out_dir", type=Path, required=True)
|
| 36 |
+
parser.add_argument("--audit_csv", type=Path, default=None)
|
| 37 |
+
parser.add_argument("--splits", type=str, default="test_s,test_u")
|
| 38 |
+
parser.add_argument("--frames", type=int, default=10)
|
| 39 |
+
parser.add_argument("--recall_ns", type=str, default="16,32,64,128")
|
| 40 |
+
parser.add_argument("--match_iou", type=float, default=0.5)
|
| 41 |
+
parser.add_argument("--limit_videos", type=int, default=0)
|
| 42 |
+
parser.add_argument("--video_list", type=Path, default=None)
|
| 43 |
+
parser.add_argument("--shard_id", type=int, default=0)
|
| 44 |
+
parser.add_argument("--num_shards", type=int, default=1)
|
| 45 |
+
parser.add_argument("--only_existing_proposals", action="store_true")
|
| 46 |
+
parser.add_argument("--skip_oracle_jf", action="store_true", help="Only compute Recall@N and tube IoU, useful for fast early checks.")
|
| 47 |
+
return parser.parse_args()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_proposals(path: Path) -> np.ndarray:
|
| 51 |
+
data = np.load(path)
|
| 52 |
+
return data["masks"].astype(bool)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def load_video_list(path: Path | None) -> List[str] | None:
|
| 56 |
+
if path is None:
|
| 57 |
+
return None
|
| 58 |
+
vids = []
|
| 59 |
+
for line in path.read_text().splitlines():
|
| 60 |
+
line = line.strip()
|
| 61 |
+
if line and not line.startswith("#"):
|
| 62 |
+
vids.append(line)
|
| 63 |
+
return vids
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def sample_subsets(row: dict, audit: Dict[str, dict]) -> List[str]:
|
| 67 |
+
out = ["all", row["split"]]
|
| 68 |
+
audit_row = audit.get(row["uid"])
|
| 69 |
+
if bool_field(audit_row, "small_target"):
|
| 70 |
+
out.append("small")
|
| 71 |
+
if bool_field(audit_row, "partial_target"):
|
| 72 |
+
out.append("partial")
|
| 73 |
+
if bool_field(audit_row, "area_unstable"):
|
| 74 |
+
out.append("area_unstable")
|
| 75 |
+
if bool_field(audit_row, "late_target"):
|
| 76 |
+
out.append("late_target")
|
| 77 |
+
if bool_field(audit_row, "is_audio_keyword"):
|
| 78 |
+
out.append("audio_keyword")
|
| 79 |
+
if bool_field(audit_row, "is_spatial_keyword"):
|
| 80 |
+
out.append("spatial_keyword")
|
| 81 |
+
if bool_field(audit_row, "same_category_distractor_heuristic"):
|
| 82 |
+
out.append("same_category")
|
| 83 |
+
if bool_field(audit_row, "h3_candidate"):
|
| 84 |
+
out.append("h3_candidate")
|
| 85 |
+
return out
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def empty_metrics(recall_ns: List[int]) -> dict:
|
| 89 |
+
return {
|
| 90 |
+
"count": 0,
|
| 91 |
+
"proposal_miss": 0,
|
| 92 |
+
"oracle_j": 0.0,
|
| 93 |
+
"oracle_f": 0.0,
|
| 94 |
+
"oracle_jf": 0.0,
|
| 95 |
+
"oracle_iou_visible": 0.0,
|
| 96 |
+
"oracle_iou_all": 0.0,
|
| 97 |
+
**{f"recall@{n}": 0 for n in recall_ns},
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def add_metrics(bucket: dict, sample: dict, recall_ns: List[int]) -> None:
|
| 102 |
+
bucket["count"] += 1
|
| 103 |
+
bucket["proposal_miss"] += int(not sample["covered"])
|
| 104 |
+
bucket["oracle_j"] += sample["oracle_j"]
|
| 105 |
+
bucket["oracle_f"] += sample["oracle_f"]
|
| 106 |
+
bucket["oracle_jf"] += sample["oracle_jf"]
|
| 107 |
+
bucket["oracle_iou_visible"] += sample["oracle_iou_visible"]
|
| 108 |
+
bucket["oracle_iou_all"] += sample["oracle_iou_all"]
|
| 109 |
+
for n in recall_ns:
|
| 110 |
+
bucket[f"recall@{n}"] += int(sample[f"recall@{n}"])
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def finalize(bucket: dict, recall_ns: List[int]) -> dict:
|
| 114 |
+
count = bucket["count"]
|
| 115 |
+
if count == 0:
|
| 116 |
+
return dict(bucket)
|
| 117 |
+
out = dict(bucket)
|
| 118 |
+
out["proposal_miss_percent"] = 100.0 * bucket["proposal_miss"] / count
|
| 119 |
+
for key in ["oracle_j", "oracle_f", "oracle_jf", "oracle_iou_visible", "oracle_iou_all"]:
|
| 120 |
+
out[key] = bucket[key] / count
|
| 121 |
+
for n in recall_ns:
|
| 122 |
+
out[f"recall@{n}"] = bucket[f"recall@{n}"] / count
|
| 123 |
+
return out
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def main() -> None:
|
| 127 |
+
args = parse_args()
|
| 128 |
+
args.out_dir.mkdir(parents=True, exist_ok=True)
|
| 129 |
+
splits = [s.strip() for s in args.splits.split(",") if s.strip()]
|
| 130 |
+
recall_ns = [int(x) for x in args.recall_ns.split(",") if x.strip()]
|
| 131 |
+
audit = load_audit_rows(args.audit_csv) if args.audit_csv else {}
|
| 132 |
+
rows = read_metadata(args.data_dir, splits)
|
| 133 |
+
selected_vids = load_video_list(args.video_list)
|
| 134 |
+
if selected_vids is not None:
|
| 135 |
+
selected = set(selected_vids)
|
| 136 |
+
rows = [row for row in rows if video_id(row) in selected]
|
| 137 |
+
if args.num_shards < 1:
|
| 138 |
+
raise ValueError("--num_shards must be >= 1")
|
| 139 |
+
if args.shard_id < 0 or args.shard_id >= args.num_shards:
|
| 140 |
+
raise ValueError("--shard_id must be in [0, num_shards)")
|
| 141 |
+
if args.num_shards > 1:
|
| 142 |
+
vids = sorted(rows_by_video(rows).keys())
|
| 143 |
+
selected = {vid for idx, vid in enumerate(vids) if idx % args.num_shards == args.shard_id}
|
| 144 |
+
rows = [row for row in rows if video_id(row) in selected]
|
| 145 |
+
if args.limit_videos:
|
| 146 |
+
vids = sorted(rows_by_video(rows).keys())[: args.limit_videos]
|
| 147 |
+
rows = [row for row in rows if video_id(row) in set(vids)]
|
| 148 |
+
if args.only_existing_proposals:
|
| 149 |
+
rows = [row for row in rows if (args.proposal_dir / f"{video_id(row)}.npz").exists()]
|
| 150 |
+
|
| 151 |
+
sample_rows: List[dict] = []
|
| 152 |
+
summary = defaultdict(lambda: empty_metrics(recall_ns))
|
| 153 |
+
|
| 154 |
+
video_groups = rows_by_video(rows)
|
| 155 |
+
total_objects = sum(len({fid_value(row) for row in group}) for group in video_groups.values())
|
| 156 |
+
|
| 157 |
+
with tqdm(total=total_objects, desc="Evaluating proposal objects") as pbar:
|
| 158 |
+
for vid, video_rows in video_groups.items():
|
| 159 |
+
prop_path = args.proposal_dir / f"{vid}.npz"
|
| 160 |
+
if not prop_path.exists():
|
| 161 |
+
raise FileNotFoundError(f"Missing proposal cache: {prop_path}")
|
| 162 |
+
proposals = load_proposals(prop_path)
|
| 163 |
+
object_cache = {}
|
| 164 |
+
|
| 165 |
+
for row in video_rows:
|
| 166 |
+
key = fid_value(row)
|
| 167 |
+
if key in object_cache:
|
| 168 |
+
base_sample = object_cache[key]
|
| 169 |
+
else:
|
| 170 |
+
gt = load_gt_tube(args.data_dir, vid, key, args.frames)
|
| 171 |
+
visible_ious = np.array([tube_iou_visible(tube, gt) for tube in proposals], dtype=np.float32)
|
| 172 |
+
all_ious = np.array([tube_iou_all(tube, gt) for tube in proposals], dtype=np.float32)
|
| 173 |
+
if len(visible_ious) == 0:
|
| 174 |
+
best_idx = -1
|
| 175 |
+
best_visible = 0.0
|
| 176 |
+
best_all = 0.0
|
| 177 |
+
oracle_j = oracle_f = oracle_jf = 0.0
|
| 178 |
+
else:
|
| 179 |
+
best_idx = int(visible_ious.argmax())
|
| 180 |
+
best_visible = float(visible_ious[best_idx])
|
| 181 |
+
best_all = float(all_ious[best_idx])
|
| 182 |
+
if args.skip_oracle_jf:
|
| 183 |
+
oracle_j = oracle_f = oracle_jf = 0.0
|
| 184 |
+
else:
|
| 185 |
+
oracle_j, oracle_f, oracle_jf = evaluate_tube_jf(proposals[best_idx], gt)
|
| 186 |
+
|
| 187 |
+
base_sample = {
|
| 188 |
+
"vid": vid,
|
| 189 |
+
"fid": key,
|
| 190 |
+
"best_idx": best_idx,
|
| 191 |
+
"num_tubes": int(proposals.shape[0]),
|
| 192 |
+
"covered": best_visible >= args.match_iou,
|
| 193 |
+
"oracle_iou_visible": best_visible,
|
| 194 |
+
"oracle_iou_all": best_all,
|
| 195 |
+
"oracle_j": oracle_j,
|
| 196 |
+
"oracle_f": oracle_f,
|
| 197 |
+
"oracle_jf": oracle_jf,
|
| 198 |
+
}
|
| 199 |
+
for n in recall_ns:
|
| 200 |
+
top = visible_ious[: min(n, len(visible_ious))]
|
| 201 |
+
base_sample[f"recall@{n}"] = bool(len(top) and float(top.max()) >= args.match_iou)
|
| 202 |
+
object_cache[key] = base_sample
|
| 203 |
+
pbar.update(1)
|
| 204 |
+
|
| 205 |
+
sample = dict(base_sample)
|
| 206 |
+
sample.update({"uid": row["uid"], "split": row["split"]})
|
| 207 |
+
sample_rows.append(sample)
|
| 208 |
+
for subset in sample_subsets(row, audit):
|
| 209 |
+
add_metrics(summary[subset], sample, recall_ns)
|
| 210 |
+
|
| 211 |
+
with (args.out_dir / "sample_metrics.csv").open("w", newline="") as f:
|
| 212 |
+
fieldnames = list(sample_rows[0].keys()) if sample_rows else []
|
| 213 |
+
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
| 214 |
+
writer.writeheader()
|
| 215 |
+
writer.writerows(sample_rows)
|
| 216 |
+
|
| 217 |
+
final_summary = {name: finalize(bucket, recall_ns) for name, bucket in sorted(summary.items())}
|
| 218 |
+
write_json(args.out_dir / "summary.json", final_summary)
|
| 219 |
+
|
| 220 |
+
md = ["# TubeToken Phase 0 Proposal Evaluation", ""]
|
| 221 |
+
for name, metrics in final_summary.items():
|
| 222 |
+
if metrics["count"] == 0:
|
| 223 |
+
continue
|
| 224 |
+
recall_text = ", ".join(f"R@{n}={metrics[f'recall@{n}']:.3f}" for n in recall_ns)
|
| 225 |
+
md.append(
|
| 226 |
+
f"- {name}: n={metrics['count']}, {recall_text}, "
|
| 227 |
+
f"Oracle J&F={metrics['oracle_jf']:.4f}, miss={metrics['proposal_miss_percent']:.2f}%"
|
| 228 |
+
)
|
| 229 |
+
(args.out_dir / "report.md").write_text("\n".join(md) + "\n")
|
| 230 |
+
print("\n".join(md))
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
if __name__ == "__main__":
|
| 234 |
+
main()
|
tools/tubetoken/generate_sam2_proposals.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Generate SAM2 proposal tubes for TubeToken Phase 0.
|
| 3 |
+
|
| 4 |
+
The cache format is one NPZ per video:
|
| 5 |
+
masks: uint8 [N, T, H, W]
|
| 6 |
+
scores: float32 [N]
|
| 7 |
+
keyframes: int64 [N]
|
| 8 |
+
boxes_xyxy: float32 [N, 4]
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
from contextlib import nullcontext
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Dict, List, Tuple
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
from PIL import Image
|
| 24 |
+
from tqdm import tqdm
|
| 25 |
+
|
| 26 |
+
from phase0_common import read_metadata, rows_by_video, video_id
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def parse_args() -> argparse.Namespace:
|
| 30 |
+
parser = argparse.ArgumentParser(description="Generate SAM2 proposal tubes.")
|
| 31 |
+
parser.add_argument("--data_dir", type=Path, required=True)
|
| 32 |
+
parser.add_argument("--out_dir", type=Path, required=True)
|
| 33 |
+
parser.add_argument("--splits", type=str, default="test_s,test_u")
|
| 34 |
+
parser.add_argument("--sam2_repo", type=Path, default=None, help="Path to a local facebookresearch/sam2 clone.")
|
| 35 |
+
parser.add_argument("--model_cfg", type=str, default="configs/sam2.1/sam2.1_hiera_l.yaml")
|
| 36 |
+
parser.add_argument("--checkpoint", type=Path, required=True)
|
| 37 |
+
parser.add_argument("--seed_proposal_dir", type=Path, default=None, help="Reuse boxes/keyframes/scores from an existing proposal cache and only rerun propagation.")
|
| 38 |
+
parser.add_argument("--device", type=str, default="cuda")
|
| 39 |
+
parser.add_argument("--amp_dtype", type=str, default="bf16", choices=["none", "bf16", "fp16"])
|
| 40 |
+
parser.add_argument("--frames", type=int, default=10)
|
| 41 |
+
parser.add_argument("--stride", type=int, default=8)
|
| 42 |
+
parser.add_argument("--max_tubes", type=int, default=128)
|
| 43 |
+
parser.add_argument("--amg_points_per_side", type=int, default=32)
|
| 44 |
+
parser.add_argument("--amg_pred_iou_thresh", type=float, default=0.80)
|
| 45 |
+
parser.add_argument("--amg_stability_score_thresh", type=float, default=0.88)
|
| 46 |
+
parser.add_argument("--min_mask_area", type=int, default=64)
|
| 47 |
+
parser.add_argument("--limit_videos", type=int, default=0)
|
| 48 |
+
parser.add_argument("--video_list", type=Path, default=None)
|
| 49 |
+
parser.add_argument("--shard_id", type=int, default=0)
|
| 50 |
+
parser.add_argument("--num_shards", type=int, default=1)
|
| 51 |
+
parser.add_argument("--quiet_sam2", action="store_true")
|
| 52 |
+
parser.add_argument("--bidirectional", action="store_true", default=True)
|
| 53 |
+
parser.set_defaults(group_by_keyframe=False)
|
| 54 |
+
parser.add_argument("--group_by_keyframe", dest="group_by_keyframe", action="store_true")
|
| 55 |
+
parser.add_argument("--no_group_by_keyframe", dest="group_by_keyframe", action="store_false")
|
| 56 |
+
parser.add_argument("--overwrite", action="store_true")
|
| 57 |
+
return parser.parse_args()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def import_sam2(repo: Path | None):
    """Import the SAM2 factory entry points.

    If `repo` is given, it is prepended to sys.path so a local
    facebookresearch/sam2 checkout takes precedence over any installed copy.
    Returns (SAM2AutomaticMaskGenerator, build_sam2, build_sam2_video_predictor).
    """
    if repo is not None:
        sys.path.insert(0, str(repo))
    from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
    from sam2.build_sam import build_sam2, build_sam2_video_predictor

    return SAM2AutomaticMaskGenerator, build_sam2, build_sam2_video_predictor
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def keyframes_for_stride(frames: int, stride: int) -> List[int]:
    """Return sorted, unique keyframe indices: every `stride`-th frame plus the midpoint.

    Raises ValueError when `stride` is not positive. Indices outside
    [0, frames) are discarded, so frames == 0 yields an empty list.
    """
    if stride <= 0:
        raise ValueError("--stride must be positive")
    picks = set(range(0, frames, stride))
    # Always consider the middle frame so short strides still cover the center.
    picks.add(frames // 2)
    return sorted(i for i in picks if 0 <= i < frames)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def load_rgb(path: Path) -> np.ndarray:
    """Decode the image at `path` into an (H, W, 3) uint8 RGB array."""
    with Image.open(path) as img:
        return np.array(img.convert("RGB"))
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def proposal_score(item: dict) -> float:
    """Rank an AMG mask: predicted IoU dominates; stability and (capped) area break ties."""
    iou_term = float(item.get("predicted_iou", 0.0))
    stability_term = 0.1 * float(item.get("stability_score", 0.0))
    # Area contributes at most 0.01 so it only orders near-identical scores.
    area_term = min(float(item.get("area", 0.0)) / 1_000_000.0, 0.01)
    return iou_term + stability_term + area_term
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def xywh_to_xyxy(box: List[float]) -> np.ndarray:
    """Convert an (x, y, w, h) box into a float32 (x1, y1, x2, y2) array."""
    x1, y1, width, height = box
    return np.asarray([x1, y1, x1 + width, y1 + height], dtype=np.float32)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def amp_context(device: str, amp_dtype: str):
    """Autocast context for CUDA runs; a no-op context on CPU or when AMP is disabled."""
    use_amp = device.startswith("cuda") and amp_dtype != "none"
    if not use_amp:
        return nullcontext()
    chosen = torch.bfloat16 if amp_dtype == "bf16" else torch.float16
    return torch.autocast("cuda", dtype=chosen)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def collect_keyframe_masks(mask_generator, data_dir: Path, vid: str, keyframes: List[int], min_area: int, device: str, amp_dtype: str) -> List[dict]:
    """Run SAM2 automatic mask generation on each keyframe and return box proposals.

    Each proposal dict carries the source keyframe, an xyxy box, a ranking
    score (see proposal_score), and the mask pixel area. The returned list is
    sorted best-first by score.
    """
    proposals: List[dict] = []
    for kf in keyframes:
        image = load_rgb(data_dir / "media" / vid / "frames" / f"{kf}.jpg")
        with torch.inference_mode(), amp_context(device, amp_dtype):
            masks = mask_generator.generate(image)
        for m in masks:
            # Fall back to counting segmentation pixels when AMG omits "area".
            area = int(m.get("area", np.asarray(m["segmentation"]).sum()))
            if area < min_area:
                continue  # drop tiny masks below the configured area floor
            bbox = xywh_to_xyxy(m["bbox"])
            proposals.append(
                {
                    "keyframe": kf,
                    "box": bbox,
                    "score": proposal_score(m),
                    "area": area,
                }
            )
    proposals.sort(key=lambda x: x["score"], reverse=True)
    return proposals
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def load_seed_proposals(seed_dir: Path, vid: str) -> List[dict] | None:
    """Load cached proposals for `vid` from `seed_dir`, or None when no cache exists.

    Reads xyxy boxes, keyframes, and scores from "<vid>.npz" and returns
    proposal dicts sorted best-first by score. Cached entries carry no mask
    area, so "area" is reported as 0.
    """
    path = seed_dir / f"{vid}.npz"
    if not path.exists():
        return None
    # np.load on an .npz returns a lazy NpzFile backed by an open file handle;
    # use it as a context manager so the handle is closed deterministically.
    with np.load(path) as data:
        boxes = data["boxes_xyxy"]
        keyframes = data["keyframes"]
        scores = data["scores"]
    proposals = [
        {
            "keyframe": int(keyframe),
            "box": box.astype(np.float32),
            "score": float(score),
            "area": 0,
        }
        for box, keyframe, score in zip(boxes, keyframes, scores)
    ]
    proposals.sort(key=lambda x: x["score"], reverse=True)
    return proposals
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def add_box_prompt(predictor, state, frame_idx: int, obj_id: int, box: np.ndarray):
    """Attach a box prompt to `state`, tolerating both SAM2 call signatures."""
    kwargs = dict(inference_state=state, frame_idx=frame_idx, obj_id=obj_id, box=box)
    try:
        return predictor.add_new_points_or_box(**kwargs)
    except TypeError:
        # Older SAM2 builds take positional arguments instead of keywords.
        return predictor.add_new_points_or_box(state, frame_idx, obj_id, box=box)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def logits_to_mask(logits) -> np.ndarray:
    """Threshold raw mask logits at zero into a 2-D boolean mask.

    Accepts tensors or arrays; leading dimensions are peeled off (first slice)
    until an (H, W) array remains.
    """
    if hasattr(logits, "detach"):
        logits = logits.detach().cpu().numpy()
    plane = np.asarray(logits)
    while plane.ndim > 2:
        plane = plane[0]
    return plane > 0
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def run_propagation(state, predictor, masks_by_obj: Dict[int, List[np.ndarray]], frames: int, reverse: bool, start_frame_idx: int | None = None) -> None:
|
| 170 |
+
for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
|
| 171 |
+
state,
|
| 172 |
+
start_frame_idx=start_frame_idx,
|
| 173 |
+
reverse=reverse,
|
| 174 |
+
):
|
| 175 |
+
if hasattr(out_obj_ids, "detach"):
|
| 176 |
+
out_obj_ids = out_obj_ids.detach().cpu().tolist()
|
| 177 |
+
for pos, obj_id in enumerate(list(out_obj_ids)):
|
| 178 |
+
if 0 <= int(out_frame_idx) < frames:
|
| 179 |
+
masks_by_obj[int(obj_id)][int(out_frame_idx)] = logits_to_mask(out_mask_logits[pos])
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def propagate_proposal_group(predictor, video_dir: Path, proposals: List[dict], frames: int, device: str, amp_dtype: str, bidirectional: bool) -> np.ndarray:
    """Propagate a batch of box proposals through a video with one SAM2 session.

    Returns a (num_tubes, frames, H, W) uint8 array. Proposals whose
    propagation produced no mask on any frame are dropped, so the first
    dimension can be smaller than len(proposals).
    """
    with torch.inference_mode(), amp_context(device, amp_dtype):
        state = predictor.init_state(video_path=str(video_dir))
        # Register every proposal box at its source keyframe; obj_id is the
        # proposal's position in the input list.
        for obj_id, proposal in enumerate(proposals):
            add_box_prompt(predictor, state, int(proposal["keyframe"]), obj_id, proposal["box"])

        masks_by_obj: Dict[int, List[np.ndarray]] = {
            obj_id: [None for _ in range(frames)] for obj_id in range(len(proposals))
        }
        run_propagation(state, predictor, masks_by_obj, frames, reverse=False)
        if bidirectional:
            # Second, backward pass starting from the latest prompted keyframe
            # so frames before the prompts also receive masks.
            start_frame_idx = max(int(p["keyframe"]) for p in proposals)
            run_propagation(state, predictor, masks_by_obj, frames, reverse=True, start_frame_idx=start_frame_idx)

        # Some SAM2 versions only propagate forward. Fill missing frames with the
        # nearest available mask so Phase 0 can still score temporal purity.
        tube_masks = []
        for obj_id in range(len(proposals)):
            masks = masks_by_obj[obj_id]
            known = [i for i, m in enumerate(masks) if m is not None]
            if not known:
                continue  # proposal never produced a mask anywhere: drop the tube
            for t in range(frames):
                if masks[t] is None:
                    nearest = min(known, key=lambda k: abs(k - t))
                    masks[t] = masks[nearest]
            tube_masks.append(np.stack(masks, axis=0))
        if not tube_masks:
            # No tube survived: empty array with a 1x1 placeholder frame size.
            return np.zeros((0, frames, 1, 1), dtype=np.uint8)
        return np.stack(tube_masks, axis=0).astype(np.uint8)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def propagate_boxes(
    predictor,
    video_dir: Path,
    proposals: List[dict],
    frames: int,
    device: str,
    amp_dtype: str,
    bidirectional: bool,
    group_by_keyframe: bool,
) -> np.ndarray:
    """Propagate proposals, either all at once or one SAM2 session per keyframe.

    With group_by_keyframe, proposals prompted at different keyframes never
    share an inference state; the output preserves the original proposal order
    and zero-fills tubes whose group dropped them.
    """
    if not group_by_keyframe:
        return propagate_proposal_group(predictor, video_dir, proposals, frames, device, amp_dtype, bidirectional)

    # Bucket proposals by keyframe, remembering each proposal's global index.
    grouped: Dict[int, List[Tuple[int, dict]]] = {}
    for idx, proposal in enumerate(proposals):
        grouped.setdefault(int(proposal["keyframe"]), []).append((idx, proposal))

    ordered_masks: List[np.ndarray | None] = [None for _ in proposals]
    for _, indexed_group in sorted(grouped.items()):
        group_indices = [idx for idx, _ in indexed_group]
        group_props = [proposal for _, proposal in indexed_group]
        group_masks = propagate_proposal_group(predictor, video_dir, group_props, frames, device, amp_dtype, bidirectional)
        # NOTE(review): propagate_proposal_group drops failed tubes by skipping
        # them, so the surviving masks are not necessarily the *leading*
        # proposals of the group — this prefix mapping can misassign tubes when
        # a middle proposal is dropped. Confirm whether drops occur in practice.
        for local_idx, global_idx in enumerate(group_indices[: group_masks.shape[0]]):
            ordered_masks[global_idx] = group_masks[local_idx]

    known = [mask for mask in ordered_masks if mask is not None]
    if not known:
        return np.zeros((0, frames, 1, 1), dtype=np.uint8)
    # Zero-fill any dropped proposal at the surviving masks' resolution.
    h, w = known[0].shape[-2:]
    final = [mask if mask is not None else np.zeros((frames, h, w), dtype=np.uint8) for mask in ordered_masks]
    return np.stack(final, axis=0).astype(np.uint8)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def load_video_list(path: Path | None) -> List[str] | None:
    """Read one video id per line from `path`; blank lines and '#' comments are skipped.

    Returns None when no list file was given.
    """
    if path is None:
        return None
    stripped = (raw.strip() for raw in path.read_text().splitlines())
    return [entry for entry in stripped if entry and not entry.startswith("#")]
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def main() -> None:
    """Generate SAM2 tube-mask proposals for every selected video.

    Pipeline per video: collect box proposals (AMG on keyframes, or reuse a
    seed cache), propagate them through the clip, and save masks/scores/
    keyframes/boxes to "<out_dir>/<vid>.npz". A manifest.json summarizing all
    items is written at the end.
    """
    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    SAM2AutomaticMaskGenerator, build_sam2, build_sam2_video_predictor = import_sam2(args.sam2_repo)

    # The image-level AMG model is only needed when proposals are generated
    # from scratch; seed caches skip it entirely.
    mask_generator = None
    if args.seed_proposal_dir is None:
        image_model = build_sam2(args.model_cfg, str(args.checkpoint), device=args.device)
        mask_generator = SAM2AutomaticMaskGenerator(
            image_model,
            points_per_side=args.amg_points_per_side,
            pred_iou_thresh=args.amg_pred_iou_thresh,
            stability_score_thresh=args.amg_stability_score_thresh,
        )
    video_predictor = build_sam2_video_predictor(args.model_cfg, str(args.checkpoint), device=args.device)

    # Select videos: requested splits, optional explicit list, sharding, limit.
    splits = [s.strip() for s in args.splits.split(",") if s.strip()]
    rows = read_metadata(args.data_dir, splits)
    vids = sorted(rows_by_video(rows).keys())
    selected_vids = load_video_list(args.video_list)
    if selected_vids is not None:
        selected = set(selected_vids)
        vids = [vid for vid in vids if vid in selected]
    if args.num_shards < 1:
        raise ValueError("--num_shards must be >= 1")
    if args.shard_id < 0 or args.shard_id >= args.num_shards:
        raise ValueError("--shard_id must be in [0, num_shards)")
    if args.num_shards > 1:
        # Round-robin sharding keeps shards balanced for sorted vid lists.
        vids = [vid for idx, vid in enumerate(vids) if idx % args.num_shards == args.shard_id]
    if args.limit_videos:
        vids = vids[: args.limit_videos]
    keyframes = keyframes_for_stride(args.frames, args.stride)

    manifest = {
        "data_dir": str(args.data_dir),
        "splits": splits,
        "model_cfg": args.model_cfg,
        "checkpoint": str(args.checkpoint),
        "stride": args.stride,
        "keyframes": keyframes,
        "max_tubes": args.max_tubes,
        "videos": len(vids),
        "items": [],
    }

    for vid in tqdm(vids, desc="Generating SAM2 proposals"):
        out_path = args.out_dir / f"{vid}.npz"
        if out_path.exists() and not args.overwrite:
            # Resume-friendly: existing outputs are recorded but not recomputed.
            manifest["items"].append({"vid": vid, "path": str(out_path), "skipped": True})
            continue

        start = time.perf_counter()
        proposals = load_seed_proposals(args.seed_proposal_dir, vid) if args.seed_proposal_dir is not None else None
        if proposals is None:
            proposals = collect_keyframe_masks(mask_generator, args.data_dir, vid, keyframes, args.min_mask_area, args.device, args.amp_dtype)
        proposals = proposals[: args.max_tubes]
        if args.quiet_sam2:
            from contextlib import redirect_stdout, redirect_stderr
            import io

            # Swallow SAM2's per-frame progress chatter during propagation.
            with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
                if proposals:
                    masks = propagate_boxes(video_predictor, args.data_dir / "media" / vid / "frames", proposals, args.frames, args.device, args.amp_dtype, args.bidirectional, args.group_by_keyframe)
                else:
                    # No proposals: probe frame 0 for the clip's resolution so
                    # the empty mask array still has the right H/W.
                    first = load_rgb(args.data_dir / "media" / vid / "frames" / "0.jpg")
                    h, w = first.shape[:2]
                    masks = np.zeros((0, args.frames, h, w), dtype=np.uint8)
        else:
            if proposals:
                masks = propagate_boxes(video_predictor, args.data_dir / "media" / vid / "frames", proposals, args.frames, args.device, args.amp_dtype, args.bidirectional, args.group_by_keyframe)
            else:
                first = load_rgb(args.data_dir / "media" / vid / "frames" / "0.jpg")
                h, w = first.shape[:2]
                masks = np.zeros((0, args.frames, h, w), dtype=np.uint8)

        # Propagation may drop tubes; keep proposals and masks aligned.
        n = min(len(proposals), masks.shape[0])
        proposals = proposals[:n]
        masks = masks[:n]
        scores = np.array([p["score"] for p in proposals], dtype=np.float32)
        boxes = np.stack([p["box"] for p in proposals], axis=0).astype(np.float32) if proposals else np.zeros((0, 4), dtype=np.float32)
        proposal_keyframes = np.array([p["keyframe"] for p in proposals], dtype=np.int64)
        np.savez_compressed(
            out_path,
            masks=masks,
            scores=scores,
            keyframes=proposal_keyframes,
            boxes_xyxy=boxes,
        )
        elapsed = time.perf_counter() - start
        manifest["items"].append({"vid": vid, "path": str(out_path), "tubes": int(n), "seconds": elapsed})

    with (args.out_dir / "manifest.json").open("w") as f:
        import json

        json.dump(manifest, f, indent=2)


if __name__ == "__main__":
    main()
|
tools/tubetoken/phase0_common.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared utilities for TubeToken Phase 0 experiments."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import csv
|
| 6 |
+
import json
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
from PIL import Image
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
import cv2
|
| 16 |
+
except Exception: # pragma: no cover
|
| 17 |
+
cv2 = None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Splits used for Phase 0 evaluation (seen / unseen test splits).
VALID_EVAL_SPLITS = {"test_s", "test_u"}
# All splits that may appear in metadata.csv.
VALID_DATA_SPLITS = {"train", "val", "test_s", "test_u", "test_n"}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def read_metadata(data_dir: Path, splits: Optional[Iterable[str]] = None) -> List[dict]:
    """Read metadata.csv under `data_dir`, optionally keeping only the given splits."""
    with (data_dir / "metadata.csv").open("r", newline="") as handle:
        rows = list(csv.DictReader(handle))
    if not splits:
        return rows
    wanted = set(splits)
    return [row for row in rows if row["split"] in wanted]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def video_id(row: dict) -> str:
    """Video id: the explicit "vid" column when present, else uid minus its last two '_' fields."""
    explicit = row.get("vid")
    if explicit:
        return explicit
    return row["uid"].rsplit("_", 2)[0]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def fid_value(row: dict) -> str:
    """Return the row's "fid" field as a stripped string ("" when absent)."""
    raw = row.get("fid", "")
    return str(raw).strip()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def category_from_uid(row: dict) -> str:
    """Recover the category token embedded in a uid shaped like "<vid>_<category>_<n>"."""
    uid = row.get("uid", "")
    # Same resolution rule as video_id: explicit column wins over uid parsing.
    vid = row.get("vid") or row["uid"].rsplit("_", 2)[0]
    if uid.startswith(vid + "_"):
        suffix = uid[len(vid) + 1 :]
    else:
        suffix = uid.rsplit("_", 2)[-2]
    return suffix.rsplit("_", 1)[0] if "_" in suffix else suffix
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def rows_by_video(rows: Sequence[dict]) -> Dict[str, List[dict]]:
    """Group metadata rows into {video_id: [rows]} (returned as a defaultdict)."""
    grouped: Dict[str, List[dict]] = defaultdict(list)
    for row in rows:
        key = row.get("vid") or row["uid"].rsplit("_", 2)[0]
        grouped[key].append(row)
    return grouped
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def load_mask(path: Path) -> np.ndarray:
    """Load a mask image as grayscale and binarize it (any nonzero pixel -> True)."""
    with Image.open(path) as img:
        return np.array(img.convert("L")) > 0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def load_gt_tube(data_dir: Path, vid: str, fid: str, frames: int = 10) -> np.ndarray:
    """Stack the per-frame ground-truth masks for (vid, fid) into a (frames, H, W) bool array.

    NOTE(review): the filename is built as "0000" + t, which is 5-digit
    zero-padding only while t < 10; if frames > 10 is ever used, confirm the
    dataset's naming convention (e.g. "{t:05d}.png") before relying on this.
    """
    masks = []
    for t in range(frames):
        path = data_dir / "gt_mask" / vid / f"fid_{fid}" / f"0000{t}.png"
        masks.append(load_mask(path))
    return np.stack(masks, axis=0)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def mask_iou(pred: np.ndarray, gt: np.ndarray) -> float:
    """IoU of two binary masks; defined as 1.0 when both are empty."""
    a = pred.astype(bool)
    b = gt.astype(bool)
    union = np.logical_or(a, b).sum()
    if union == 0:
        # Both masks empty: treat as perfect agreement.
        return 1.0
    return float(np.logical_and(a, b).sum() / union)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def tube_iou_visible(pred_tube: np.ndarray, gt_tube: np.ndarray) -> float:
    """Mean per-frame IoU over frames where the GT mask is non-empty (0.0 when none are)."""
    flattened = gt_tube.reshape(gt_tube.shape[0], -1)
    visible_frames = np.where(flattened.sum(axis=1) > 0)[0]
    if visible_frames.size == 0:
        return 0.0
    vals = [mask_iou(pred_tube[t], gt_tube[t]) for t in visible_frames]
    return float(np.mean(vals)) if vals else 0.0
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def tube_iou_all(pred_tube: np.ndarray, gt_tube: np.ndarray) -> float:
    """Mean per-frame IoU across every frame of the tube (0.0 for zero-frame tubes)."""
    frame_count = gt_tube.shape[0]
    if frame_count == 0:
        return 0.0
    per_frame = [mask_iou(pred_tube[t], gt_tube[t]) for t in range(frame_count)]
    return float(np.mean(per_frame))
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def db_eval_iou(annotation: np.ndarray, segmentation: np.ndarray) -> float:
    """DAVIS-style J measure: IoU with 1.0 when both masks are empty, 0.0 when only one is."""
    gt = annotation.astype(bool)
    pred = segmentation.astype(bool)
    gt_empty = gt.sum() == 0
    pred_empty = pred.sum() == 0
    if gt_empty and pred_empty:
        return 1.0
    if gt_empty or pred_empty:
        return 0.0
    inter = np.logical_and(gt, pred).sum()
    union = np.logical_or(gt, pred).sum()
    return float(inter / union) if union > 0 else 0.0
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def db_eval_boundary(annotation: np.ndarray, segmentation: np.ndarray, bound_th: float = 0.008) -> float:
    """DAVIS-style F measure: boundary precision/recall F1 between two masks.

    `bound_th` is a fraction of the image diagonal that sets the match
    tolerance in pixels. Empty-mask conventions mirror db_eval_iou. Uses cv2
    when available, else a pure-NumPy fallback with identical logic.
    """
    annotation = annotation.astype(bool)
    segmentation = segmentation.astype(bool)
    if annotation.sum() == 0 and segmentation.sum() == 0:
        return 1.0
    if annotation.sum() == 0 or segmentation.sum() == 0:
        return 0.0

    # Tolerance radius: fraction of the diagonal, at least one pixel.
    bound_pix = max(1, int(round(bound_th * np.linalg.norm(annotation.shape))))
    if cv2 is not None:
        fg_boundary = mask_to_boundary_cv2(annotation, bound_pix)
        seg_boundary = mask_to_boundary_cv2(segmentation, bound_pix)
        kernel = np.ones((2 * bound_pix + 1, 2 * bound_pix + 1), dtype=np.uint8)
        # Dilate each boundary so matches within bound_pix count as hits.
        fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), kernel, iterations=1).astype(bool)
        seg_dil = cv2.dilate(seg_boundary.astype(np.uint8), kernel, iterations=1).astype(bool)
        gt_match = np.logical_and(fg_boundary, seg_dil).sum()  # recalled GT boundary pixels
        pred_match = np.logical_and(seg_boundary, fg_dil).sum()  # precise predicted boundary pixels
        n_fg = fg_boundary.sum()
        n_pred = seg_boundary.sum()
        if n_fg == 0 and n_pred == 0:
            return 1.0
        if n_fg == 0 or n_pred == 0:
            return 0.0
        precision = pred_match / n_pred
        recall = gt_match / n_fg
        if precision + recall == 0:
            return 0.0
        return float(2 * precision * recall / (precision + recall))

    # Pure-NumPy fallback: same computation without cv2.
    fg_boundary = mask_to_boundary(annotation, bound_pix)
    seg_boundary = mask_to_boundary(segmentation, bound_pix)
    fg_dil = binary_dilate(fg_boundary, bound_pix)
    seg_dil = binary_dilate(seg_boundary, bound_pix)

    gt_match = np.logical_and(fg_boundary, seg_dil).sum()
    pred_match = np.logical_and(seg_boundary, fg_dil).sum()
    n_fg = fg_boundary.sum()
    n_pred = seg_boundary.sum()
    if n_fg == 0 and n_pred == 0:
        return 1.0
    if n_fg == 0 or n_pred == 0:
        return 0.0
    precision = pred_match / n_pred
    recall = gt_match / n_fg
    if precision + recall == 0:
        return 0.0
    return float(2 * precision * recall / (precision + recall))
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def mask_to_boundary_cv2(mask: np.ndarray, dilation: int) -> np.ndarray:
    """Boundary band via cv2: the mask pixels a (2*dilation+1)^2 erosion removes.

    Only called from the cv2 branch of db_eval_boundary, so cv2 is assumed
    importable here.
    """
    kernel = np.ones((2 * dilation + 1, 2 * dilation + 1), dtype=np.uint8)
    eroded = cv2.erode(mask.astype(np.uint8), kernel, iterations=1).astype(bool)
    return np.logical_xor(mask.astype(bool), eroded)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def mask_to_boundary(mask: np.ndarray, dilation: int) -> np.ndarray:
    """Pure-NumPy boundary band: mask pixels that an erosion of radius `dilation` removes."""
    shrunk = binary_erode(mask, dilation)
    return np.logical_xor(mask, shrunk)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def binary_erode(mask: np.ndarray, radius: int) -> np.ndarray:
    """Binary erosion with a square (2*radius+1) structuring element (zero-padded edges)."""
    base = mask.astype(bool)
    padded = np.pad(base, radius, mode="constant", constant_values=False)
    result = np.ones_like(base, dtype=bool)
    rows, cols = base.shape
    window = 2 * radius + 1
    # AND together every shifted view of the padded mask.
    for dy in range(window):
        for dx in range(window):
            result &= padded[dy : dy + rows, dx : dx + cols]
    return result
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def binary_dilate(mask: np.ndarray, radius: int) -> np.ndarray:
    """Binary dilation with a square (2*radius+1) structuring element (zero-padded edges)."""
    base = mask.astype(bool)
    padded = np.pad(base, radius, mode="constant", constant_values=False)
    result = np.zeros_like(base, dtype=bool)
    rows, cols = base.shape
    window = 2 * radius + 1
    # OR together every shifted view of the padded mask.
    for dy in range(window):
        for dx in range(window):
            result |= padded[dy : dy + rows, dx : dx + cols]
    return result
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def evaluate_tube_jf(pred_tube: np.ndarray, gt_tube: np.ndarray) -> Tuple[float, float, float]:
    """Per-tube DAVIS metrics: mean J (region IoU), mean F (boundary F1), and their average."""
    frame_ids = range(gt_tube.shape[0])
    j_scores = [db_eval_iou(gt_tube[t], pred_tube[t]) for t in frame_ids]
    f_scores = [db_eval_boundary(gt_tube[t], pred_tube[t]) for t in frame_ids]
    j_mean = float(np.mean(j_scores)) if j_scores else 0.0
    f_mean = float(np.mean(f_scores)) if f_scores else 0.0
    return j_mean, f_mean, (j_mean + f_mean) / 2
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def bbox_from_mask(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
    """Tight inclusive (x1, y1, x2, y2) box around the mask's True pixels, or None if empty."""
    rows, cols = np.nonzero(mask.astype(bool))
    if cols.size == 0:
        return None
    return int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def load_audit_rows(audit_csv: Path) -> Dict[str, dict]:
    """Index audit CSV rows by their "uid" column; empty dict when the file is absent."""
    if not audit_csv.exists():
        return {}
    with audit_csv.open("r", newline="") as handle:
        return {row["uid"]: row for row in csv.DictReader(handle)}
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def bool_field(row: Optional[dict], key: str) -> bool:
    """Interpret a CSV cell as a boolean: "1"/"true"/"yes" (case-insensitive) are truthy."""
    if not row:
        return False
    cell = str(row.get(key, "")).lower()
    return cell in {"1", "true", "yes"}
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def write_json(path: Path, obj: dict) -> None:
    """Serialize `obj` as pretty, key-sorted JSON at `path`, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        json.dump(obj, handle, indent=2, sort_keys=True)
|