Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- ChatUniVi/model/multimodal_encoder/__pycache__/processor.cpython-310.pyc +0 -0
- configs/__pycache__/__init__.cpython-310.pyc +0 -0
- configs/config.py +1 -0
- datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- models/__pycache__/__init__.cpython-310.pyc +0 -0
- models/__pycache__/avs_model.cpython-310.pyc +0 -0
- models/llava/__pycache__/__init__.cpython-310.pyc +0 -0
- models/llava/__pycache__/conversation.cpython-310.pyc +0 -0
- models/llava/model/__pycache__/__init__.cpython-310.pyc +0 -0
- models/llava/model/__pycache__/llava_arch.cpython-310.pyc +0 -0
- models/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc +0 -0
- models/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc +0 -0
- models/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc +0 -0
- models/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
- models/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
- models/segment_anything/__pycache__/__init__.cpython-310.pyc +0 -0
- models/segment_anything/__pycache__/automatic_mask_generator.cpython-310.pyc +0 -0
- models/segment_anything/__pycache__/build_sam.cpython-310.pyc +0 -0
- models/segment_anything/__pycache__/predictor.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/__init__.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/common.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/image_encoder.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/mask_decoder.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/prompt_encoder.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/sam.cpython-310.pyc +0 -0
- models/segment_anything/modeling/__pycache__/transformer.cpython-310.pyc +0 -0
- models/segment_anything/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- models/segment_anything/utils/__pycache__/amg.cpython-310.pyc +0 -0
- models/tf/__pycache__/modeling_outputs.cpython-310.pyc +0 -0
- runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz +3 -0
- runs/tubetoken_phase_minus1/audit_full.log +47 -0
- runs/tubetoken_phase_minus1/audit_full/audit_report.md +34 -0
- runs/tubetoken_phase_minus1/audit_full/audit_samples.csv +0 -0
- tools/audit_refavs.py +371 -0
- tools/tubetoken/__pycache__/evaluate_oracle_refine_sam2.cpython-312.pyc +0 -0
- tools/tubetoken/evaluate_oracle_refine_sam2.py +203 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/metric/__pycache__/pyutils.cpython-310.pyc +0 -0
- utils/metric/__pycache__/utility.cpython-310.pyc +0 -0
.gitattributes
CHANGED
|
@@ -6,3 +6,4 @@
|
|
| 6 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 7 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 8 |
ChatUniVi/eval/questions/scienceqa/problems.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 6 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 7 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 8 |
ChatUniVi/eval/questions/scienceqa/problems.json filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz filter=lfs diff=lfs merge=lfs -text
|
ChatUniVi/model/multimodal_encoder/__pycache__/processor.cpython-310.pyc
ADDED
|
Binary file (2.38 kB). View file
|
|
|
configs/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (167 Bytes). View file
|
|
|
configs/config.py
CHANGED
|
@@ -53,6 +53,7 @@ parser.add_argument("--log_root",type=str,default='log', help="where to save log
|
|
| 53 |
parser.add_argument("--checkpoint_root",type=str,default='checkpoints', help="where to save trained checkpoints during training")
|
| 54 |
|
| 55 |
parser.add_argument("--visualization_root",type=str,default='visualization', help="where to save visualization result during test")
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
|
|
|
|
| 53 |
parser.add_argument("--checkpoint_root",type=str,default='checkpoints', help="where to save trained checkpoints during training")
|
| 54 |
|
| 55 |
parser.add_argument("--visualization_root",type=str,default='visualization', help="where to save visualization result during test")
|
| 56 |
+
parser.add_argument("--eval_splits",type=str,default='test_s,test_u,test_n', help="comma-separated eval splits for load_model.py: test_s,test_u,test_n")
|
| 57 |
|
| 58 |
|
| 59 |
|
datasets/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (178 Bytes). View file
|
|
|
models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (131 Bytes). View file
|
|
|
models/__pycache__/avs_model.cpython-310.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
models/llava/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (188 Bytes). View file
|
|
|
models/llava/__pycache__/conversation.cpython-310.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
models/llava/model/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (330 Bytes). View file
|
|
|
models/llava/model/__pycache__/llava_arch.cpython-310.pyc
ADDED
|
Binary file (8.2 kB). View file
|
|
|
models/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc
ADDED
|
Binary file (3.6 kB). View file
|
|
|
models/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc
ADDED
|
Binary file (4.85 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc
ADDED
|
Binary file (2.24 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc
ADDED
|
Binary file (12.3 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc
ADDED
|
Binary file (2.9 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc
ADDED
|
Binary file (8.86 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc
ADDED
|
Binary file (757 Bytes). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc
ADDED
|
Binary file (21.4 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc
ADDED
|
Binary file (19.8 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc
ADDED
|
Binary file (3.83 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc
ADDED
|
Binary file (15.7 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc
ADDED
|
Binary file (3 kB). View file
|
|
|
models/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc
ADDED
|
Binary file (9.31 kB). View file
|
|
|
models/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc
ADDED
|
Binary file (571 Bytes). View file
|
|
|
models/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc
ADDED
|
Binary file (3.03 kB). View file
|
|
|
models/segment_anything/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (407 Bytes). View file
|
|
|
models/segment_anything/__pycache__/automatic_mask_generator.cpython-310.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
models/segment_anything/__pycache__/build_sam.cpython-310.pyc
ADDED
|
Binary file (2.17 kB). View file
|
|
|
models/segment_anything/__pycache__/predictor.cpython-310.pyc
ADDED
|
Binary file (9.98 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (394 Bytes). View file
|
|
|
models/segment_anything/modeling/__pycache__/common.cpython-310.pyc
ADDED
|
Binary file (1.75 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/image_encoder.cpython-310.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/mask_decoder.cpython-310.pyc
ADDED
|
Binary file (6.26 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/prompt_encoder.cpython-310.pyc
ADDED
|
Binary file (7.85 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/sam.cpython-310.pyc
ADDED
|
Binary file (6.7 kB). View file
|
|
|
models/segment_anything/modeling/__pycache__/transformer.cpython-310.pyc
ADDED
|
Binary file (6.61 kB). View file
|
|
|
models/segment_anything/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (154 Bytes). View file
|
|
|
models/segment_anything/utils/__pycache__/amg.cpython-310.pyc
ADDED
|
Binary file (12.1 kB). View file
|
|
|
models/tf/__pycache__/modeling_outputs.cpython-310.pyc
ADDED
|
Binary file (2.94 kB). View file
|
|
|
runs/tubetoken_phase0/proposals_stride8_n64_bidir/lYwnXP3g050_4000_14000.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f09a2efd79256574c47624d11d5ae5ccff4e267abb961dc63375d862c8db958
|
| 3 |
+
size 1418418
|
runs/tubetoken_phase_minus1/audit_full.log
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"area_unstable_expressions": 41,
|
| 3 |
+
"audio_keyword_expressions": 15890,
|
| 4 |
+
"audio_keyword_percent": 77.66753018231586,
|
| 5 |
+
"data_dir": "/workspace/SimToken/data",
|
| 6 |
+
"expressions_per_object": {
|
| 7 |
+
"ge2": 5836,
|
| 8 |
+
"ge3": 4206,
|
| 9 |
+
"max": 10,
|
| 10 |
+
"mean": 2.742125720412813,
|
| 11 |
+
"median": 3
|
| 12 |
+
},
|
| 13 |
+
"expressions_per_video": {
|
| 14 |
+
"ge2": 3521,
|
| 15 |
+
"ge3": 3381,
|
| 16 |
+
"max": 26,
|
| 17 |
+
"mean": 5.7243984331281474,
|
| 18 |
+
"median": 6.0
|
| 19 |
+
},
|
| 20 |
+
"h3_candidate_expressions": 18614,
|
| 21 |
+
"h3_candidate_objects": 5781,
|
| 22 |
+
"late_target_expressions": 0,
|
| 23 |
+
"mask_rows_audited": 20459,
|
| 24 |
+
"multi_expression_objects": 5836,
|
| 25 |
+
"multi_expression_videos": 3521,
|
| 26 |
+
"null_split_expressions": 1028,
|
| 27 |
+
"null_split_percent": 5.0246835133682,
|
| 28 |
+
"num_expressions": 20459,
|
| 29 |
+
"num_objects_vid_fid": 7461,
|
| 30 |
+
"num_videos": 3574,
|
| 31 |
+
"partial_target_expressions": 33,
|
| 32 |
+
"same_category_distractor_heuristic_expressions": 2563,
|
| 33 |
+
"same_category_distractor_heuristic_percent": 12.527494012415074,
|
| 34 |
+
"small_target_expressions": 10037,
|
| 35 |
+
"spatial_keyword_expressions": 5924,
|
| 36 |
+
"spatial_keyword_percent": 28.955471919448655,
|
| 37 |
+
"splits": {
|
| 38 |
+
"TODO": 25,
|
| 39 |
+
"test_n": 1028,
|
| 40 |
+
"test_s": 2288,
|
| 41 |
+
"test_u": 1656,
|
| 42 |
+
"train": 14113,
|
| 43 |
+
"val": 1349
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
Wrote audit files to: /workspace/SimToken/runs/tubetoken_phase_minus1/audit_full
|
runs/tubetoken_phase_minus1/audit_full/audit_report.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TubeToken Phase -1 Audit
|
| 2 |
+
|
| 3 |
+
- Expressions: 20459
|
| 4 |
+
- Videos: 3574
|
| 5 |
+
- Objects `(vid, fid)`: 7461
|
| 6 |
+
- Splits: `{'val': 1349, 'train': 14113, 'test_s': 2288, 'TODO': 25, 'test_u': 1656, 'test_n': 1028}`
|
| 7 |
+
|
| 8 |
+
## Multi-expression
|
| 9 |
+
|
| 10 |
+
- Expressions/video mean: 5.724
|
| 11 |
+
- Expressions/video median: 6.0
|
| 12 |
+
- Videos with >=2 expressions: 3521
|
| 13 |
+
- Expressions/object mean: 2.742
|
| 14 |
+
- Objects with >=2 expressions: 5836
|
| 15 |
+
- H3 candidate objects: 5781
|
| 16 |
+
- H3 candidate expressions: 18614
|
| 17 |
+
|
| 18 |
+
## Diagnostic Subsets
|
| 19 |
+
|
| 20 |
+
- Null split expressions: 1028 (5.02%)
|
| 21 |
+
- Audio-keyword expressions: 15890 (77.67%)
|
| 22 |
+
- Spatial-keyword expressions: 5924 (28.96%)
|
| 23 |
+
- Same-category distractor heuristic expressions: 2563 (12.53%)
|
| 24 |
+
- Mask rows audited: 20459
|
| 25 |
+
- Late-target expressions: 0
|
| 26 |
+
- Small-target expressions: 10037
|
| 27 |
+
- Partial-target expressions: 33
|
| 28 |
+
- Area-unstable expressions: 41
|
| 29 |
+
|
| 30 |
+
## Phase -1 H3 Decision Hint
|
| 31 |
+
|
| 32 |
+
H3 can stay as a direct validation target: the data has multi-expression structure.
|
| 33 |
+
|
| 34 |
+
Generated files: `audit_summary.json`, `audit_samples.csv`, `h3_candidates.csv`.
|
runs/tubetoken_phase_minus1/audit_full/audit_samples.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tools/audit_refavs.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Audit Ref-AVS style metadata for the TubeToken experiment plan.
|
| 3 |
+
|
| 4 |
+
This script intentionally depends only on the dataset files. It does not import
|
| 5 |
+
the training code, so it can run before model dependencies are fully settled.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import csv
|
| 12 |
+
import json
|
| 13 |
+
import math
|
| 14 |
+
import os
|
| 15 |
+
from collections import Counter, defaultdict
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from statistics import mean, median
|
| 18 |
+
from typing import Dict, Iterable, List, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from PIL import Image
|
| 22 |
+
except Exception: # pragma: no cover - only used as an environment fallback
|
| 23 |
+
Image = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
AUDIO_KEYWORDS = (
|
| 27 |
+
"sound",
|
| 28 |
+
"sounding",
|
| 29 |
+
"making sound",
|
| 30 |
+
"longest sound",
|
| 31 |
+
"intermittent sound",
|
| 32 |
+
"silent",
|
| 33 |
+
"audio",
|
| 34 |
+
"heard",
|
| 35 |
+
"emitting",
|
| 36 |
+
"playing instrument",
|
| 37 |
+
"voice",
|
| 38 |
+
"speaking",
|
| 39 |
+
"talking",
|
| 40 |
+
"singing",
|
| 41 |
+
"barking",
|
| 42 |
+
"meowing",
|
| 43 |
+
"hitting",
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
SPATIAL_KEYWORDS = (
|
| 47 |
+
"left",
|
| 48 |
+
"right",
|
| 49 |
+
"top",
|
| 50 |
+
"bottom",
|
| 51 |
+
"front",
|
| 52 |
+
"back",
|
| 53 |
+
"behind",
|
| 54 |
+
"next to",
|
| 55 |
+
"near",
|
| 56 |
+
"far",
|
| 57 |
+
"middle",
|
| 58 |
+
"center",
|
| 59 |
+
"between",
|
| 60 |
+
"above",
|
| 61 |
+
"below",
|
| 62 |
+
"under",
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def parse_args() -> argparse.Namespace:
|
| 67 |
+
parser = argparse.ArgumentParser(description="Audit Ref-AVS data for TubeToken Phase -1.")
|
| 68 |
+
parser.add_argument("--data_dir", type=Path, default=Path("data"))
|
| 69 |
+
parser.add_argument("--out_dir", type=Path, default=Path("runs/tubetoken_phase_minus1/audit"))
|
| 70 |
+
parser.add_argument("--frames", type=int, default=10)
|
| 71 |
+
parser.add_argument("--small_area", type=float, default=0.05)
|
| 72 |
+
parser.add_argument("--mask_sample_limit", type=int, default=0, help="0 means audit every row.")
|
| 73 |
+
return parser.parse_args()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def read_metadata(path: Path) -> List[dict]:
|
| 77 |
+
with path.open("r", newline="") as f:
|
| 78 |
+
return list(csv.DictReader(f))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def video_id(row: dict) -> str:
|
| 82 |
+
return row.get("vid") or row["uid"].rsplit("_", 2)[0]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def fid_value(row: dict) -> str:
|
| 86 |
+
return str(row.get("fid", "")).strip()
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def object_key(row: dict) -> Tuple[str, str]:
|
| 90 |
+
return video_id(row), fid_value(row)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def category_from_uid(row: dict) -> str:
|
| 94 |
+
vid = video_id(row)
|
| 95 |
+
uid = row.get("uid", "")
|
| 96 |
+
suffix = uid[len(vid) + 1 :] if uid.startswith(vid + "_") else uid.rsplit("_", 2)[-2]
|
| 97 |
+
if "_" in suffix:
|
| 98 |
+
return suffix.rsplit("_", 1)[0]
|
| 99 |
+
return suffix
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def has_any(text: str, keywords: Iterable[str]) -> bool:
|
| 103 |
+
text = text.lower()
|
| 104 |
+
return any(k in text for k in keywords)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def mask_path(data_dir: Path, vid: str, fid: str, t: int) -> Path:
|
| 108 |
+
return data_dir / "gt_mask" / vid / f"fid_{fid}" / f"0000{t}.png"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def read_binary_mask_stats(path: Path) -> Optional[Tuple[int, int, int]]:
|
| 112 |
+
if Image is None or not path.exists():
|
| 113 |
+
return None
|
| 114 |
+
with Image.open(path) as img:
|
| 115 |
+
gray = img.convert("L")
|
| 116 |
+
width, height = gray.size
|
| 117 |
+
hist = gray.histogram()
|
| 118 |
+
positive = sum(hist[1:])
|
| 119 |
+
return positive, width, height
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def row_mask_stats(data_dir: Path, row: dict, frames: int, small_area: float) -> dict:
|
| 123 |
+
vid = video_id(row)
|
| 124 |
+
fid = fid_value(row)
|
| 125 |
+
positives: List[int] = []
|
| 126 |
+
areas: List[float] = []
|
| 127 |
+
missing = 0
|
| 128 |
+
width = height = None
|
| 129 |
+
|
| 130 |
+
for t in range(frames):
|
| 131 |
+
stats = read_binary_mask_stats(mask_path(data_dir, vid, fid, t))
|
| 132 |
+
if stats is None:
|
| 133 |
+
missing += 1
|
| 134 |
+
positives.append(0)
|
| 135 |
+
areas.append(0.0)
|
| 136 |
+
continue
|
| 137 |
+
pos, width, height = stats
|
| 138 |
+
positives.append(pos)
|
| 139 |
+
denom = max(width * height, 1)
|
| 140 |
+
areas.append(pos / denom)
|
| 141 |
+
|
| 142 |
+
visible = [i for i, pos in enumerate(positives) if pos > 0]
|
| 143 |
+
visible_areas = [areas[i] for i in visible]
|
| 144 |
+
first_visible = min(visible) if visible else None
|
| 145 |
+
mean_visible_area = mean(visible_areas) if visible_areas else 0.0
|
| 146 |
+
mean_all_area = mean(areas) if areas else 0.0
|
| 147 |
+
area_cv = 0.0
|
| 148 |
+
if len(visible_areas) > 1 and mean_visible_area > 0:
|
| 149 |
+
var = sum((x - mean_visible_area) ** 2 for x in visible_areas) / len(visible_areas)
|
| 150 |
+
area_cv = math.sqrt(var) / mean_visible_area
|
| 151 |
+
|
| 152 |
+
return {
|
| 153 |
+
"visible_frames": len(visible),
|
| 154 |
+
"visible_ratio": len(visible) / frames,
|
| 155 |
+
"first_visible": first_visible,
|
| 156 |
+
"late_target": first_visible is not None and first_visible > 0.5 * frames,
|
| 157 |
+
"mean_visible_area": mean_visible_area,
|
| 158 |
+
"mean_all_area": mean_all_area,
|
| 159 |
+
"small_target": mean_visible_area > 0 and mean_visible_area < small_area,
|
| 160 |
+
"partial_target": 0 < len(visible) < 0.5 * frames,
|
| 161 |
+
"area_cv": area_cv,
|
| 162 |
+
"area_unstable": area_cv >= 1.0,
|
| 163 |
+
"missing_masks": missing,
|
| 164 |
+
"width": width,
|
| 165 |
+
"height": height,
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def pct(num: int, den: int) -> float:
|
| 170 |
+
return 0.0 if den == 0 else 100.0 * num / den
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def summarize_counts(values: List[int]) -> dict:
|
| 174 |
+
if not values:
|
| 175 |
+
return {"mean": 0, "median": 0, "max": 0, "ge2": 0, "ge3": 0}
|
| 176 |
+
return {
|
| 177 |
+
"mean": mean(values),
|
| 178 |
+
"median": median(values),
|
| 179 |
+
"max": max(values),
|
| 180 |
+
"ge2": sum(v >= 2 for v in values),
|
| 181 |
+
"ge3": sum(v >= 3 for v in values),
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def write_csv(path: Path, rows: List[dict], fieldnames: List[str]) -> None:
|
| 186 |
+
with path.open("w", newline="") as f:
|
| 187 |
+
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
| 188 |
+
writer.writeheader()
|
| 189 |
+
for row in rows:
|
| 190 |
+
writer.writerow({k: row.get(k, "") for k in fieldnames})
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def main() -> None:
|
| 194 |
+
args = parse_args()
|
| 195 |
+
data_dir = args.data_dir
|
| 196 |
+
out_dir = args.out_dir
|
| 197 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 198 |
+
|
| 199 |
+
rows = read_metadata(data_dir / "metadata.csv")
|
| 200 |
+
if args.mask_sample_limit > 0:
|
| 201 |
+
mask_rows = rows[: args.mask_sample_limit]
|
| 202 |
+
else:
|
| 203 |
+
mask_rows = rows
|
| 204 |
+
|
| 205 |
+
by_split = Counter(row["split"] for row in rows)
|
| 206 |
+
by_video: Dict[str, List[dict]] = defaultdict(list)
|
| 207 |
+
by_object: Dict[Tuple[str, str], List[dict]] = defaultdict(list)
|
| 208 |
+
by_video_category: Dict[Tuple[str, str], set] = defaultdict(set)
|
| 209 |
+
|
| 210 |
+
enriched: List[dict] = []
|
| 211 |
+
for row in rows:
|
| 212 |
+
vid = video_id(row)
|
| 213 |
+
fid = fid_value(row)
|
| 214 |
+
category = category_from_uid(row)
|
| 215 |
+
expr = row.get("exp", "")
|
| 216 |
+
row2 = dict(row)
|
| 217 |
+
row2["vid"] = vid
|
| 218 |
+
row2["fid"] = fid
|
| 219 |
+
row2["category"] = category
|
| 220 |
+
row2["is_null_split"] = row.get("split") == "test_n"
|
| 221 |
+
row2["is_audio_keyword"] = has_any(expr, AUDIO_KEYWORDS)
|
| 222 |
+
row2["is_spatial_keyword"] = has_any(expr, SPATIAL_KEYWORDS)
|
| 223 |
+
by_video[vid].append(row2)
|
| 224 |
+
by_object[(vid, fid)].append(row2)
|
| 225 |
+
by_video_category[(vid, category)].add(fid)
|
| 226 |
+
enriched.append(row2)
|
| 227 |
+
|
| 228 |
+
mask_stats_by_uid: Dict[str, dict] = {}
|
| 229 |
+
for row in mask_rows:
|
| 230 |
+
uid = row["uid"]
|
| 231 |
+
mask_stats_by_uid[uid] = row_mask_stats(data_dir, row, args.frames, args.small_area)
|
| 232 |
+
|
| 233 |
+
sample_rows: List[dict] = []
|
| 234 |
+
for row in enriched:
|
| 235 |
+
stats = mask_stats_by_uid.get(row["uid"], {})
|
| 236 |
+
same_cat_fids = by_video_category[(row["vid"], row["category"])]
|
| 237 |
+
row2 = dict(row)
|
| 238 |
+
row2.update(stats)
|
| 239 |
+
row2["same_category_distractor_heuristic"] = len(same_cat_fids) >= 2
|
| 240 |
+
row2["multi_expr_video"] = len(by_video[row["vid"]]) >= 2
|
| 241 |
+
row2["multi_expr_object"] = len(by_object[(row["vid"], row["fid"])]) >= 2
|
| 242 |
+
row2["h3_candidate"] = row2["multi_expr_object"] and not row2["is_null_split"]
|
| 243 |
+
sample_rows.append(row2)
|
| 244 |
+
|
| 245 |
+
video_expr_counts = [len(v) for v in by_video.values()]
|
| 246 |
+
object_expr_counts = [len(v) for v in by_object.values()]
|
| 247 |
+
h3_objects = [k for k, v in by_object.items() if len(v) >= 2 and v[0]["split"] != "test_n"]
|
| 248 |
+
null_rows = [r for r in enriched if r["is_null_split"]]
|
| 249 |
+
audio_rows = [r for r in enriched if r["is_audio_keyword"]]
|
| 250 |
+
spatial_rows = [r for r in enriched if r["is_spatial_keyword"]]
|
| 251 |
+
same_cat_rows = [r for r in sample_rows if r.get("same_category_distractor_heuristic")]
|
| 252 |
+
|
| 253 |
+
audited_mask_rows = [r for r in sample_rows if "visible_ratio" in r]
|
| 254 |
+
late_rows = [r for r in audited_mask_rows if r.get("late_target")]
|
| 255 |
+
small_rows = [r for r in audited_mask_rows if r.get("small_target")]
|
| 256 |
+
partial_rows = [r for r in audited_mask_rows if r.get("partial_target")]
|
| 257 |
+
unstable_rows = [r for r in audited_mask_rows if r.get("area_unstable")]
|
| 258 |
+
|
| 259 |
+
summary = {
|
| 260 |
+
"data_dir": str(data_dir),
|
| 261 |
+
"num_expressions": len(rows),
|
| 262 |
+
"num_videos": len(by_video),
|
| 263 |
+
"num_objects_vid_fid": len(by_object),
|
| 264 |
+
"splits": dict(by_split),
|
| 265 |
+
"expressions_per_video": summarize_counts(video_expr_counts),
|
| 266 |
+
"expressions_per_object": summarize_counts(object_expr_counts),
|
| 267 |
+
"multi_expression_videos": sum(c >= 2 for c in video_expr_counts),
|
| 268 |
+
"multi_expression_objects": sum(c >= 2 for c in object_expr_counts),
|
| 269 |
+
"h3_candidate_objects": len(h3_objects),
|
| 270 |
+
"h3_candidate_expressions": sum(len(by_object[k]) for k in h3_objects),
|
| 271 |
+
"null_split_expressions": len(null_rows),
|
| 272 |
+
"null_split_percent": pct(len(null_rows), len(rows)),
|
| 273 |
+
"audio_keyword_expressions": len(audio_rows),
|
| 274 |
+
"audio_keyword_percent": pct(len(audio_rows), len(rows)),
|
| 275 |
+
"spatial_keyword_expressions": len(spatial_rows),
|
| 276 |
+
"spatial_keyword_percent": pct(len(spatial_rows), len(rows)),
|
| 277 |
+
"same_category_distractor_heuristic_expressions": len(same_cat_rows),
|
| 278 |
+
"same_category_distractor_heuristic_percent": pct(len(same_cat_rows), len(rows)),
|
| 279 |
+
"mask_rows_audited": len(audited_mask_rows),
|
| 280 |
+
"late_target_expressions": len(late_rows),
|
| 281 |
+
"small_target_expressions": len(small_rows),
|
| 282 |
+
"partial_target_expressions": len(partial_rows),
|
| 283 |
+
"area_unstable_expressions": len(unstable_rows),
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
with (out_dir / "audit_summary.json").open("w") as f:
|
| 287 |
+
json.dump(summary, f, indent=2, sort_keys=True)
|
| 288 |
+
|
| 289 |
+
fields = [
|
| 290 |
+
"uid",
|
| 291 |
+
"vid",
|
| 292 |
+
"split",
|
| 293 |
+
"fid",
|
| 294 |
+
"category",
|
| 295 |
+
"exp",
|
| 296 |
+
"is_null_split",
|
| 297 |
+
"is_audio_keyword",
|
| 298 |
+
"is_spatial_keyword",
|
| 299 |
+
"multi_expr_video",
|
| 300 |
+
"multi_expr_object",
|
| 301 |
+
"h3_candidate",
|
| 302 |
+
"same_category_distractor_heuristic",
|
| 303 |
+
"visible_frames",
|
| 304 |
+
"visible_ratio",
|
| 305 |
+
"first_visible",
|
| 306 |
+
"late_target",
|
| 307 |
+
"mean_visible_area",
|
| 308 |
+
"mean_all_area",
|
| 309 |
+
"small_target",
|
| 310 |
+
"partial_target",
|
| 311 |
+
"area_cv",
|
| 312 |
+
"area_unstable",
|
| 313 |
+
"missing_masks",
|
| 314 |
+
"width",
|
| 315 |
+
"height",
|
| 316 |
+
]
|
| 317 |
+
write_csv(out_dir / "audit_samples.csv", sample_rows, fields)
|
| 318 |
+
|
| 319 |
+
h3_rows = [r for r in sample_rows if r.get("h3_candidate")]
|
| 320 |
+
write_csv(out_dir / "h3_candidates.csv", h3_rows, fields)
|
| 321 |
+
|
| 322 |
+
md = [
|
| 323 |
+
"# TubeToken Phase -1 Audit",
|
| 324 |
+
"",
|
| 325 |
+
f"- Expressions: {summary['num_expressions']}",
|
| 326 |
+
f"- Videos: {summary['num_videos']}",
|
| 327 |
+
f"- Objects `(vid, fid)`: {summary['num_objects_vid_fid']}",
|
| 328 |
+
f"- Splits: `{dict(by_split)}`",
|
| 329 |
+
"",
|
| 330 |
+
"## Multi-expression",
|
| 331 |
+
"",
|
| 332 |
+
f"- Expressions/video mean: {summary['expressions_per_video']['mean']:.3f}",
|
| 333 |
+
f"- Expressions/video median: {summary['expressions_per_video']['median']}",
|
| 334 |
+
f"- Videos with >=2 expressions: {summary['multi_expression_videos']}",
|
| 335 |
+
f"- Expressions/object mean: {summary['expressions_per_object']['mean']:.3f}",
|
| 336 |
+
f"- Objects with >=2 expressions: {summary['multi_expression_objects']}",
|
| 337 |
+
f"- H3 candidate objects: {summary['h3_candidate_objects']}",
|
| 338 |
+
f"- H3 candidate expressions: {summary['h3_candidate_expressions']}",
|
| 339 |
+
"",
|
| 340 |
+
"## Diagnostic Subsets",
|
| 341 |
+
"",
|
| 342 |
+
f"- Null split expressions: {summary['null_split_expressions']} ({summary['null_split_percent']:.2f}%)",
|
| 343 |
+
f"- Audio-keyword expressions: {summary['audio_keyword_expressions']} ({summary['audio_keyword_percent']:.2f}%)",
|
| 344 |
+
f"- Spatial-keyword expressions: {summary['spatial_keyword_expressions']} ({summary['spatial_keyword_percent']:.2f}%)",
|
| 345 |
+
f"- Same-category distractor heuristic expressions: {summary['same_category_distractor_heuristic_expressions']} ({summary['same_category_distractor_heuristic_percent']:.2f}%)",
|
| 346 |
+
f"- Mask rows audited: {summary['mask_rows_audited']}",
|
| 347 |
+
f"- Late-target expressions: {summary['late_target_expressions']}",
|
| 348 |
+
f"- Small-target expressions: {summary['small_target_expressions']}",
|
| 349 |
+
f"- Partial-target expressions: {summary['partial_target_expressions']}",
|
| 350 |
+
f"- Area-unstable expressions: {summary['area_unstable_expressions']}",
|
| 351 |
+
"",
|
| 352 |
+
"## Phase -1 H3 Decision Hint",
|
| 353 |
+
"",
|
| 354 |
+
]
|
| 355 |
+
epv = summary["expressions_per_video"]["mean"]
|
| 356 |
+
if epv > 1.5 and summary["h3_candidate_objects"] > 0:
|
| 357 |
+
md.append("H3 can stay as a direct validation target: the data has multi-expression structure.")
|
| 358 |
+
elif summary["h3_candidate_objects"] > 0:
|
| 359 |
+
md.append("H3 should be treated as diagnostic: multi-expression objects exist, but average expressions/video is limited.")
|
| 360 |
+
else:
|
| 361 |
+
md.append("H3 should be downgraded: this audit did not find same-object multi-expression candidates.")
|
| 362 |
+
md.append("")
|
| 363 |
+
md.append("Generated files: `audit_summary.json`, `audit_samples.csv`, `h3_candidates.csv`.")
|
| 364 |
+
|
| 365 |
+
(out_dir / "audit_report.md").write_text("\n".join(md) + "\n")
|
| 366 |
+
print(json.dumps(summary, indent=2, sort_keys=True))
|
| 367 |
+
print(f"\nWrote audit files to: {out_dir}")
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
if __name__ == "__main__":
|
| 371 |
+
main()
|
tools/tubetoken/__pycache__/evaluate_oracle_refine_sam2.cpython-312.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
tools/tubetoken/evaluate_oracle_refine_sam2.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate bbox-only SAM2 refinement for oracle proposal tubes."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import csv
|
| 8 |
+
import sys
|
| 9 |
+
from collections import defaultdict
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
from PIL import Image
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
|
| 18 |
+
from phase0_common import (
|
| 19 |
+
bbox_from_mask,
|
| 20 |
+
bool_field,
|
| 21 |
+
evaluate_tube_jf,
|
| 22 |
+
load_audit_rows,
|
| 23 |
+
load_gt_tube,
|
| 24 |
+
read_metadata,
|
| 25 |
+
fid_value,
|
| 26 |
+
video_id,
|
| 27 |
+
write_json,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for the refinement evaluation.

    Required paths: dataset root, proposal tube directory, phase-0 sample CSV,
    output directory, and the SAM2 checkpoint. Everything else has a default.
    """
    parser = argparse.ArgumentParser(description="Evaluate oracle proposal bbox-only SAM2 refinement.")
    # Register arguments data-driven so flags, types, and defaults sit side by side.
    arg_specs = (
        ("--data_dir", dict(type=Path, required=True)),
        ("--proposal_dir", dict(type=Path, required=True)),
        ("--phase0_samples", dict(type=Path, required=True)),
        ("--out_dir", dict(type=Path, required=True)),
        ("--audit_csv", dict(type=Path, default=None)),
        ("--splits", dict(type=str, default="test_s,test_u")),
        ("--sam2_repo", dict(type=Path, default=None)),
        ("--model_cfg", dict(type=str, default="configs/sam2.1/sam2.1_hiera_l.yaml")),
        ("--checkpoint", dict(type=Path, required=True)),
        ("--device", dict(type=str, default="cuda")),
        ("--frames", dict(type=int, default=10)),
        ("--limit_samples", dict(type=int, default=0)),
    )
    for flag, kwargs in arg_specs:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def import_sam2(repo: Optional[Path]):
    """Import the SAM2 builder and image predictor, optionally from a source checkout.

    When *repo* is given, it is prepended to ``sys.path`` so that a local SAM2
    checkout takes precedence over any installed ``sam2`` package. The imports
    are deferred to call time on purpose: the dependency is only needed (and
    may only be resolvable) after the path injection.

    Returns:
        The ``(build_sam2, SAM2ImagePredictor)`` pair used by ``main``.
    """
    if repo is not None:
        # Insert at position 0 so the checkout shadows a pip-installed sam2.
        sys.path.insert(0, str(repo))
    from sam2.build_sam import build_sam2
    from sam2.sam2_image_predictor import SAM2ImagePredictor

    return build_sam2, SAM2ImagePredictor
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def load_rgb(path: Path) -> np.ndarray:
    """Decode the image at *path* into an RGB uint8 array of shape (H, W, 3).

    The conversion to "RGB" normalizes grayscale/palette/alpha inputs; the
    context manager guarantees the file handle is released promptly.
    """
    with Image.open(path) as handle:
        rgb_image = handle.convert("RGB")
        return np.array(rgb_image)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def load_phase0_samples(path: Path) -> Dict[str, dict]:
    """Read the phase-0 samples CSV and index its rows by the ``uid`` column.

    If a uid appears more than once, the last row wins (same as the original
    dict-comprehension behavior).
    """
    samples: Dict[str, dict] = {}
    with path.open("r", newline="") as handle:
        for record in csv.DictReader(handle):
            samples[record["uid"]] = record
    return samples
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def nearest_box(tube: np.ndarray, t: int) -> Optional[np.ndarray]:
    """Return the bounding box of the non-empty tube frame closest in time to *t*.

    Scans every frame of *tube* (a (T, H, W) mask stack — presumably boolean;
    confirm against callers), keeping the box whose frame index minimizes
    ``abs(idx - t)``. Ties resolve to the earlier frame, matching ``min`` over
    an in-order list. Returns None when every frame is empty.
    """
    best_box: Optional[np.ndarray] = None
    best_dist: Optional[int] = None
    for frame_idx in range(tube.shape[0]):
        raw_box = bbox_from_mask(tube[frame_idx])
        if raw_box is None:
            continue
        dist = abs(frame_idx - t)
        # Strict '<' keeps the first (earliest) frame on equal distance.
        if best_dist is None or dist < best_dist:
            best_dist = dist
            best_box = np.array(raw_box, dtype=np.float32)
    return best_box
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def predict_box_mask(predictor, image: np.ndarray, box: np.ndarray) -> np.ndarray:
    """Run a single bbox-prompted SAM2 prediction and return one boolean (H, W) mask.

    Args:
        predictor: A SAM2ImagePredictor-like object exposing ``set_image`` and
            ``predict``.
        image: RGB frame array passed straight to ``predictor.set_image``.
        box: XYXY box prompt forwarded to ``predictor.predict``.

    Returns:
        Boolean mask thresholded at zero (SAM2 returns logits/float masks).
    """
    predictor.set_image(image)
    # Scores and low-res logits are irrelevant for bbox-only refinement.
    masks, _, _ = predictor.predict(box=box, multimask_output=False)
    masks = np.asarray(masks)
    # The predictor may return (B, N, H, W) or (N, H, W) depending on the
    # prompt batching; peel leading axes until a single (H, W) mask remains.
    while masks.ndim > 2:
        masks = masks[0]
    return masks > 0
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def sample_subsets(row: dict, audit: Dict[str, dict]) -> List[str]:
    """Return the subset labels this sample belongs to.

    Every sample lands in "all" and in its split; additional labels come from
    boolean flags on the matching audit row (if any). Order follows the flag
    table below, matching the original append order.
    """
    subsets = ["all", row["split"]]
    audit_row = audit.get(row["uid"])
    flag_table = (
        ("small_target", "small"),
        ("partial_target", "partial"),
        ("area_unstable", "area_unstable"),
        ("late_target", "late_target"),
        ("is_audio_keyword", "audio_keyword"),
        ("is_spatial_keyword", "spatial_keyword"),
        ("same_category_distractor_heuristic", "same_category"),
        ("h3_candidate", "h3_candidate"),
    )
    subsets.extend(label for flag, label in flag_table if bool_field(audit_row, flag))
    return subsets
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def empty_bucket() -> dict:
    """Fresh accumulator for per-subset metric sums; used as a defaultdict factory.

    Key order (count first, then the three metrics) is preserved because the
    buckets are serialized downstream.
    """
    bucket = {"count": 0}
    for metric in ("refined_j", "refined_f", "refined_jf"):
        bucket[metric] = 0.0
    return bucket
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def add_bucket(bucket: dict, sample: dict) -> None:
    """Fold one sample's refined metrics into *bucket* in place.

    Increments the sample count and accumulates the three J/F sums; means are
    computed later by ``finalize``.
    """
    bucket["count"] += 1
    for metric in ("refined_j", "refined_f", "refined_jf"):
        bucket[metric] += sample[metric]
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def finalize(bucket: dict) -> dict:
    """Return a copy of *bucket* with the accumulated metric sums averaged.

    Leaves the bucket untouched when no samples were counted (avoids division
    by zero and keeps the zero sums as-is).
    """
    result = dict(bucket)
    n = result["count"]
    if n:
        result["refined_j"] = result["refined_j"] / n
        result["refined_f"] = result["refined_f"] / n
        result["refined_jf"] = result["refined_jf"] / n
    return result
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def main() -> None:
    """Evaluate bbox-only SAM2 refinement of oracle proposal tubes.

    Pipeline per sample: look up the oracle best proposal index from the
    phase-0 CSV, load that proposal tube, refine each frame's mask with a
    bbox-prompted SAM2 prediction, score the refined tube against the ground
    truth, and aggregate J/F metrics per subset. Writes ``sample_metrics.csv``,
    ``summary.json``, and ``report.md`` into ``--out_dir``.
    """
    args = parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Deferred SAM2 import so --sam2_repo can inject a source checkout first.
    build_sam2, SAM2ImagePredictor = import_sam2(args.sam2_repo)
    model = build_sam2(args.model_cfg, str(args.checkpoint), device=args.device)
    predictor = SAM2ImagePredictor(model)

    splits = [s.strip() for s in args.splits.split(",") if s.strip()]
    rows = read_metadata(args.data_dir, splits)
    if args.limit_samples:
        rows = rows[: args.limit_samples]
    # NOTE(review): rows missing from the phase-0 CSV raise KeyError below —
    # presumably phase-0 covers every metadata row; confirm upstream.
    phase0_samples = load_phase0_samples(args.phase0_samples)
    audit = load_audit_rows(args.audit_csv) if args.audit_csv else {}

    out_rows: List[dict] = []
    summary = defaultdict(empty_bucket)

    for row in tqdm(rows, desc="Oracle bbox-only SAM2 refinement"):
        phase0 = phase0_samples[row["uid"]]
        best_idx = int(phase0["best_idx"])
        if best_idx < 0:
            # Negative best_idx marks samples with no usable oracle proposal.
            continue
        vid = video_id(row)
        proposals = np.load(args.proposal_dir / f"{vid}.npz")["masks"].astype(bool)
        if best_idx >= proposals.shape[0]:
            # Stale index relative to this proposal file — skip defensively.
            continue
        oracle_tube = proposals[best_idx]
        refined_masks = []

        # bfloat16 autocast only when running on CUDA; inference_mode disables
        # autograd tracking for the whole refinement loop.
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16, enabled=args.device.startswith("cuda")):
            for t in range(args.frames):
                box = bbox_from_mask(oracle_tube[t])
                if box is None:
                    # Empty frame: borrow the box from the temporally nearest
                    # non-empty frame of the same tube.
                    box = nearest_box(oracle_tube, t)
                if box is None:
                    # Whole tube is empty — emit an all-false mask.
                    refined_masks.append(np.zeros_like(oracle_tube[t], dtype=bool))
                    continue
                image = load_rgb(args.data_dir / "media" / vid / "frames" / f"{t}.jpg")
                refined_masks.append(predict_box_mask(predictor, image, np.asarray(box, dtype=np.float32)))

        refined_tube = np.stack(refined_masks, axis=0)
        gt = load_gt_tube(args.data_dir, vid, fid_value(row), args.frames)
        j, f, jf = evaluate_tube_jf(refined_tube, gt)
        sample = {
            "uid": row["uid"],
            "vid": vid,
            "split": row["split"],
            "fid": fid_value(row),
            "best_idx": best_idx,
            "refined_j": j,
            "refined_f": f,
            "refined_jf": jf,
        }
        out_rows.append(sample)
        # A sample contributes to every subset it belongs to ("all", its
        # split, plus any audit flags).
        for subset in sample_subsets(row, audit):
            add_bucket(summary[subset], sample)

    # Per-sample metrics CSV (header only when at least one sample survived).
    with (args.out_dir / "sample_metrics.csv").open("w", newline="") as f:
        fieldnames = list(out_rows[0].keys()) if out_rows else []
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(out_rows)

    final_summary = {name: finalize(bucket) for name, bucket in sorted(summary.items())}
    write_json(args.out_dir / "summary.json", final_summary)

    # Human-readable report: one line per non-empty subset.
    md = ["# TubeToken Phase 0 Oracle Refined Evaluation", ""]
    for name, metrics in final_summary.items():
        if metrics["count"] == 0:
            continue
        md.append(f"- {name}: n={metrics['count']}, Refined J&F={metrics['refined_jf']:.4f}")
    (args.out_dir / "report.md").write_text("\n".join(md) + "\n")
    print("\n".join(md))
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# Script entry point: run the evaluation only when executed directly.
if __name__ == "__main__":
    main()
utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (198 Bytes). View file
|
|
|
utils/metric/__pycache__/pyutils.cpython-310.pyc
ADDED
|
Binary file (5.4 kB). View file
|
|
|
utils/metric/__pycache__/utility.cpython-310.pyc
ADDED
|
Binary file (2.94 kB). View file
|
|
|