Anonymous Authors committed on
Commit
343cdde
·
1 Parent(s): f14833d

Rename to ViTeX-Edit-14B in submissions and README

Browse files

submissions.jsonl was regenerated from the Benchmark-code repo so that the
eleven pre-populated baselines use the new method labels
(ViTeX-Edit-14B and ViTeX-Edit-14B (Composite)); the regenerated file also
adds the Identity (sanity) row. The README parenthetical is switched to the
new name as well. The Model & Inference code URL remains
huggingface.co/ViTeX-Bench/ViTeX-14B.

Files changed (2) hide show
  1. README.md +1 -1
  2. submissions.jsonl +3 -2
README.md CHANGED
@@ -34,4 +34,4 @@ The full thirteen-metric vector is the unit of report. The table is sorted by **
34
  - 🌐 **Project page:** https://vitex-bench.github.io/
35
  - 📊 **Dataset:** https://huggingface.co/datasets/ViTeX-Bench/ViTeX-Dataset
36
  - 🧪 **Benchmark code:** https://huggingface.co/ViTeX-Bench/ViTeX-Bench
37
- - 🤖 **Model & Inference code** (ViTeX-14B): https://huggingface.co/ViTeX-Bench/ViTeX-14B
 
34
  - 🌐 **Project page:** https://vitex-bench.github.io/
35
  - 📊 **Dataset:** https://huggingface.co/datasets/ViTeX-Bench/ViTeX-Dataset
36
  - 🧪 **Benchmark code:** https://huggingface.co/ViTeX-Bench/ViTeX-Bench
37
+ - 🤖 **Model & Inference code** (ViTeX-Edit-14B): https://huggingface.co/ViTeX-Bench/ViTeX-14B
submissions.jsonl CHANGED
@@ -1,10 +1,11 @@
1
  {"method": "TextCtrl", "family": "A — per-frame image editor", "organization": "Zeng et al., 2024", "paper_url": "https://arxiv.org/abs/2410.10133", "code_url": "https://github.com/weichaozeng/TextCtrl", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5623872104295862, "SeqAcc": 0.47475474732914913, "CharAcc": 0.733502509720617, "TTS": 0.5107817672969821, "Flicker_full": 3.8040257787170293, "Flicker_crop": 4.287049075959627, "Warp_full": 1.5883564410079873, "Warp_crop": 2.087607682354121, "MUSIQ_full": 70.32216657276115, "MUSIQ_crop": 42.77286880553454, "PSNR_loc": 41.143448625451185, "SSIM_loc": 0.9944056776770688, "LPIPS_loc": 0.007969770003940647, "DreamSim_loc": 0.0042883308893049855, "n_clips": 157}
2
- {"method": "ViTeX-14B (Composite)", "family": "Reference", "organization": "Anonymous (NeurIPS 2026 D&B submission)", "paper_url": "", "code_url": "https://huggingface.co/ViTeX-Bench/ViTeX-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5409733932921607, "SeqAcc": 0.34489601685376864, "CharAcc": 0.6892114595238374, "TTS": 0.6660196589960885, "Flicker_full": 3.730421558994263, "Flicker_crop": 3.826727295895001, "Warp_full": 1.5060892347506618, "Warp_crop": 1.5591366257464094, "MUSIQ_full": 70.27118598215141, "MUSIQ_crop": 44.944762801223376, "PSNR_loc": 42.950774276028774, "SSIM_loc": 0.9925085173386224, "LPIPS_loc": 0.005916571278483934, "DreamSim_loc": 0.0023257043362929306, "n_clips": 157}
3
- {"method": "ViTeX-14B", "family": "Reference", "organization": "Anonymous (NeurIPS 2026 D&B submission)", "paper_url": "", "code_url": "https://huggingface.co/ViTeX-Bench/ViTeX-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5337670099905598, "SeqAcc": 0.34121246792876553, "CharAcc": 0.6879770723988642, "TTS": 0.6478229475144797, "Flicker_full": 3.2739301912212606, "Flicker_crop": 3.424670762474799, "Warp_full": 1.5515188207705402, "Warp_crop": 1.5304199630586097, "MUSIQ_full": 69.63500067777694, "MUSIQ_crop": 43.52961571422055, "PSNR_loc": 29.077432591849323, "SSIM_loc": 0.9512201399006257, "LPIPS_loc": 0.06030903690911814, "DreamSim_loc": 0.023522706465862867, "n_clips": 157}
4
  {"method": "VideoPainter", "family": "C — mask-conditioned video inpainting", "organization": "Bian et al., 2025", "paper_url": "https://arxiv.org/abs/2503.05639", "code_url": "https://github.com/TencentARC/VideoPainter", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.51506756458757, "SeqAcc": 0.364495972867382, "CharAcc": 0.6187952902754302, "TTS": 0.6058329243574021, "Flicker_full": 2.383399970485585, "Flicker_crop": 2.619418716169186, "Warp_full": 2.9276182078188366, "Warp_crop": 3.3452600061138558, "MUSIQ_full": 67.16001260609637, "MUSIQ_crop": 40.58771384010968, "PSNR_loc": 28.555957743164843, "SSIM_loc": 0.9151628155450829, "LPIPS_loc": 0.10402342236567201, "DreamSim_loc": 0.023908750937496278, "n_clips": 157}
5
  {"method": "FLUX-Text", "family": "A — per-frame image editor", "organization": "Chen et al., 2025", "paper_url": "https://arxiv.org/abs/2505.03329", "code_url": "https://github.com/AMAP-ML/FluxText", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5022999045945803, "SeqAcc": 0.5283744349135131, "CharAcc": 0.7367738685630717, "TTS": 0.32554668434702094, "Flicker_full": 5.114334507312302, "Flicker_crop": 14.81406893996351, "Warp_full": 3.0267581734528144, "Warp_crop": 13.009849474748862, "MUSIQ_full": 70.25921666161523, "MUSIQ_crop": 43.85439727157533, "PSNR_loc": 31.488873457756767, "SSIM_loc": 0.974685608615182, "LPIPS_loc": 0.028573400793733536, "DreamSim_loc": 0.012038936603600812, "n_clips": 157}
6
  {"method": "RS-STE", "family": "A — per-frame image editor", "organization": "Zhao et al., 2025", "paper_url": "https://arxiv.org/abs/2503.17774", "code_url": "https://github.com/honglei-zhao/RS-STE", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.4907994847015915, "SeqAcc": 0.3539735290248826, "CharAcc": 0.6258597181299173, "TTS": 0.5336598236730271, "Flicker_full": 3.728183996539611, "Flicker_crop": 3.66286053942277, "Warp_full": 1.6050723235894067, "Warp_crop": 1.8147908492065754, "MUSIQ_full": 69.57172569297175, "MUSIQ_crop": 34.26484699258097, "PSNR_loc": 37.00242438437832, "SSIM_loc": 0.9830883838061237, "LPIPS_loc": 0.02354780357549038, "DreamSim_loc": 0.007322213357421243, "n_clips": 157}
7
  {"method": "AnyText2", "family": "A — per-frame image editor", "organization": "Tuo et al., 2024", "paper_url": "https://arxiv.org/abs/2411.15245", "code_url": "https://github.com/tyxsspa/AnyText2", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.4074467792171563, "SeqAcc": 0.27973291436891057, "CharAcc": 0.6332210144114335, "TTS": 0.38186844987155294, "Flicker_full": 3.3398760175596904, "Flicker_crop": 4.9545532378085495, "Warp_full": 2.0425352598531266, "Warp_crop": 3.9516436691464083, "MUSIQ_full": 66.67552189796594, "MUSIQ_crop": 41.65317273156116, "PSNR_loc": 25.55582150532182, "SSIM_loc": 0.9047352040975998, "LPIPS_loc": 0.09148503486230188, "DreamSim_loc": 0.0430956823557552, "n_clips": 157}
8
  {"method": "TextCtrl + AnyV2V", "family": "B — first-frame + I2V propagation", "organization": "Composite of Zeng 2024 + Ku 2024", "paper_url": "", "code_url": "", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.16492592568061926, "SeqAcc": 0.056786779447094926, "CharAcc": 0.30779952787986514, "TTS": 0.2566561069340611, "Flicker_full": 4.9802852382134075, "Flicker_crop": 4.977045501548225, "Warp_full": 4.107769233013561, "Warp_crop": 3.9674498647429868, "MUSIQ_full": 69.41088303841349, "MUSIQ_crop": 33.85112622206451, "PSNR_loc": 21.084345629665755, "SSIM_loc": 0.7846697544754223, "LPIPS_loc": 0.2249758477633198, "DreamSim_loc": 0.07321823651876672, "n_clips": 157}
 
9
  {"method": "Wan2.1-VACE-14B", "family": "C — mask-conditioned video inpainting", "organization": "Wan-AI, 2025", "paper_url": "https://arxiv.org/abs/2503.07598", "code_url": "https://huggingface.co/Wan-AI/Wan2.1-VACE-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.0, "SeqAcc": 0.0, "CharAcc": 0.29842756354252414, "TTS": 0.6894925190282424, "Flicker_full": 3.777380530928976, "Flicker_crop": 3.841096694996713, "Warp_full": 1.6876533062098615, "Warp_crop": 1.5609657061512832, "MUSIQ_full": 70.53707020378923, "MUSIQ_crop": 45.256712742544, "PSNR_loc": 35.21163969961195, "SSIM_loc": 0.9761949368626082, "LPIPS_loc": 0.021842662243107273, "DreamSim_loc": 0.007056991990078281, "n_clips": 157}
10
  {"method": "Kling Video 3.0 Omni", "family": "D — instruction-guided V2V", "organization": "Kuaishou (closed)", "paper_url": "", "code_url": "", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.0, "SeqAcc": 0.0, "CharAcc": 0.20752862556378657, "TTS": 0.6408892575809407, "Flicker_full": 4.247679066116782, "Flicker_crop": 4.081252510324976, "Warp_full": 3.1189079573144176, "Warp_crop": 2.902087520422552, "MUSIQ_full": 72.23268125973436, "MUSIQ_crop": 47.745845725613485, "PSNR_loc": 21.181631575850673, "SSIM_loc": 0.843030764793486, "LPIPS_loc": 0.17594784468030775, "DreamSim_loc": 0.060776721627595835, "n_clips": 157}
 
1
  {"method": "TextCtrl", "family": "A — per-frame image editor", "organization": "Zeng et al., 2024", "paper_url": "https://arxiv.org/abs/2410.10133", "code_url": "https://github.com/weichaozeng/TextCtrl", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5623872104295862, "SeqAcc": 0.47475474732914913, "CharAcc": 0.733502509720617, "TTS": 0.5107817672969821, "Flicker_full": 3.8040257787170293, "Flicker_crop": 4.287049075959627, "Warp_full": 1.5883564410079873, "Warp_crop": 2.087607682354121, "MUSIQ_full": 70.32216657276115, "MUSIQ_crop": 42.77286880553454, "PSNR_loc": 41.143448625451185, "SSIM_loc": 0.9944056776770688, "LPIPS_loc": 0.007969770003940647, "DreamSim_loc": 0.0042883308893049855, "n_clips": 157}
2
+ {"method": "ViTeX-Edit-14B (Composite)", "family": "Reference", "organization": "Anonymous (NeurIPS 2026 D&B submission)", "paper_url": "", "code_url": "https://huggingface.co/ViTeX-Bench/ViTeX-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5409733932921607, "SeqAcc": 0.34489601685376864, "CharAcc": 0.6892114595238374, "TTS": 0.6660196589960885, "Flicker_full": 3.730421558994263, "Flicker_crop": 3.826727295895001, "Warp_full": 1.5060892347506618, "Warp_crop": 1.5591366257464094, "MUSIQ_full": 70.27118598215141, "MUSIQ_crop": 44.944762801223376, "PSNR_loc": 42.950774276028774, "SSIM_loc": 0.9925085173386224, "LPIPS_loc": 0.005916571278483934, "DreamSim_loc": 0.0023257043362929306, "n_clips": 157}
3
+ {"method": "ViTeX-Edit-14B", "family": "Reference", "organization": "Anonymous (NeurIPS 2026 D&B submission)", "paper_url": "", "code_url": "https://huggingface.co/ViTeX-Bench/ViTeX-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5337670099905598, "SeqAcc": 0.34121246792876553, "CharAcc": 0.6879770723988642, "TTS": 0.6478229475144797, "Flicker_full": 3.2739301912212606, "Flicker_crop": 3.424670762474799, "Warp_full": 1.5515188207705402, "Warp_crop": 1.5304199630586097, "MUSIQ_full": 69.63500067777694, "MUSIQ_crop": 43.52961571422055, "PSNR_loc": 29.077432591849323, "SSIM_loc": 0.9512201399006257, "LPIPS_loc": 0.06030903690911814, "DreamSim_loc": 0.023522706465862867, "n_clips": 157}
4
  {"method": "VideoPainter", "family": "C — mask-conditioned video inpainting", "organization": "Bian et al., 2025", "paper_url": "https://arxiv.org/abs/2503.05639", "code_url": "https://github.com/TencentARC/VideoPainter", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.51506756458757, "SeqAcc": 0.364495972867382, "CharAcc": 0.6187952902754302, "TTS": 0.6058329243574021, "Flicker_full": 2.383399970485585, "Flicker_crop": 2.619418716169186, "Warp_full": 2.9276182078188366, "Warp_crop": 3.3452600061138558, "MUSIQ_full": 67.16001260609637, "MUSIQ_crop": 40.58771384010968, "PSNR_loc": 28.555957743164843, "SSIM_loc": 0.9151628155450829, "LPIPS_loc": 0.10402342236567201, "DreamSim_loc": 0.023908750937496278, "n_clips": 157}
5
  {"method": "FLUX-Text", "family": "A — per-frame image editor", "organization": "Chen et al., 2025", "paper_url": "https://arxiv.org/abs/2505.03329", "code_url": "https://github.com/AMAP-ML/FluxText", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.5022999045945803, "SeqAcc": 0.5283744349135131, "CharAcc": 0.7367738685630717, "TTS": 0.32554668434702094, "Flicker_full": 5.114334507312302, "Flicker_crop": 14.81406893996351, "Warp_full": 3.0267581734528144, "Warp_crop": 13.009849474748862, "MUSIQ_full": 70.25921666161523, "MUSIQ_crop": 43.85439727157533, "PSNR_loc": 31.488873457756767, "SSIM_loc": 0.974685608615182, "LPIPS_loc": 0.028573400793733536, "DreamSim_loc": 0.012038936603600812, "n_clips": 157}
6
  {"method": "RS-STE", "family": "A — per-frame image editor", "organization": "Zhao et al., 2025", "paper_url": "https://arxiv.org/abs/2503.17774", "code_url": "https://github.com/honglei-zhao/RS-STE", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.4907994847015915, "SeqAcc": 0.3539735290248826, "CharAcc": 0.6258597181299173, "TTS": 0.5336598236730271, "Flicker_full": 3.728183996539611, "Flicker_crop": 3.66286053942277, "Warp_full": 1.6050723235894067, "Warp_crop": 1.8147908492065754, "MUSIQ_full": 69.57172569297175, "MUSIQ_crop": 34.26484699258097, "PSNR_loc": 37.00242438437832, "SSIM_loc": 0.9830883838061237, "LPIPS_loc": 0.02354780357549038, "DreamSim_loc": 0.007322213357421243, "n_clips": 157}
7
  {"method": "AnyText2", "family": "A — per-frame image editor", "organization": "Tuo et al., 2024", "paper_url": "https://arxiv.org/abs/2411.15245", "code_url": "https://github.com/tyxsspa/AnyText2", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.4074467792171563, "SeqAcc": 0.27973291436891057, "CharAcc": 0.6332210144114335, "TTS": 0.38186844987155294, "Flicker_full": 3.3398760175596904, "Flicker_crop": 4.9545532378085495, "Warp_full": 2.0425352598531266, "Warp_crop": 3.9516436691464083, "MUSIQ_full": 66.67552189796594, "MUSIQ_crop": 41.65317273156116, "PSNR_loc": 25.55582150532182, "SSIM_loc": 0.9047352040975998, "LPIPS_loc": 0.09148503486230188, "DreamSim_loc": 0.0430956823557552, "n_clips": 157}
8
  {"method": "TextCtrl + AnyV2V", "family": "B — first-frame + I2V propagation", "organization": "Composite of Zeng 2024 + Ku 2024", "paper_url": "", "code_url": "", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.16492592568061926, "SeqAcc": 0.056786779447094926, "CharAcc": 0.30779952787986514, "TTS": 0.2566561069340611, "Flicker_full": 4.9802852382134075, "Flicker_crop": 4.977045501548225, "Warp_full": 4.107769233013561, "Warp_crop": 3.9674498647429868, "MUSIQ_full": 69.41088303841349, "MUSIQ_crop": 33.85112622206451, "PSNR_loc": 21.084345629665755, "SSIM_loc": 0.7846697544754223, "LPIPS_loc": 0.2249758477633198, "DreamSim_loc": 0.07321823651876672, "n_clips": 157}
9
+ {"method": "Identity (sanity)", "family": "—", "organization": "—", "paper_url": "", "code_url": "", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.0, "SeqAcc": 0.0, "CharAcc": 0.3165408461004333, "TTS": 0.7598748923514798, "Flicker_full": 3.7225515203842283, "Flicker_crop": 3.6811893970338505, "Warp_full": 1.46410686887435, "Warp_crop": 1.2690219210512326, "MUSIQ_full": 70.32922646265375, "MUSIQ_crop": 45.12224410273406, "PSNR_loc": 100.0, "SSIM_loc": 1.0, "LPIPS_loc": 0.0, "DreamSim_loc": -4.1919402509231723e-08, "n_clips": 157}
10
  {"method": "Wan2.1-VACE-14B", "family": "C — mask-conditioned video inpainting", "organization": "Wan-AI, 2025", "paper_url": "https://arxiv.org/abs/2503.07598", "code_url": "https://huggingface.co/Wan-AI/Wan2.1-VACE-14B", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.0, "SeqAcc": 0.0, "CharAcc": 0.29842756354252414, "TTS": 0.6894925190282424, "Flicker_full": 3.777380530928976, "Flicker_crop": 3.841096694996713, "Warp_full": 1.6876533062098615, "Warp_crop": 1.5609657061512832, "MUSIQ_full": 70.53707020378923, "MUSIQ_crop": 45.256712742544, "PSNR_loc": 35.21163969961195, "SSIM_loc": 0.9761949368626082, "LPIPS_loc": 0.021842662243107273, "DreamSim_loc": 0.007056991990078281, "n_clips": 157}
11
  {"method": "Kling Video 3.0 Omni", "family": "D — instruction-guided V2V", "organization": "Kuaishou (closed)", "paper_url": "", "code_url": "", "submitter": "admin", "submitted_at": "2026-05-04 00:00:00 UTC", "approved_at": "2026-05-04 00:00:00 UTC", "status": "approved", "TextScore": 0.0, "SeqAcc": 0.0, "CharAcc": 0.20752862556378657, "TTS": 0.6408892575809407, "Flicker_full": 4.247679066116782, "Flicker_crop": 4.081252510324976, "Warp_full": 3.1189079573144176, "Warp_crop": 2.902087520422552, "MUSIQ_full": 72.23268125973436, "MUSIQ_crop": 47.745845725613485, "PSNR_loc": 21.181631575850673, "SSIM_loc": 0.843030764793486, "LPIPS_loc": 0.17594784468030775, "DreamSim_loc": 0.060776721627595835, "n_clips": 157}