nvidia
/

NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD

Image-Text-to-Text

8-bit precision

Model card Files Files and versions

zhiyucheng committed 19 days ago

Commit

bc3d4ad

·

verified ·

1 Parent(s): 11591f7

Delete quick_test_video.py

Files changed (1) hide show

quick_test_video.py +0 -83

quick_test_video.py DELETED Viewed

@@ -1,83 +0,0 @@
-import torch
-from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoImageProcessor, AutoProcessor
-from PIL import Image
-import video_io
-model_path = "/lustre/fsw/portfolios/llmservice/users/charlwang/vlm-hf-code/_ga_ckpt/iter200_hf"  # quick video-inference check: hard-coded checkpoint directory that everything below loads from
-device = "cuda:0"  # used both as the model's device_map and as the target for the processed inputs
-model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, torch_dtype=torch.bfloat16).eval()  # bfloat16 weights, switched to eval mode
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)  # NOTE(review): not referenced again in the lines visible here
-image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)  # NOTE(review): not referenced again in the lines visible here
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)  # builds the model inputs and decodes the generated ids below
-generation_config = dict(max_new_tokens=1024, do_sample=False, eos_token_id=tokenizer.eos_token_id)  # NOTE(review): never passed to generate(); the call below hard-codes max_new_tokens=128
-video_path = "images/demo.mp4"  # relative path; also embedded as a file:// reference in the user message
-video_fps = 1  # forwarded to video_io as fps
-video_nframe = 8  # forwarded to video_io as nframe
-video_nframe_max = -1  # forwarded to video_io as nframe_max; presumably -1 means "no cap" -- TODO confirm in video_io
-# Ask the project-local video_io helper for the video's frames (image_urls) and its metadata
-image_urls, metadata = video_io.maybe_path_or_url_to_data_urls(
-    video_path,
-    fps=max(0, int(video_fps)),  # negative values are clamped to 0
-    nframe=max(0, int(video_nframe)),  # negative values are clamped to 0
-    nframe_max=int(video_nframe_max),  # passed through unclamped, so -1 reaches the helper as-is
-)
-frames = [video_io.pil_image_from_base64(image_url) for image_url in image_urls]  # presumably one PIL image per returned entry (per the helper's name) -- TODO confirm
-print(f"Metadata: {metadata}")
-messages = [
-    {
-        "role": "system",
-        "content": "/no_think"  # presumably switches off the model's reasoning mode -- TODO confirm against the model card
-    },
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "video",
-                "video": f"file://{video_path}",  # only a path reference for the chat template; the frames themselves go to processor() below
-            },
-            {
-                "type": "text",
-                "text": "\nDescribe what you see.",
-            },
-        ],
-    }
-]
-prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)  # tokenize=False: result is passed to processor() as text
-# Build the model inputs; video metadata is forwarded via videos_kwargs only when the helper returned a truthy value
-if metadata:
-    inputs = processor(
-        text=[prompt],
-        videos=frames,
-        videos_kwargs={'video_metadata': metadata},
-        return_tensors="pt",
-    )
-else:
-    inputs = processor(
-        text=[prompt],
-        videos=frames,
-        return_tensors="pt",
-    )
-inputs = inputs.to(device)  # same device the model was loaded onto
-# Generate a response from the processed video tensors and prompt tokens
-model.video_pruning_rate = 0.75  # set on the model before generate(); presumably the fraction of video tokens pruned -- TODO confirm in the model's remote code
-generated_ids = model.generate(
-    pixel_values_videos=inputs.pixel_values_videos,
-    input_ids=inputs.input_ids,
-    attention_mask=inputs.attention_mask,
-    max_new_tokens=128,  # hard-coded here; differs from the 1024 in the unused generation_config
-)
-output_text = processor.batch_decode(
-    generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False  # special tokens are kept in the decoded text
-)
-print(f"Prompt: {prompt}\nOutput: {output_text[0]}\n\n\n")  # only the first decoded sequence is printed