# Source: Hugging Face Space "akhaliq/gemma-3n-E4B-it" (status at capture: Paused).
# File size: 12,323 bytes.

import os
import pathlib
import tempfile
from collections.abc import Iterator
from threading import Thread

import av
import gradio as gr
import spaces
import torch
from gradio.utils import get_upload_folder
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.generation.streamers import TextIteratorStreamer

# Hub identifier of the checkpoint used for both the processor and the model.
model_id = "google/gemma-3n-E4B-it"

# Get HF token from environment
# Note the variable name is HF_TOKEN2; os.getenv returns None when it is unset,
# in which case None is what gets passed as `token` below.
HF_TOKEN2 = os.getenv("HF_TOKEN2")
access_token = HF_TOKEN2

# Load processor and model with authentication token
# Both calls run at import time, so importing this module triggers the loads.
processor = AutoProcessor.from_pretrained(model_id, token=access_token)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, 
    device_map="auto", 
    torch_dtype=torch.bfloat16,
    token=access_token
)

# Extension tuples used with str.endswith() to classify uploads; they are also
# concatenated to build the upload widget's `file_types` list.
IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
AUDIO_FILE_TYPES = (".mp3", ".wav")

# Passed as `parent_dir` when creating the temp directory for extracted video frames.
GRADIO_TEMP_DIR = get_upload_folder()

# Limits overridable through environment variables of the same name.
# TARGET_FPS / MAX_FRAMES control video frame sampling; MAX_INPUT_TOKENS caps
# the tokenized prompt length checked before generation.
TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))
MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))


def get_file_type(path: str) -> str:
    """Classify a media file as ``"image"``, ``"video"`` or ``"audio"``.

    The decision is based purely on the file extension and is
    case-insensitive, so ``photo.JPG`` and ``clip.MP4`` are recognised just
    like their lowercase spellings.

    Args:
        path: File path (or name) whose extension is inspected.

    Returns:
        ``"image"``, ``"video"`` or ``"audio"``.

    Raises:
        ValueError: If the extension is in none of the supported tuples.
    """
    # Normalise once; the original path is kept for the error message.
    lowered = path.lower()
    if lowered.endswith(IMAGE_FILE_TYPES):
        return "image"
    if lowered.endswith(VIDEO_FILE_TYPES):
        return "video"
    if lowered.endswith(AUDIO_FILE_TYPES):
        return "audio"
    error_message = f"Unsupported file type: {path}"
    raise ValueError(error_message)


def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
    """Count how many of ``paths`` are videos and how many are anything else.

    Extensions are compared case-insensitively so that e.g. ``clip.MP4`` is
    counted as a video rather than as "other media".

    Args:
        paths: File paths attached to a single chat message.

    Returns:
        A ``(video_count, non_video_count)`` tuple.
    """
    # Materialise once so any iterable (not only a list) can be counted.
    lowered = [path.lower() for path in paths]
    video_count = sum(1 for path in lowered if path.endswith(VIDEO_FILE_TYPES))
    return video_count, len(lowered) - video_count


def validate_media_constraints(message: dict) -> bool:
    """Check that the attachments of ``message`` form a supported combination.

    A message is accepted when it has no files, only non-video files, or
    exactly one video and nothing else. On rejection a Gradio warning is
    shown and ``False`` is returned.
    """
    print(f"Debug - Validating message: {message}")

    attached = message.get("files")
    if not attached:
        print("Debug - No files in message")
        return True

    print(f"Debug - Files to validate: {attached}")

    n_videos, n_others = count_files_in_new_message(attached)
    print(f"Debug - Video count: {n_videos}, Non-video count: {n_others}")

    # Work out which rule (if any) is violated, then report it in one place.
    problem = None
    if n_videos > 1:
        problem = "⚠️ Only one video is supported per message."
    elif n_videos == 1 and n_others > 0:
        problem = "⚠️ Cannot mix videos with other media types."

    if problem is None:
        return True

    gr.Warning(problem)
    return False


def extract_frames_to_tempdir(
    video_path: str,
    target_fps: float,
    max_frames: int | None = None,
    parent_dir: str | None = None,
    prefix: str = "frames_",
) -> str:
    print(f"Debug - Extracting frames from: {video_path}")
    
    # Validate video file exists
    if not os.path.exists(video_path):
        raise ValueError(f"Video file not found: {video_path}")
    
    temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
    print(f"Debug - Created temp dir: {temp_dir}")

    try:
        container = av.open(video_path)
        video_stream = container.streams.video[0]
        print(f"Debug - Video stream found: {video_stream}")

        if video_stream.duration is None or video_stream.time_base is None:
            raise ValueError("Video stream is missing duration or time_base information")

        time_base = video_stream.time_base
        duration = float(video_stream.duration * time_base)
        interval = 1.0 / target_fps

        total_frames = int(duration * target_fps)
        if max_frames is not None:
            total_frames = min(total_frames, max_frames)

        print(f"Debug - Will extract {total_frames} frames over {duration:.2f} seconds")

        target_times = [i * interval for i in range(total_frames)]
        target_index = 0
        extracted_count = 0

        for frame in container.decode(video=0):
            if frame.pts is None:
                continue

            timestamp = float(frame.pts * time_base)

            if target_index < len(target_times) and abs(timestamp - target_times[target_index]) < (interval / 2):
                frame_path = pathlib.Path(temp_dir) / f"frame_{target_index:04d}.jpg"
                frame.to_image().save(frame_path)
                target_index += 1
                extracted_count += 1

                if max_frames is not None and target_index >= max_frames:
                    break

        container.close()
        print(f"Debug - Successfully extracted {extracted_count} frames to {temp_dir}")
        return temp_dir
        
    except Exception as e:
        print(f"Debug - Error during frame extraction: {e}")
        # Clean up temp directory on error
        import shutil
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise


def process_new_user_message(message: dict) -> list[dict]:
    """Convert the incoming chat message into chat-template content parts.

    Args:
        message: Dict with a ``"text"`` entry and an optional ``"files"`` list
            of file paths.

    Returns:
        A list whose first element is the text part, followed by one part per
        attachment. A single video is expanded into one ``"image"`` part per
        extracted frame. When a video cannot be processed, or the attachment
        combination is unsupported, only the text part is returned and a
        Gradio warning is shown.

    Raises:
        ValueError: If an attachment has an unsupported extension.
    """
    # Debug: Print the message structure
    print(f"Debug - Received message: {message}")

    text_part = {"type": "text", "text": message["text"]}

    if not message.get("files"):
        return [text_part]

    file_types = [get_file_type(path) for path in message["files"]]
    print(f"Debug - Detected file types: {file_types}")

    # Handle video files
    if len(file_types) == 1 and file_types[0] == "video":
        print(f"Debug - Processing video: {message['files'][0]}")
        gr.Info(f"🎥 Processing video at {TARGET_FPS} FPS, max {MAX_FRAMES} frames. This may take a moment...")

        try:
            temp_dir = extract_frames_to_tempdir(
                message["files"][0],
                target_fps=TARGET_FPS,
                max_frames=MAX_FRAMES,
                parent_dir=GRADIO_TEMP_DIR,
            )
            paths = sorted(pathlib.Path(temp_dir).glob("*.jpg"))

            if not paths:
                gr.Warning("⚠️ Could not extract frames from video. Please try a different video format.")
                return [text_part]

            gr.Success(f"✅ Extracted {len(paths)} frames from video successfully!")
            print(f"Debug - Extracted {len(paths)} frames")

            return [
                text_part,
                *[{"type": "image", "image": path.as_posix()} for path in paths],
            ]
        except Exception as e:
            print(f"Debug - Video processing error: {e}")
            # gr.Error is an exception class: merely constructing it shows
            # nothing. Since we fall back to a text-only message instead of
            # aborting, surface the problem as a warning toast.
            gr.Warning(f"❌ Error processing video: {e}")
            return [text_part]

    # Handle mixed files or multiple videos
    if "video" in file_types:
        video_count = file_types.count("video")
        if video_count > 1:
            gr.Warning("⚠️ Only one video is supported per message. Please upload one video at a time.")
            return [text_part]

        non_video_count = len(file_types) - video_count
        if non_video_count > 0:
            gr.Warning("⚠️ Cannot mix videos with other file types. Please upload either a video alone or other files without video.")
            return [text_part]

    # Handle other file types normally
    return [
        text_part,
        *[{"type": file_type, file_type: path} for path, file_type in zip(message["files"], file_types, strict=True)],
    ]


def process_history(history: list[dict]) -> list[dict]:
    """Rebuild the chat history in the structure expected by the chat template.

    Consecutive non-assistant entries are merged into a single ``"user"``
    message that is emitted right before the next assistant entry. String
    content becomes a text part; any other content is treated as a sequence
    whose first element is a file path. User entries that are not followed by
    an assistant entry are not included in the result.
    """
    conversation: list[dict] = []
    pending_user_parts: list[dict] = []

    for turn in history:
        if turn["role"] != "assistant":
            payload = turn["content"]
            if isinstance(payload, str):
                part = {"type": "text", "text": payload}
            else:
                media_path = payload[0]
                kind = get_file_type(media_path)
                part = {"type": kind, kind: media_path}
            pending_user_parts.append(part)
            continue

        # Assistant turn: flush whatever the user said before it.
        if pending_user_parts:
            conversation.append({"role": "user", "content": pending_user_parts})
            pending_user_parts = []
        reply_part = {"type": "text", "text": turn["content"]}
        conversation.append({"role": "assistant", "content": [reply_part]})

    return conversation


@spaces.GPU(duration=120)
@torch.inference_mode()
def generate(message: dict, history: list[dict]) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the previous ``history``.

    Yields the accumulated response text after every streamed chunk. When
    validation, message processing, the token limit check or generation fails,
    a single apology string is yielded instead.
    """
    print(f"Debug - Generate called with message: {message}")
    print(f"Debug - Message keys: {message.keys()}")

    media_ok = validate_media_constraints(message)
    if not media_ok:
        print("Debug - Media constraints validation failed")
        yield "Sorry, there was an issue with the uploaded files. Please check the file types and try again."
        return

    system_prompt = "You are a helpful AI assistant. You can analyze images, transcribe audio, describe videos, and answer questions. Provide detailed, accurate, and helpful responses."
    conversation = []
    if system_prompt:
        system_part = {"type": "text", "text": system_prompt}
        conversation.append({"role": "system", "content": [system_part]})
    conversation += process_history(history)

    try:
        user_content = process_new_user_message(message)
        print(f"Debug - Processed user content: {user_content}")
        conversation.append({"role": "user", "content": user_content})
    except Exception as err:
        print(f"Debug - Error processing user message: {err}")
        yield f"Sorry, there was an error processing your message: {str(err)}"
        return

    try:
        model_inputs = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Refuse over-long prompts before anything is moved to the model device.
        prompt_length = model_inputs["input_ids"].shape[1]
        if prompt_length > MAX_INPUT_TOKENS:
            gr.Warning(
                f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {prompt_length} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
            )
            yield "Sorry, your input is too long. Please try with shorter text or fewer files."
            return

        model_inputs = model_inputs.to(device=model.device, dtype=torch.bfloat16)

        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
        generation_args = dict(model_inputs)
        generation_args.update(
            streamer=streamer,
            max_new_tokens=700,
            do_sample=False,
            disable_compile=True,
        )

        # model.generate runs on a worker thread and feeds the streamer, which
        # is consumed here so partial text can be yielded as it arrives.
        worker = Thread(target=model.generate, kwargs=generation_args)
        worker.start()

        accumulated = ""
        for piece in streamer:
            accumulated += piece
            yield accumulated

    except Exception as err:
        print(f"Debug - Error during generation: {err}")
        yield f"Sorry, there was an error generating the response: {str(err)}"


def chat_fn(message, history):
    """Main chat function that handles multimodal input and generates responses.

    Args:
        message: Either a dict with ``"text"`` and ``"files"`` entries (as
            produced by a multimodal textbox) or a plain value that is
            converted to text.
        history: Previous conversation turns, forwarded to ``generate``.

    Yields:
        The progressively growing response text. For an empty message (no
        text and no files) a single empty string is yielded.
    """
    # Handle multimodal input from MultimodalTextbox
    if isinstance(message, dict):
        # `or` also covers keys that are present but set to None.
        text = message.get("text") or ""
        files = message.get("files") or []
    elif message:
        text = str(message)
        files = []
    else:
        text = ""
        files = []

    if not text.strip() and not files:
        # This function is a generator: `return ""` would emit nothing at all,
        # so the empty reply has to be yielded explicitly.
        yield ""
        return

    # Create message dict for processing
    message_dict = {
        "text": text,
        "files": [f.name if hasattr(f, "name") else f for f in files],
    }

    # Generate streaming response
    yield from generate(message_dict, history)


# Create the ChatInterface - pure Gradio with no custom CSS
# chat_fn is a generator, so responses are streamed to the UI chunk by chunk.
demo = gr.ChatInterface(
    fn=chat_fn,
    multimodal=True,
    type="messages",
    textbox=gr.MultimodalTextbox(
        placeholder="Message Gemma...",
        container=False,
        scale=7,
        # Accept exactly the extensions the processing code knows how to classify.
        file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
        file_count="multiple",
        show_label=False
    ),
    title="Gemma",
    description=None,
    examples=None,
    cache_examples=False,
    theme=gr.themes.Soft(
        primary_hue="emerald",
        secondary_hue="slate", 
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter")
    ),
    fill_height=True,
    # NOTE(review): Gradio documents delete_cache as (frequency, age) in seconds,
    # i.e. this presumably removes cached files older than 100 s every 100 s.
    # Confirm that is long enough for multi-turn chats whose history still
    # references earlier uploads.
    delete_cache=(100, 100),
    show_progress="minimal",
    concurrency_limit=10,
    autofocus=True
)

# Launch only when run as a script; show_error=True asks Gradio to surface errors in the UI.
if __name__ == "__main__":
    demo.launch(share=True, show_error=True)