gemma-3-12b-it

Running on Zero

App Files Community

rmdhirr committed on Jul 9

Commit

d35b974

verified ·

1 Parent(s): 8be4895

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -123

app.py CHANGED Viewed

@@ -13,34 +13,44 @@ import torch
 from loguru import logger
 from PIL import Image
 from peft import PeftModel
-from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, AutoModelForImageTextToText
-adapter_id = "rmdhirr/test4bit6ab"
 # Load processor (tokenizer + feature extractor)
 processor = AutoProcessor.from_pretrained(
-    adapter_id,
     padding_side="left"
 )
-model = Gemma3ForConditionalGeneration.from_pretrained(
-    adapter_id,                  # e.g. "rmdhirr/test4bit-b"
-    torch_dtype=torch.bfloat16,  # same dtype you were using
-    device_map="auto",           # or however you shard
-    ignore_mismatched_sizes=True # only if you still see tiny shape warnings
-)
 model.eval()
-# ########################################
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
-    image_count = 0
-    video_count = 0
     for path in paths:
         if path.endswith(".mp4"):
             video_count += 1
@@ -48,10 +58,8 @@ def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
             image_count += 1
     return image_count, video_count
 def count_files_in_history(history: list[dict]) -> tuple[int, int]:
-    image_count = 0
-    video_count = 0
     for item in history:
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
@@ -61,122 +69,104 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
             image_count += 1
     return image_count, video_count
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
-    new_image_count, new_video_count = count_files_in_new_message(message["files"])
-    history_image_count, history_video_count = count_files_in_history(history)
-    image_count = history_image_count + new_image_count
-    video_count = history_video_count + new_video_count
-    if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
-    if video_count == 1:
-        if image_count > 0:
-            gr.Warning("Mixing images and videos is not allowed.")
-            return False
-        if "<image>" in message["text"]:
-            gr.Warning("Using <image> tags with video files is not supported.")
-            return False
-    if video_count == 0 and image_count > MAX_NUM_IMAGES:
-        gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
-    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
-        gr.Warning("The number of <image> tags in the text does not match the number of images.")
         return False
     return True
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    frame_interval = max(total_frames // MAX_NUM_IMAGES, 1)
-    frames: list[tuple[Image.Image, float]] = []
-    for i in range(0, min(total_frames, MAX_NUM_IMAGES * frame_interval), frame_interval):
         if len(frames) >= MAX_NUM_IMAGES:
             break
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
-def process_video(video_path: str) -> list[dict]:
-    content = []
-    frames = downsample_video(video_path)
-    for frame in frames:
-        pil_image, timestamp = frame
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-            pil_image.save(temp_file.name)
-            content.append({"type": "text", "text": f"Frame {timestamp}:"})
-            content.append({"type": "image", "url": temp_file.name})
-    logger.debug(f"{content=}")
-    return content
 def process_interleaved_images(message: dict) -> list[dict]:
-    logger.debug(f"{message['files']=}")
     parts = re.split(r"(<image>)", message["text"])
-    logger.debug(f"{parts=}")
-    content = []
-    image_index = 0
-    for part in parts:
-        logger.debug(f"{part=}")
-        if part == "<image>":
-            content.append({"type": "image", "url": message["files"][image_index]})
-            logger.debug(f"file: {message['files'][image_index]}")
-            image_index += 1
-        elif part.strip():
-            content.append({"type": "text", "text": part.strip()})
-        elif isinstance(part, str) and part != "<image>":
-            content.append({"type": "text", "text": part})
-    logger.debug(f"{content=}")
-    return content
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
-        return [{"type": "text", "text": message["text"]}]
     if message["files"][0].endswith(".mp4"):
-        return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
-    return [
-        {"type": "text", "text": message["text"]},
-        *[{"type": "image", "url": path} for path in message["files"]],
-    ]
 def process_history(history: list[dict]) -> list[dict]:
-    messages = []
-    current_user_content: list[dict] = []
     for item in history:
         if item["role"] == "assistant":
-            if current_user_content:
-                messages.append({"role": "user", "content": current_user_content})
-                current_user_content = []
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
         else:
-            content = item["content"]
-            if isinstance(content, str):
-                current_user_content.append({"type": "text", "text": content})
             else:
-                current_user_content.append({"type": "image", "url": content[0]})
-    return messages
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
@@ -184,34 +174,42 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
         yield ""
         return
-    messages = []
     if system_prompt:
-        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
-    messages.extend(process_history(history))
-    messages.append({"role": "user", "content": process_new_user_message(message)})
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
         return_tensors="pt",
     ).to(device=model.device, dtype=torch.bfloat16)
     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-    output = ""
     for delta in streamer:
-        output += delta
-        yield output
 examples = [
     [
@@ -269,8 +267,7 @@ examples = [
                 "assets/sample-images/09-2.png",
                 "assets/sample-images/09-3.png",
                 "assets/sample-images/09-4.png",
-                "assets/sample-images/09-5.png",
-            ],
         }
     ],
     [
@@ -305,13 +302,13 @@ examples = [
     ],
     [
         {
-            "text": "caption this image",
             "files": ["assets/sample-images/01.png"],
         }
     ],
     [
         {
-            "text": "What's the sign says?",
             "files": ["assets/sample-images/02.png"],
         }
     ],
@@ -362,4 +359,4 @@ demo = gr.ChatInterface(
 )
 if __name__ == "__main__":
-    demo.launch()

 from loguru import logger
 from PIL import Image
 from peft import PeftModel
+from transformers import (
+    AutoProcessor,
+    Gemma3ForConditionalGeneration,
+    TextIteratorStreamer,
+)
+# Set model and adapter IDs
+model_id = os.getenv("MODEL_ID", "google/gemma-3-12b-pt")
+adapter_id = os.getenv("ADAPTER_ID", "slavamarcin/HG_Gemma-3-12B-4bit-QLora_purpose")
+# Load Gemma base model and move to GPU, using bfloat16
+model = Gemma3ForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="eager"
+).to("cuda")
 # Load processor (tokenizer + feature extractor)
 processor = AutoProcessor.from_pretrained(
+    model_id,
     padding_side="left"
 )
+# Wrap with PEFT adapter and move to GPU
+#model = PeftModel.from_pretrained(
+#    model,
+#    adapter_id,
+#    ignore_mismatched_sizes=True
+#).to("cuda")
+# Switch to evaluation mode
 model.eval()
 MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
 def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
+    image_count = video_count = 0
     for path in paths:
         if path.endswith(".mp4"):
             video_count += 1
             image_count += 1
     return image_count, video_count
 def count_files_in_history(history: list[dict]) -> tuple[int, int]:
+    image_count = video_count = 0
     for item in history:
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
             image_count += 1
     return image_count, video_count
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
+    new_i, new_v = count_files_in_new_message(message["files"])
+    hist_i, hist_v = count_files_in_history(history)
+    if hist_v + new_v > 1:
         gr.Warning("Only one video is supported.")
         return False
+    if hist_v + new_v == 1 and (hist_i + new_i) > 0:
+        gr.Warning("Mixing images and videos is not allowed.")
+        return False
+    if "<image>" in message["text"] and message["text"].count("<image>") != new_i:
+        gr.Warning("The number of <image> tags doesn't match the number of images.")
         return False
+    if hist_v + new_v == 0 and (hist_i + new_i) > MAX_NUM_IMAGES:
+        gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
     return True
 def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
     fps = vidcap.get(cv2.CAP_PROP_FPS)
+    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = max(total // MAX_NUM_IMAGES, 1)
+    frames = []
+    for i in range(0, min(total, MAX_NUM_IMAGES * interval), interval):
         if len(frames) >= MAX_NUM_IMAGES:
             break
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ok, img = vidcap.read()
+        if not ok:
+            continue
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        pil = Image.fromarray(img)
+        frames.append((pil, round(i / fps, 2)))
     vidcap.release()
     return frames
+def process_video(path: str) -> list[dict]:
+    out = []
+    for pil, ts in downsample_video(path):
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
+            pil.save(tmp.name)
+            out.append({"type":"text", "text":f"Frame {ts}:"})
+            out.append({"type":"image", "url":tmp.name})
+    return out
 def process_interleaved_images(message: dict) -> list[dict]:
     parts = re.split(r"(<image>)", message["text"])
+    out = []
+    idx = 0
+    for p in parts:
+        if p == "<image>":
+            out.append({"type":"image","url":message["files"][idx]})
+            idx += 1
+        elif p.strip():
+            out.append({"type":"text","text":p.strip()})
+    return out
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
+        return [{"type":"text","text":message["text"]}]
     if message["files"][0].endswith(".mp4"):
+        return [{"type":"text","text":message["text"]}] + process_video(message["files"][0])
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
+    return [{"type":"text","text":message["text"]}] + [{"type":"image","url":f} for f in message["files"]]
 def process_history(history: list[dict]) -> list[dict]:
+    msgs = []
+    user_buffer = []
     for item in history:
         if item["role"] == "assistant":
+            if user_buffer:
+                msgs.append({"role":"user","content":user_buffer})
+                user_buffer = []
+            msgs.append({"role":"assistant","content":[{"type":"text","text":item["content"]}]})
         else:
+            cnt = item["content"]
+            if isinstance(cnt, str):
+                user_buffer.append({"type":"text","text":cnt})
             else:
+                user_buffer.append({"type":"image","url":cnt[0]})
+    if user_buffer:
+        msgs.append({"role":"user","content":user_buffer})
+    return msgs
+# Build a simple ChatML-style prompt
+def build_prompt(messages: list[dict]) -> str:
+    prompt = ""
+    for msg in messages:
+        prompt += f"<|im_start|>{msg['role']}\n"
+        for part in msg["content"]:
+            if part["type"] == "text":
+                prompt += part["text"]
+            else:  # image placeholder
+                prompt += "<image>"
+        prompt += "\n"
+    prompt += "<|im_end|>\n"
+    return prompt
 @spaces.GPU(duration=120)
 def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
         yield ""
         return
+    msgs = []
     if system_prompt:
+        msgs.append({"role":"system","content":[{"type":"text","text":system_prompt}]})
+    msgs += process_history(history)
+    msgs.append({"role":"user","content":process_new_user_message(message)})
+    # Build text prompt and collect images
+    prompt = build_prompt(msgs)
+    images = []
+    for m in msgs:
+        for part in m["content"]:
+            if part["type"] == "image":
+                images.append(Image.open(part["url"]))
+    # Encode multimodal inputs directly
+    inputs = processor(
+        text=prompt,
+        images=images if images else None,
         return_tensors="pt",
+        padding=True
     ).to(device=model.device, dtype=torch.bfloat16)
+    # Stream generation
     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
+        **inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
+    out = ""
     for delta in streamer:
+        out += delta
+        yield out
 examples = [
     [
                 "assets/sample-images/09-2.png",
                 "assets/sample-images/09-3.png",
                 "assets/sample-images/09-4.png",
+                "assets/sample-images/09-5.png"],
         }
     ],
     [
     ],
     [
         {
+            "text": "Caption this image",
             "files": ["assets/sample-images/01.png"],
         }
     ],
     [
         {
+            "text": "What's the sign say?",
             "files": ["assets/sample-images/02.png"],
         }
     ],
 )
 if __name__ == "__main__":
+    demo.launch(share=True)