init
- app.py +129 -0
- requirements.txt +1 -0
app.py
ADDED
@@ -0,0 +1,129 @@
import base64
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI

DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")

# OpenAI-compatible client; the endpoint and key come from environment variables.
_client = OpenAI(
    base_url=os.getenv("BASE_URL", ""),
    api_key=os.getenv("API_KEY", ""),
)


def _data_url(path: str) -> str:
    # Encode a local image file as a base64 data URL.
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"


def _image_content(path: str) -> Dict[str, Any]:
    return {"type": "image_url", "image_url": {"url": _data_url(path)}}


def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}


def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}


def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    # Turn a MultimodalTextbox value ({"text": ..., "files": [...]}) into one
    # OpenAI-style user message: image parts first, then the text part.
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_image_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)


def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Rebuild alternating user/assistant messages from Gradio's messages-format
    # history: consecutive user items (text strings or file-path tuples) are
    # accumulated and flushed as one multimodal user message when an assistant
    # turn appears.
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []

    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                user_content.extend(_image_content(path)
                                    for path in content if path)
        elif role == "assistant":
            msgs.append(_message("user", user_content.copy()))
            user_content.clear()
            msgs.append(_message("assistant", content))
    return msgs


def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
    # Streaming callback for gr.ChatInterface: yield the growing partial reply
    # as completion chunks arrive from the server.
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.000001,
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        yield f"Failed to get response: {e}"


def build_demo() -> gr.Blocks:
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
        file_count="single",
        max_plain_text_length=32768,
    )
    model_selector = gr.Dropdown(
        label="Model",
        choices=[
            ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
            ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
        ],
        value=DEFAULT_MODEL,
    )
    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
        description="""**LLaVA-OneVision-1.5** introduces a novel family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance with substantially lower cost through training on native-resolution images.

🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)""",
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    ).queue(default_concurrency_limit=8)


def main():
    build_demo().launch()


if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1 @@
openai
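For a quick smoke test outside the Space UI, the streaming callback can be driven directly. This is a minimal sketch, not part of the commit: the endpoint URL, API key value, and image path below are placeholder assumptions, and the environment variables must be set before app.py is imported because the OpenAI client is created at module import time.

import os

# Assumed values for a local OpenAI-compatible server; replace with a real
# endpoint and key, and point "cat.jpg" at an image that actually exists.
os.environ.setdefault("BASE_URL", "http://localhost:8000/v1")
os.environ.setdefault("API_KEY", "EMPTY")

from app import stream_response

# One multimodal turn: an image plus a text prompt, with an empty chat history.
for partial in stream_response({"text": "Describe this image.", "files": ["cat.jpg"]}, history=[]):
    print(partial)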