xiangan committed
Commit 1014901 · 1 Parent(s): 8a816b1
Files changed (2)
  1. app.py +129 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,129 @@
+ import base64
+ import mimetypes
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ import gradio as gr
+ from openai import OpenAI
+
+ DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")
+
+ # OpenAI-compatible client; endpoint and key come from the environment.
+ _client = OpenAI(
+     base_url=os.getenv("BASE_URL", ""),
+     api_key=os.getenv("API_KEY", ""),
+ )
+
+
+ def _data_url(path: str) -> str:
+     """Encode a local file as a base64 data URL for the chat API."""
+     mime, _ = mimetypes.guess_type(path)
+     mime = mime or "application/octet-stream"
+     data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
+     return f"data:{mime};base64,{data}"
+
+
+ def _image_content(path: str) -> Dict[str, Any]:
+     return {"type": "image_url", "image_url": {"url": _data_url(path)}}
+
+
+ def _text_content(text: str) -> Dict[str, Any]:
+     return {"type": "text", "text": text}
+
+
+ def _message(role: str, content: Any) -> Dict[str, Any]:
+     return {"role": role, "content": content}
+
+
+ def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
+     """Turn a MultimodalTextbox payload ({"text": ..., "files": [...]})
+     into an OpenAI-style user message, images before text."""
+     files = message.get("files") or []
+     text = (message.get("text") or "").strip()
+     content: List[Dict[str, Any]] = [_image_content(p) for p in files]
+     if text:
+         content.append(_text_content(text))
+     return _message("user", content)
+
+
+ def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Convert Gradio "messages" history into OpenAI chat messages,
+     folding consecutive user text/image entries into a single user turn."""
+     msgs: List[Dict[str, Any]] = []
+     user_content: List[Dict[str, Any]] = []
+
+     for turn in history or []:
+         role, content = turn.get("role"), turn.get("content")
+         if role == "user":
+             if isinstance(content, str):
+                 user_content.append(_text_content(content))
+             elif isinstance(content, tuple):
+                 # Gradio represents uploaded files as a tuple of paths.
+                 user_content.extend(
+                     _image_content(path) for path in content if path
+                 )
+         elif role == "assistant":
+             msgs.append(_message("user", user_content.copy()))
+             user_content.clear()
+             msgs.append(_message("assistant", content))
+     return msgs
+
+
+ def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
+     """Stream the model's reply, yielding the accumulated text so far."""
+     messages = _convert_history(history)
+     messages.append(_build_user_message(message))
+     try:
+         stream = _client.chat.completions.create(
+             model=model_name,
+             messages=messages,
+             temperature=0.000001,  # near-greedy decoding
+             top_p=1,
+             extra_body={
+                 "repetition_penalty": 1.05,
+                 "frequency_penalty": 0,
+                 "presence_penalty": 0,
+             },
+             stream=True,
+         )
+         partial = ""
+         for chunk in stream:
+             delta = chunk.choices[0].delta.content
+             if delta:
+                 partial += delta
+                 yield partial
+     except Exception as e:
+         yield f"Failed to get response: {e}"
+
+
+ def build_demo() -> gr.Blocks:
+     chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
+     textbox = gr.MultimodalTextbox(
+         show_label=False,
+         placeholder="Enter text, or upload one or more images...",
+         file_types=["image"],
+         file_count="multiple",  # was "single"; the placeholder and _build_user_message both expect a list of images
+         max_plain_text_length=32768,
+     )
+     model_selector = gr.Dropdown(
+         label="Model",
+         choices=[
+             ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
+             ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
+         ],
+         value=DEFAULT_MODEL,
+     )
+     return gr.ChatInterface(
+         fn=stream_response,
+         type="messages",
+         multimodal=True,
+         chatbot=chatbot,
+         textbox=textbox,
+         title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
+         description="""**LLaVA-OneVision-1.5** introduces a family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance at substantially lower cost through training on native-resolution images.
+
+ 🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)""",
+         additional_inputs=[model_selector],
+         additional_inputs_accordion=gr.Accordion("Options", open=True),
+     ).queue(default_concurrency_limit=8)
+
+
+ def main():
+     build_demo().launch()
+
+
+ if __name__ == "__main__":
+     main()
+
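To sanity-check the endpoint wiring without launching the UI, a minimal smoke test could look like the sketch below. The endpoint URL, key, and `sample.jpg` are placeholders, not part of the commit; the sketch feeds `stream_response` the same `{"text": ..., "files": [...]}` payload shape that `gr.MultimodalTextbox` hands to the chat function.

```python
# Smoke test sketch (assumes an OpenAI-compatible server and a local image).
import os

# Set the endpoint before importing app, since app.py builds its client at import time.
os.environ.setdefault("BASE_URL", "http://localhost:8000/v1")  # hypothetical endpoint
os.environ.setdefault("API_KEY", "EMPTY")  # placeholder key

from app import stream_response

# Same payload shape that gr.MultimodalTextbox produces.
message = {"text": "Describe this image.", "files": ["sample.jpg"]}  # hypothetical image path

# stream_response yields the accumulated reply; keep only the final state.
final = ""
for partial in stream_response(message, history=[]):
    final = partial
print(final)
```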
requirements.txt ADDED
@@ -0,0 +1 @@
+ openai
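Note that `gradio` is not listed: on a Gradio Space the runtime supplies it from the Space's SDK configuration, so only the extra `openai` dependency needs pinning. Running app.py outside a Space would also require installing `gradio`.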