import gradio as gr

from langground import LangGround, text_palette

# Cache the loaded LangGround instance so models are only reloaded when the
# selected localization or LLM model changes.
state = {"loc_model": None, "llm_model": None, "model": None}


def load_model(loc_model: str, llm_model: str) -> LangGround:
    if (loc_model, llm_model) != (state["loc_model"], state["llm_model"]):
        gr.Info("Loading models...", duration=5)
        state.update(
            {
                "model": LangGround(loc_model=loc_model, llm_model=llm_model),
                "loc_model": loc_model,
                "llm_model": llm_model,
            }
        )
        gr.Info("Models loaded!", duration=2.5)
    return state["model"]


def predict(frame, question: str, threshold: float, loc_model: str, llm_model: str):
    if frame is None or not question.strip():
        gr.Warning("Please provide both an image and a question")
        return "", None, None
    model = load_model(loc_model, llm_model)
    # Returns values for the three outputs wired up below: the highlighted
    # object list, all detections, and the LLM-selected detections.
    return model.localize(frame, question, threshold=threshold)


title = """

<h1>🔍 Language Localization</h1>

<p>Upload an image and ask questions to find objects in it.</p>
""" css = """.my-group {max-width: 600px !important; max-height: 600px !important;} .my-column {display: flex !important; justify-content: center !important; align-items: center !important;}""" with gr.Blocks(css=css) as demo: gr.HTML(title) with gr.Row(): with gr.Column(scale=1): frame_input = gr.Image(type="pil", label="Upload Frame") with gr.Column(scale=1): with gr.Row(): with gr.Column(scale=1): loc_model_input = gr.Dropdown( choices=["yolo", "owl"], value="yolo", label="Localization Model", ) with gr.Column(scale=2): llm_model_input = gr.Dropdown( choices=[ "Qwen/Qwen2.5-7B-Instruct", "OpenGVLab/InternVL2_5-8B", "OpenGVLab/InternVL2_5-4B", "OpenGVLab/InternVL2_5-2B", "OpenGVLab/InternVL2_5-1B", ], value="Qwen/Qwen2.5-7B-Instruct", label="LLM Model", ) threshold_input = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Threshold") question_input = gr.Textbox(lines=2, placeholder="Enter your question here", label="Question") objs = gr.Highlightedtext(show_legend=False, show_inline_category=False, color_map=text_palette, label="Objects Found") submit_btn = gr.Button("Submit") with gr.Row(): all_bbox_image = gr.Image(label="Found Objects") llm_bbox_image = gr.Image(label="Selected Objects") submit_btn.click( fn=predict, inputs=[frame_input, question_input, threshold_input, loc_model_input, llm_model_input], outputs=[objs, all_bbox_image, llm_bbox_image], ) examples = gr.Examples( examples=[ ["assets/demo.jpeg", "I'm thirsty"], ["assets/kitchen.webp", "The food has expired and is no longer safe to eat."], ["assets/kitchen.webp", "The food is about to expire."], ], inputs=[frame_input, question_input], ) if __name__ == "__main__": demo.launch()