import numpy as np
from PIL import Image
import gradio as gr
import torch
from transformers import AutoProcessor, Florence2ForConditionalGeneration

# Load the model and processor. Florence2ForConditionalGeneration needs a recent
# transformers release; older versions load Florence-2 through
# AutoModelForCausalLM with trust_remote_code=True instead.
model_name = "PJMixers-Images/Florence-2-base-Castollux-v0.5"
model = Florence2ForConditionalGeneration.from_pretrained(model_name).eval()
processor = AutoProcessor.from_pretrained(model_name)

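# Optional (an assumption, not part of the original Space): on a CUDA device you
# could load in half precision to cut memory, e.g.
#   model = Florence2ForConditionalGeneration.from_pretrained(
#       model_name, torch_dtype=torch.float16
#   ).eval()
# (pixel_values would then need to be cast to the same dtype before generate()).
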
# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TITLE = f"# [{model_name}](https://huggingface.co/{model_name})"

def process_image(image, task="<CAPTION>", num_beams=5, min_p=0.0, top_p=1.0):
    """
    Generate a caption for a single image.

    Accepts the image as a file path, a numpy array, or a PIL Image.
    Generation settings (num_beams, min_p, top_p) can be customized.
    """
    try:
        # Convert the input to an RGB PIL image if necessary
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, str):
            image = Image.open(image)
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Prepare model inputs (task prompt plus image pixel values)
        inputs = processor(
            text=task,
            images=image,
            return_tensors="pt",
        )
        # Move tensors to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Disable gradients during inference. Note: with do_sample=True and
        # num_beams > 1, transformers runs beam-search multinomial sampling.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                num_beams=num_beams,
                do_sample=True,
                top_p=top_p,
                min_p=min_p,
            )
        # Decode, then strip the special tokens kept by skip_special_tokens=False
        return processor.batch_decode(
            generated_ids,
            skip_special_tokens=False,
        )[0].replace('</s>', '').replace('<s>', '').replace('<pad>', '').strip()
    except Exception as e:
        return f"Error processing image: {e}"

# Custom CSS for the output box (targets the Textbox elem_id="output" below)
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    with gr.Tab(label="Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text", elem_id="output")
                submit_btn = gr.Button(value="Submit")
                task_dropdown = gr.Dropdown(
                    choices=["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>"],
                    value="<CAPTION>",
                    label="Captioning Mode",
                )
                num_beams_slider = gr.Slider(
                    minimum=1,
                    maximum=5,
                    step=1,
                    value=5,
                    label="Number of Beams",
                )
                min_p_slider = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=0.0,
                    label="Min-P",
                )
                top_p_slider = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=1.0,
                    label="Top-P",
                )
        gr.Examples(
            [
                ["eval_img_1.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_2.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_3.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_4.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_5.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_6.jpg", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_7.png", "<CAPTION>", 5, 0.0, 1.0],
                ["eval_img_8.jpg", "<CAPTION>", 5, 0.0, 1.0],
            ],
            inputs=[input_img, task_dropdown, num_beams_slider, min_p_slider, top_p_slider],
            outputs=[output_text],
            fn=process_image,
            label="Try captioning on the examples below",
        )
        submit_btn.click(
            process_image,
            [input_img, task_dropdown, num_beams_slider, min_p_slider, top_p_slider],
            [output_text],
        )

if __name__ == "__main__":
    demo.launch(debug=True)
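# Note (assumption, not part of the original app): on Hugging Face Spaces you may
# want request queuing under load, e.g. demo.queue().launch(debug=True)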