"""Gradio demo that captions an uploaded image with the Florence-2 model."""

import importlib.util
import os
import subprocess
import sys

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoConfig

# Pass the requirement as a single argv element and skip the shell: inside a
# shell string the ">" of "transformers>=4.50.0" is parsed as an output
# redirection, which drops the version constraint and writes a stray file.
# NOTE: transformers is already imported above, so an upgrade performed here
# is only picked up by interpreters started afterwards.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--upgrade", "transformers>=4.50.0"],
    check=True,
)

model_id = "microsoft/Florence-2-base-ft"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Task prompt handed to the processor, and the key under which
# post_process_generation returns its answer. Kept in one place so the three
# uses below cannot drift apart.
# NOTE(review): the prompt is empty; a Florence-2 task token may have been
# intended here - confirm against the model card.
TASK_PROMPT = ""

# Loading the config with trust_remote_code imports the model's remote
# configuration module; its location on disk tells us where the remote code
# for this checkpoint was cached.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config_mod_name = config.__class__.__module__
config_mod = sys.modules[config_mod_name]
code_dir = os.path.dirname(config_mod.__file__)

# NOTE(review): assumes the model implementation is named
# "modeling_florence2.py" and already sits next to the cached config module,
# and that it can be executed as a standalone module - confirm.
modeling_path = os.path.join(code_dir, "modeling_florence2.py")

# Import the modeling file under a fixed module name and register it in
# sys.modules before executing it, so lookups of "florence2_modeling" resolve.
spec = importlib.util.spec_from_file_location("florence2_modeling", modeling_path)
flor_mod = importlib.util.module_from_spec(spec)
sys.modules["florence2_modeling"] = flor_mod
spec.loader.exec_module(flor_mod)

# NOTE(review): generate() below is called with pixel_values; confirm that
# this class (rather than another model class in the same file) accepts them.
FlorenceLM = flor_mod.Florence2LanguageForConditionalGeneration
florence_model = (
    FlorenceLM.from_pretrained(model_id, trust_remote_code=True).to(device).eval()
)
florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


def generate_caption(image):
    """Generate a text prompt describing *image* with the Florence-2 model.

    Args:
        image: A ``PIL.Image.Image``, or any object accepted by
            ``Image.fromarray`` (the Gradio image input supplies one of
            these), or ``None`` when nothing was uploaded.

    Returns:
        The post-processed text produced for ``TASK_PROMPT``, or an empty
        string when ``image`` is ``None``.
    """
    if image is None:
        # Nothing was uploaded; avoid failing inside Image.fromarray(None).
        return ""

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(text=TASK_PROMPT, images=image, return_tensors="pt")
    # Move every tensor produced by the processor onto the model's device.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text and handed to the
    # processor's own post-processing step.
    generated_text = florence_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=TASK_PROMPT,
        image_size=(image.width, image.height),
    )

    prompt = parsed_answer[TASK_PROMPT]
    print("\n\nGeneration completed!:" + prompt)
    return prompt


demo = gr.Interface(
    generate_caption,
    inputs=[gr.Image(label="Input Image")],
    outputs=[
        gr.Textbox(label="Output Prompt", lines=3, show_copy_button=True),
    ],
    theme="Yntec/HaleyCH_Theme_Orange",
)

demo.launch(debug=True)