|
|
|
import spaces |
|
from transformers import AutoProcessor, AutoModelForCausalLM |
|
|
|
|
|
|
|
# Hugging Face Hub checkpoint used for both the model weights and the processor.
model_id = "microsoft/Florence-2-large-ft"

# Load the checkpoint (it ships custom modelling code, hence trust_remote_code),
# then place it on the GPU and switch it to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model = model.to("cuda").eval()

# Matching processor, used to build model inputs and to decode/post-process outputs.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
|
|
|
|
@spaces.GPU(duration=120)
def run_example(task_prompt, image, text_input=None):
    """
    Run a single inference pass with the module-level model and processor.

    Args:
        task_prompt (str): Task prompt; it is also passed as ``task`` to the
            processor's post-processing step.
        image (PIL.Image.Image): Image to process. Its ``width`` and ``height``
            are forwarded to post-processing as ``image_size``.
        text_input (str, optional): Extra text concatenated directly after
            ``task_prompt`` (no separator is inserted). Defaults to None.

    Returns:
        The value returned by ``processor.post_process_generation`` for the
        first decoded sequence.
    """
    # Use the bare task prompt unless additional text was supplied.
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Build the model inputs and move them to the GPU.
    encoded = processor(text=prompt, images=image, return_tensors="pt").to("cuda")

    # Deterministic decoding: beam search over 3 beams, sampling disabled.
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text; presumably the
    # post-processing step relies on them -- confirm against the processor.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    first_sequence = decoded[0]

    image_size = (image.width, image.height)
    return processor.post_process_generation(
        first_sequence, task=task_prompt, image_size=image_size
    )
|
|