Spaces:

bdsqlsz
/

SD3-Llava-Llama3-Captioner

Running on Zero

File size: 2,089 Bytes

from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
from lmdeploy.vl import load_image
import spaces
import gradio as gr
from PIL import Image
import numpy as np

@spaces.GPU
def create_captions_llava_llama3_docci(image):
    """Caption ``image`` with the ``Lin-Chen/open-llava-next-llama3-8b`` pipeline.

    Args:
        image: Pixel data that ``np.uint8`` / ``Image.fromarray`` can turn into
            a PIL image (presumably the NumPy array delivered by the
            ``gr.Image`` input component -- confirm against the Gradio
            version in use). ``None`` means no picture was supplied.

    Returns:
        The ``text`` attribute of the pipeline response, i.e. the caption.

    Raises:
        gr.Error: If ``image`` is ``None``, so the UI shows a readable message
            instead of an opaque ``TypeError`` from ``np.uint8(None)``.
    """
    # Validate before the expensive model load so an empty submit fails fast.
    if image is None:
        raise gr.Error("Please provide an image before requesting a caption.")

    # Instruction sent to the model together with the picture. The wording is
    # part of the runtime behaviour and must stay exactly as is.
    prompt = (
        'As an AI image annotation expert, provide accurate captions for this images '
        'to enhance the T5 model understanding of the content. '
        'Prioritize captions based on relevance. '
        'Your captions should include key elements of the image such as content, style, etc. '
        'As well as background content and any other important captions. '
        'if the image has an obvious style or filter, it needs to be captioned as well.'
        'your captions should be accurate and not repetitive. '
        'These captions will be used for image reconstruction, so the more similar to the '
        'original image, the better the quality of the captions. '
        'Special captions will be rewarded with $10 per image. '
        'Only the content of the captions will be output in the end.'
    )

    # Normalise the input to an 8-bit RGB PIL image for the pipeline.
    rgb_image = Image.fromarray(np.uint8(image)).convert('RGB')

    # NOTE(review): the pipeline is rebuilt on every call. Caching it at module
    # level would avoid reloading the model, but whether that is safe under
    # the @spaces.GPU execution model needs to be confirmed first.
    pipe = pipeline('Lin-Chen/open-llava-next-llama3-8b')
    gen_config = GenerationConfig(repetition_penalty=1.10)

    response = pipe((prompt, rgb_image), gen_config=gen_config)
    return response.text

# Custom stylesheet handed to gr.Blocks(css=css) below. It styles the element
# with id "mkd" as a fixed-height (500px), scrollable box with a light border.
# NOTE(review): no component in the visible code sets elem_id="mkd" -- confirm
# whether this rule is still needed.
css = """
  #mkd {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Assemble the Gradio interface: a heading, one tab holding the image input,
# the submit button and the caption output, plus clickable example images.
with gr.Blocks(css=css) as demo:
    # Heading banner. The closing tags need their slashes (</center></h1>);
    # without them the browser is left with unclosed <center>/<h1> elements.
    # NOTE(review): the heading names xtuner/llava-llama-3-8b-v1_1, while the
    # caption function loads 'Lin-Chen/open-llava-next-llama3-8b' -- confirm
    # which model description is intended.
    gr.HTML("<h1><center>Fine tuned version of xtuner/llava-llama-3-8b-v1_1 on google/docci dataset.</center></h1>")

    with gr.Tab(label="SD3 Llava Llama3 Captioner"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button(value="Submit")
                output = gr.Text(label="Caption")

        # Example pictures; selecting one feeds it through the same caption
        # function and writes the result into the caption box.
        gr.Examples(
            [["image1.jpg"], ["image2.jpg"], ["image3.png"]],
            inputs=[input_img],
            outputs=[output],
            fn=create_captions_llava_llama3_docci,
            label='Try captioning on examples',
        )

        # Manual submission path: caption the uploaded picture on click.
        submit_btn.click(create_captions_llava_llama3_docci, [input_img], [output])


# Start the app; debug=True is passed through to Gradio's launch().
demo.launch(debug=True)