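"""Gradio demo: caption an uploaded image with five checkpoints from three
model families (GIT base/large, BLIP base/large, and ViT+GPT-2) and show
the generated captions side by side for comparison."""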
import torch
import gradio as gr
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    AutoImageProcessor,
    AutoModelForCausalLM,
    BlipForConditionalGeneration,
    VisionEncoderDecoderModel,
)



# Load all five checkpoints once at startup so each request only runs inference.
# GIT (Generative Image-to-text Transformer), fine-tuned on COCO:
git_processor_base = AutoProcessor.from_pretrained("microsoft/git-base-coco")
git_model_base = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

git_processor_large = AutoProcessor.from_pretrained("microsoft/git-large-coco")
git_model_large = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

# BLIP image-captioning checkpoints:
blip_processor_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# ViT encoder + GPT-2 decoder; this one needs a separate tokenizer because its
# image processor cannot decode generated token ids.
vitgpt_processor = AutoImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

git_model_base.to(device)
blip_model_base.to(device)
git_model_large.to(device)
blip_model_large.to(device)
vitgpt_model.to(device)

def generate_caption(processor, model, image, tokenizer=None):
    """Caption a single PIL image with the given processor/model pair.

    Pass a tokenizer only for VisionEncoderDecoder models; the other
    processors can decode token ids themselves.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Greedy decoding, capped at 50 tokens.
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)

    if tokenizer is not None:
        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    else:
        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_caption
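# Sketch (not part of the original app): the helper can also be called
# directly, assuming one of the bundled example images exists locally:
#
#   from PIL import Image
#   img = Image.open("cat.jpg").convert("RGB")
#   print(generate_caption(blip_processor_base, blip_model_base, img))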


def generate_captions(image):
    """Run the same image through all five models and return the captions
    in the order expected by the output components below."""
    caption_git_base = generate_caption(git_processor_base, git_model_base, image)
    caption_git_large = generate_caption(git_processor_large, git_model_large, image)
    caption_blip_base = generate_caption(blip_processor_base, blip_model_base, image)
    caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
    caption_vitgpt = generate_caption(vitgpt_processor, vitgpt_model, image, vitgpt_tokenizer)

    return caption_git_base, caption_git_large, caption_blip_base, caption_blip_large, caption_vitgpt

   
examples = [["cat.jpg"], ["dog.jpg"], ["horse.jpg"]]
# gr.outputs.* was removed in Gradio 4; use the components directly.
outputs = [
    gr.Textbox(label="Caption generated by GIT-base"),
    gr.Textbox(label="Caption generated by GIT-large"),
    gr.Textbox(label="Caption generated by BLIP-base"),
    gr.Textbox(label="Caption generated by BLIP-large"),
    gr.Textbox(label="Caption generated by ViT+GPT-2"),
]

title = "Image to Text: Multiple Models"
description = "Gradio demo comparing image captions from three vision+language model families (GIT, BLIP, and ViT+GPT2), with GIT and BLIP each in a base and a large variant. Upload an image and click 'Submit', or pick one of the provided examples."
article = "<p style='text-align: center'><a href='https://huggingface.co/docs/transformers/main/model_doc/blip' target='_blank'>BLIP docs</a> | <a href='https://huggingface.co/docs/transformers/main/model_doc/git' target='_blank'>GIT docs</a></p>"

# gr.inputs.* and the enable_queue argument were removed in Gradio 4; use
# gr.Image and Interface.queue() instead.
iface = gr.Interface(
    fn=generate_captions,
    inputs=gr.Image(type="pil"),
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
    article=article,
)
iface.queue()
# Bind to all interfaces so the app is reachable from outside a container.
iface.launch(server_name="0.0.0.0", server_port=7860)
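# The descriptions and launch settings suggest this script runs in Docker.
# A minimal Dockerfile sketch that matches them (assumed, not part of this
# file; "app.py" is a hypothetical name for this script):
#
#   FROM python:3.10-slim
#   WORKDIR /app
#   RUN pip install --no-cache-dir torch transformers gradio pillow
#   COPY app.py cat.jpg dog.jpg horse.jpg ./
#   EXPOSE 7860
#   CMD ["python", "app.py"]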

# The two alternative demos below (an ASCII-art generator and a shell-command
# viewer) are kept commented out for reference; only one interface should be
# launched per process.
'''

import gradio as gr
import numpy as np
from PIL import Image

def generate_ascii_art(image):
    try:
        # Convert the numpy array to a PIL Image
        img = Image.fromarray(np.uint8(image))

        # Resize the image to a smaller size for faster processing
        img = img.resize((80, 60))

        # Convert the image to grayscale
        img = img.convert("L")

        # Define ASCII characters to represent different intensity levels
        #ascii_chars = "@%#*+=-:. "
        ascii_chars = "$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/|()1{}[]?-_+~<>i!lI;:,\\^`'. "

        # Convert each pixel to ASCII character based on intensity
        ascii_image = ""
        # Scale 0-255 intensities across the full ramp; `pixel_value // 25`
        # only ever indexed the first 11 characters of the 69-character set.
        for pixel_value in img.getdata():
            ascii_image += ascii_chars[pixel_value * len(ascii_chars) // 256]

        # Reshape the ASCII string to match the resized image dimensions
        ascii_image = "\n".join([ascii_image[i:i + img.width] for i in range(0, len(ascii_image), img.width)])

        return ascii_image
    except Exception as e:
        return f"Error: {e}"

iface = gr.Interface(
    fn=generate_ascii_art,
    inputs="image",
    outputs="text",
    title="ASCII Art Generator",
    description="Upload an image and this app will turn it into ASCII art. A simple Gradio app served from Docker.",
    live=True
)

iface.launch(server_name="0.0.0.0", server_port=7860)



import gradio as gr
import subprocess

def run_command(command):
    # WARNING: shell=True executes arbitrary user input; never expose this
    # demo outside a trusted, sandboxed environment.
    try:
        result = subprocess.check_output(command, shell=True, text=True)
        return result
    except subprocess.CalledProcessError as e:
        return f"Error: {e}"

iface = gr.Interface(
    fn=run_command,
    inputs="text",
    outputs="text",
    #live=True,
    title="Command Output Viewer",
    description="Enter a command and view its output.",
    examples=[
    ["ls"],
    ["pwd"],
    ["echo 'Hello, Gradio!'"]]
)

iface.launch(server_name="0.0.0.0", server_port=7860)
'''