qnguyen3/nanoLLaVA · Gradio Demo addition to repo

I've been trying to port over the moondream gradio demo (apache 2.0), but I've been getting this error after trying to add streaming, absolutely befuddled since the input hasn't changed.

import torch
import transformers
import gradio as gr
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import warnings

# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')


# set device
torch.set_default_device('cpu')  # or 'cpu'

# create model
model = AutoModelForCausalLM.from_pretrained(
    'qnguyen3/nanoLLaVA',
    torch_dtype=torch.float16,
    device_map='cpu',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA',
    trust_remote_code=True)

def answer_question(img, prompt):
    # nanoLLaVA prompt tokenization stuff
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    print(text)

    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    image_tensor = model.process_images([img], model.config).to(dtype=model.dtype)
    
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs={
            "input_ids": input_ids,
            "images": image_tensor,
            "max_new_tokens": 2048,
            "use_cache": True,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        print(new_text)
        text = tokenizer.decode(new_text[input_ids.shape[1]:], skip_special_tokens=True).strip()
        buffer += text
        yield buffer

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # NanoLLaVA
        ### A tiny vision language model. [GitHub](https://huggingface.co/qnguyen3/nanoLLaVA)
        """
    )
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompt", placeholder="Type here...", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        output = gr.TextArea(label="Response")
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)

demo.queue().launch(debug=True)

Any help would be appreciated.
I'm also unable to run on cuda, since the image features somehow still are on cpu even after explicit image_tensor = image_tensor.to(device), so the script uses cpu.