import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, TextStreamer

# Download the model's custom code files first, so the local imports below resolve
hf_hub_download(repo_id="OEvortex/HelpingAI-Vision", filename="configuration_llava.py", local_dir="./", force_download=True)
hf_hub_download(repo_id="OEvortex/HelpingAI-Vision", filename="configuration_phi.py", local_dir="./", force_download=True)
hf_hub_download(repo_id="OEvortex/HelpingAI-Vision", filename="modeling_llava.py", local_dir="./", force_download=True)
hf_hub_download(repo_id="OEvortex/HelpingAI-Vision", filename="modeling_phi.py", local_dir="./", force_download=True)
hf_hub_download(repo_id="OEvortex/HelpingAI-Vision", filename="processing_llava.py", local_dir="./", force_download=True)

from processing_llava import LlavaProcessor, OpenCLIPImageProcessor
from modeling_llava import LlavaForConditionalGeneration

# Load the model in half precision and move it to the GPU
model = LlavaForConditionalGeneration.from_pretrained("OEvortex/HelpingAI-Vision", torch_dtype=torch.float16)
model = model.to("cuda")

# Create the tokenizer, image processor, and combined multimodal processor
tokenizer = AutoTokenizer.from_pretrained("OEvortex/HelpingAI-Vision")
image_processor = OpenCLIPImageProcessor(model.config.preprocess_config)
processor = LlavaProcessor(image_processor, tokenizer)

# Generate a text response for an image and a prompt
def generate_text(image, initial_text):
    with torch.inference_mode():
        # Preprocess the inputs and move the tensors to the model's device
        inputs = processor(initial_text, image, model, return_tensors='pt')
        inputs['input_ids'] = inputs['input_ids'].to(model.device)
        inputs['attention_mask'] = inputs['attention_mask'].to(model.device)

        # Stream tokens to stdout as they are generated
        streamer = TextStreamer(tokenizer)

        # Sample a response
        output = model.generate(**inputs, max_new_tokens=200, do_sample=True, top_p=0.9,
                                temperature=1.2, eos_token_id=tokenizer.eos_token_id,
                                streamer=streamer)

    # Return the generated text, stripping the special tokens
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Build the Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload an image")
            text_input = gr.Textbox(label="Enter your prompt")
        with gr.Column():
            output_text = gr.Textbox(label="Generated text")
    generate_button = gr.Button("Generate text")
    generate_button.click(generate_text, inputs=[image_input, text_input], outputs=output_text)

# Launch the interface
demo.launch()
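
# A minimal usage sketch for calling generate_text directly, without the Gradio
# UI. "example.jpg" is a hypothetical placeholder path, not a file shipped with
# the model, and demo.launch() above must be commented out first, since it
# blocks until the server is closed:
#
# from PIL import Image
# img = Image.open("example.jpg").convert("RGB")
# print(generate_text(img, "Describe this image in detail."))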