import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import json
import os

# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # uses more than 16GB
# model_name = "meta-llama/Llama-2-7b-chat-hf" # only part of paid model
# model_name = "mistralai/Mistral-7B-Instruct-v0.1" # try this with quantisation to reduce memory usage; needs a GPU to run
model_name = "Qwen/Qwen2-1.5B-Instruct"
# google/gemma-2-9b-it # 18GB
# meta-llama/Meta-Llama-3-8B # 16GB
# Qwen/Qwen2-7B-Instruct # 15GB # wouldn't run due to Memory Limit Exceeded

# TODO: try the following models:
# mistralai/Mistral-7B-Instruct-v0.3
# google/flan-t5 (encoder-decoder model: needs AutoModelForSeq2SeqLM rather than AutoModelForCausalLM)


# 4-bit bitsandbytes quantisation settings with float16 compute.
# NOTE(review): this config is built but is not passed to from_pretrained()
# below, so the model is currently loaded unquantised — confirm this is intended.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Authenticate using the token
# HF_TOKEN is read from the environment; os.getenv returns None when it is unset.
token = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# Quantised alternative (presumably requires a CUDA GPU — TODO confirm):
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
model = AutoModelForCausalLM.from_pretrained(model_name, token=token)

def generate_response(prompt):
    """Generate a completion for *prompt* with the module-level model.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The newly generated text only (at most 200 new tokens), decoded with
        special tokens removed. The prompt itself is not included.
    """
    # Tokenise and place the tensors on the same device as the model so that
    # generation also works when the model has been loaded onto a GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200)
    # For decoder-only models generate() returns the prompt tokens followed by
    # the continuation; drop the prompt so callers receive just the response.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

def _extract_json(response):
    """Return the first JSON value embedded in *response*.

    Raises:
        ValueError: If no JSON value can be decoded from the text.
    """
    candidate = response.strip()
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass
    # The model may surround the JSON with prose or code fences, so try to
    # decode a value starting at each opening brace/bracket in turn.
    decoder = json.JSONDecoder()
    for start, char in enumerate(candidate):
        if char not in "{[":
            continue
        try:
            value, _ = decoder.raw_decode(candidate, start)
        except json.JSONDecodeError:
            continue
        return value
    raise ValueError("no JSON value found in response")


def process_text(text):
    """Extract patient information from *text* and return it as pretty JSON.

    Args:
        text: Free text entered by the user in the Gradio interface.

    Returns:
        A JSON string indented by two spaces when the model output contains
        parseable JSON, otherwise a human-readable error message.
    """
    if not text or not text.strip():
        # Nothing to extract from; avoid a pointless generation round trip.
        return "Error: No input text provided"

    prompt = (
        "Extract the patient information from the text below and reply with "
        "a single JSON object only.\n\n"
        f"Text: {text.strip()}\n\nJSON:"
    )
    response = generate_response(prompt)
    # Some generation paths echo the prompt before the continuation; strip it
    # so JSON-looking content in the user's text is not mistaken for output.
    if response.startswith(prompt):
        response = response[len(prompt):]

    try:
        json_response = _extract_json(response)
    except ValueError:
        return "Error: Could not parse JSON from response"
    return json.dumps(json_response, indent=2)

# Gradio UI: one text input and one text output, backed by process_text.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Patient Information Extractor"
)

# Starts the Gradio server as soon as this module is executed; there is no
# `if __name__ == "__main__"` guard, so importing the file also launches the app.
iface.launch()