import torch
import gradio as gr
from transformers import pipeline

# Verify GPU
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Initialize pipeline WITHOUT 4-bit quantization.
# In bfloat16 the 14B model needs roughly 28 GB of VRAM for the weights alone
# (14B params x 2 bytes), so it fits comfortably on a 40 GB A100.
pipe = pipeline(
    "text-generation",
    model="agentica-org/DeepCoder-14B-Preview",
    device="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

def chat(message, history):
    # Rebuild the full conversation so the model sees earlier turns,
    # then append the new user message.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    response = pipe(
        messages,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # With chat-style input, generated_text holds the whole message list;
    # the assistant's reply is the last entry.
    return response[0]["generated_text"][-1]["content"]

gr.ChatInterface(
    chat,
    type="messages",
    title="DeepCoder-14B",
    description="Code generation with DeepCoder-14B-Preview (A100 GPU)"
).launch()
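
# Optional: for GPUs with less memory than an A100, the pipeline above could be
# swapped for a 4-bit quantized variant. A minimal sketch, assuming bitsandbytes
# is installed (pip install bitsandbytes); this is not part of the original
# script, only an illustration of the alternative the comment above alludes to:
#
#   from transformers import BitsAndBytesConfig
#
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   pipe = pipeline(
#       "text-generation",
#       model="agentica-org/DeepCoder-14B-Preview",
#       device_map="auto",  # let accelerate place the quantized weights
#       model_kwargs={"quantization_config": bnb_config},
#   )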