import torch
import gradio as gr
from transformers import pipeline

# Verify GPU
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Initialize pipeline WITHOUT 4-bit quantization.
# In bfloat16 the 14B model needs roughly 28 GB of VRAM for the weights alone
# (14B params x 2 bytes), so it fits comfortably on a 40 GB A100.
pipe = pipeline(
    "text-generation",
    model="agentica-org/DeepCoder-14B-Preview",
    device="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

def chat(message, history):
    # Rebuild the full conversation so the model sees earlier turns,
    # then append the new user message.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    response = pipe(
        messages,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # With chat-style input, generated_text holds the whole message list;
    # the assistant's reply is the last entry.
    return response[0]["generated_text"][-1]["content"]

gr.ChatInterface(
    chat,
    type="messages",
    title="DeepCoder-14B",
    description="Code generation with DeepCoder-14B-Preview (A100 GPU)"
).launch()
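
# Optional: for GPUs with less memory than an A100, the pipeline above could be
# swapped for a 4-bit quantized variant. A minimal sketch, assuming bitsandbytes
# is installed (pip install bitsandbytes); this is not part of the original
# script, only an illustration of the alternative the comment above alludes to:
#
#   from transformers import BitsAndBytesConfig
#
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   pipe = pipeline(
#       "text-generation",
#       model="agentica-org/DeepCoder-14B-Preview",
#       device_map="auto",  # let accelerate place the quantized weights
#       model_kwargs={"quantization_config": bnb_config},
#   )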