import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
# Cap the CUDA caching allocator's block size to mitigate fragmentation-related OOMs
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# Initialize model and tokenizer
model_name = "deepseek-ai/DeepSeek-V3-0324"
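# NOTE: DeepSeek-V3 is an extremely large MoE checkpoint; in fp16 its weights
# alone need far more memory than a single GPU provides, so device_map="auto"
# below shards the model across all visible devices (and can offload to CPU
# if accelerate is configured to allow it).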
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    # Stream weights shard-by-shard instead of materializing the full model in RAM
    low_cpu_mem_usage=True,
    # FlashAttention-2 kernels; requires the flash-attn package to be installed.
    # (use_flash_attention_2=True is the deprecated spelling of this option.)
    attn_implementation="flash_attention_2",
    use_cache=True
)
# Set model to evaluation mode
model.eval()
def generate_response(message, chat_history, system_prompt="You are a helpful AI assistant.",
                      max_new_tokens=2048, temperature=0.7):
    try:
        # Assemble the prompt: system prompt, then prior turns, then the new message
        history_text = ""
        if chat_history:
            for user_msg, assistant_msg in chat_history:
                history_text += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
        full_prompt = f"{system_prompt}\n\n{history_text}User: {message}\nAssistant:"
        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                # max_new_tokens bounds only the reply; max_length would also count
                # the prompt and can silently truncate long conversations
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                top_p=0.9,
                repetition_penalty=1.1,
                use_cache=True,
                num_beams=1,
            )
        # Decode only the newly generated tokens; slicing by prompt length is more
        # robust than splitting the full text on the literal string "Assistant:"
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    except Exception as e:
        return f"An error occurred: {str(e)}"
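# An alternative, hedged sketch (not wired into the UI below): chat checkpoints
# usually ship a chat template, in which case tokenizer.apply_chat_template
# builds the prompt in the model's expected format instead of the manual
# "User:/Assistant:" framing above. This assumes the DeepSeek tokenizer loaded
# here actually defines such a template.
def generate_response_with_template(message, chat_history, system_prompt="You are a helpful AI assistant.",
                                    max_new_tokens=2048, temperature=0.7):
    # Rebuild the conversation as role/content messages
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in (chat_history or []):
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Let the tokenizer render the template and append the assistant prefix
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Return only the newly generated portion
    return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True).strip()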
# Create the Gradio interface
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# DeepSeek V3 Chatbot")
    gr.Markdown("Welcome! This is a chatbot powered by the DeepSeek-V3-0324 model.")
    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Your message", placeholder="Type your message here...")
    clear = gr.Button("Clear Conversation")
    # Temperature control for sampling
    temperature = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.7,
        step=0.1,
        label="Temperature",
        info="Higher = more creative, lower = more focused"
    )
    def user(user_message, history):
        # Echo the user's message into the chat window and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history, temp):
        # Generate a reply to the latest message, using earlier turns as context
        user_message = history[-1][0]
        bot_message = generate_response(user_message, history[:-1], temperature=temp)
        history[-1][1] = bot_message
        return history
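    # Two-step wiring: `user` runs first (queue=False, so the message appears
    # immediately), then `bot` runs through the request queue to fill in the reply.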
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, temperature], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
demo.queue()
demo.launch()