import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Set environment variables to mitigate CUDA memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Initialize model and tokenizer
model_name = "deepseek-ai/DeepSeek-V3-0324"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    # Reduce peak CPU RAM while weights are sharded onto devices
    low_cpu_mem_usage=True,
    # `use_flash_attention_2=True` is deprecated; select the attention
    # backend explicitly instead (requires the flash-attn package)
    attn_implementation="flash_attention_2",
    use_cache=True,
)

# Set model to evaluation mode
model.eval()


def generate_response(message, chat_history,
                      system_prompt="You are a helpful AI assistant.",
                      max_new_tokens=2048, temperature=0.7):
    try:
        # Format the conversation: system prompt, prior turns, new message
        history_text = ""
        if chat_history:
            for user_msg, assistant_msg in chat_history:
                history_text += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
        full_prompt = f"{system_prompt}\n\n{history_text}User: {message}\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,  # pass attention_mask along with input_ids
                max_new_tokens=max_new_tokens,  # cap generated tokens, not total length
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                top_p=0.9,
                repetition_penalty=1.1,
                use_cache=True,
                num_beams=1,
            )

        # Decode only the newly generated tokens, skipping the prompt
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        return f"An error occurred: {e}"


# Create the Gradio interface
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# DeepSeek V3 Chatbot")
    gr.Markdown("Welcome! This is a chatbot powered by the DeepSeek-V3-0324 model.")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Type your message here...",
                     placeholder="Hello! How can I help you today?")
    clear = gr.Button("Clear Conversation")

    # Temperature control
    temperature = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.7,
        step=0.1,
        label="Temperature",
        info="Higher = more creative, Lower = more focused",
    )

    def user(user_message, history):
        # Append the user turn with an empty assistant slot
        return "", history + [[user_message, None]]

    def bot(history, temp):
        # Fill in the assistant slot for the most recent user turn
        user_message = history[-1][0]
        bot_message = generate_response(user_message, history[:-1], temperature=temp)
        history[-1][1] = bot_message
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, temperature], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()
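
# ---------------------------------------------------------------------------
# Optional: a minimal sketch of building the prompt with the tokenizer's chat
# template instead of the hand-rolled "User:/Assistant:" format used in
# generate_response() above. This assumes the DeepSeek tokenizer ships a chat
# template (check `tokenizer.chat_template` before relying on it); the helper
# name below is illustrative, not part of the original app.
#
# def build_prompt_with_template(message, chat_history, system_prompt):
#     messages = [{"role": "system", "content": system_prompt}]
#     for user_msg, assistant_msg in chat_history:
#         messages.append({"role": "user", "content": user_msg})
#         messages.append({"role": "assistant", "content": assistant_msg})
#     messages.append({"role": "user", "content": message})
#     # add_generation_prompt appends the assistant header so the model
#     # continues as the assistant rather than echoing the user
#     return tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
# ---------------------------------------------------------------------------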
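
# ---------------------------------------------------------------------------
# Optional: a streaming sketch, assuming you want tokens to appear in the chat
# window incrementally. Gradio event handlers may be generators, and
# transformers provides TextIteratorStreamer; `bot` above could be replaced by
# something along these lines (an untested sketch, not the original app's
# behavior; `bot_streaming` is a hypothetical name):
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def bot_streaming(history, temp):
#     # ...build `inputs` from the conversation as in generate_response()...
#     streamer = TextIteratorStreamer(
#         tokenizer, skip_prompt=True, skip_special_tokens=True
#     )
#     # generate() blocks, so run it on a worker thread and consume the streamer
#     thread = Thread(
#         target=model.generate,
#         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=2048,
#                     temperature=temp, do_sample=True),
#     )
#     thread.start()
#     history[-1][1] = ""
#     for chunk in streamer:        # yields decoded text as it is generated
#         history[-1][1] += chunk
#         yield history             # Gradio re-renders the chat on each yield
# ---------------------------------------------------------------------------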