import gradio as gr
import spaces # Import spaces module for ZeroGPU
from huggingface_hub import login
import os
from json_processor import JsonProcessor
import json
# 1) Read Secrets
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")
# 2) Login to ensure all subsequent from_pretrained calls have proper permissions
login(hf_token)
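# Note: meta-llama/Llama-3.1-8B is a gated repository, so this login is what allows the
# from_pretrained() calls below to download the base weights (the token's account must
# have accepted the Llama 3.1 license on the Hub).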
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import warnings
warnings.filterwarnings("ignore")
# Model configuration
MODEL_NAME = "meta-llama/Llama-3.1-8B"
LORA_MODEL = "YongdongWang/llama3.1-8b-lora-qlora-dart-llm"
# Global variables to store model and tokenizer
model = None
tokenizer = None
model_loaded = False
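# Loading strategy: the tokenizer is loaded eagerly on CPU at startup (see the call to
# load_model_and_tokenizer() near the bottom of this file), while the quantized model is
# loaded lazily inside the first @spaces.GPU-decorated call, so no GPU is held before the
# first request arrives.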
def load_model_and_tokenizer():
"""Load tokenizer - executed on CPU"""
global tokenizer, model_loaded
if model_loaded:
return
print("πŸ”„ Loading tokenizer...")
# Load tokenizer (on CPU)
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME,
use_fast=False,
trust_remote_code=True
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model_loaded = True
print("βœ… Tokenizer loaded successfully!")
@spaces.GPU(duration=60)  # Request a ZeroGPU allocation for model loading (called lazily on first use)
def load_model_on_gpu():
"""Load model on GPU"""
global model
if model is not None:
return model
print("πŸ”„ Loading model on GPU...")
try:
# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
quantization_config=bnb_config,
device_map="auto",
torch_dtype=torch.float16,
trust_remote_code=True,
low_cpu_mem_usage=True,
use_safetensors=True
)
# Load LoRA adapter
model = PeftModel.from_pretrained(
base_model,
LORA_MODEL,
torch_dtype=torch.float16,
use_safetensors=True
)
model.eval()
print("βœ… Model loaded on GPU successfully!")
return model
except Exception as load_error:
print(f"❌ Model loading failed: {load_error}")
raise load_error
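# Rough sizing note (approximate figures): an 8B-parameter model quantized to 4-bit NF4
# needs on the order of 5-6 GB of GPU memory for weights, plus the LoRA adapter and
# KV-cache/activation overhead, which fits comfortably within a single ZeroGPU allocation.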
def process_json_in_response(response):
"""Process and format JSON content in the response"""
try:
# Check if response contains JSON-like content
if '{' in response and '}' in response:
processor = JsonProcessor()
# Try to process the response for JSON content
processed_json = processor.process_response(response)
if processed_json:
# Format the JSON nicely
formatted_json = json.dumps(processed_json, indent=2, ensure_ascii=False)
# Replace the JSON part in the response
import re
json_pattern = r'\{.*\}'
match = re.search(json_pattern, response, re.DOTALL)
if match:
# Replace the matched JSON with the formatted version
response = response.replace(match.group(), formatted_json)
return response
except Exception:
# If processing fails, return original response
return response
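# Note: the greedy pattern r'\{.*\}' used above spans from the first '{' to the last '}'
# in the response, which can over-match when the model emits several JSON objects or
# trailing prose. The helper below is a minimal, illustrative alternative (a hypothetical
# `extract_first_json_block`, not part of json_processor and not wired into the app): it
# returns the first brace-balanced block that parses as JSON. It does not handle braces
# inside JSON string values, which is acceptable for a sketch.
def extract_first_json_block(text):
    """Return the first balanced {...} block in `text` parsed as JSON, or None."""
    start = text.find('{')
    while start != -1:
        depth = 0
        for i in range(start, len(text)):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(text[start:i + 1])
                    except json.JSONDecodeError:
                        break  # balanced but not valid JSON; try the next '{'
        start = text.find('{', start + 1)
    return None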
@spaces.GPU(duration=60) # GPU inference
def generate_response_gpu(prompt, max_tokens=512):
"""Generate response - executed on GPU"""
global model
# Ensure tokenizer is loaded
if tokenizer is None:
load_model_and_tokenizer()
# Ensure model is loaded on GPU
if model is None:
model = load_model_on_gpu()
if model is None:
return "❌ Model failed to load. Please check the Space logs."
try:
formatted_prompt = (
"### Instruction:\n"
f"{prompt.strip()}\n\n"
"### Response:\n"
)
# Encode input
inputs = tokenizer(
formatted_prompt,
return_tensors="pt",
truncation=True,
max_length=2048
).to(model.device)
# Generate response
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
temperature=None,
top_p=None,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
repetition_penalty=1.1,
no_repeat_ngram_size=3
)
# Decode output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract generated part
if "### Response:" in response:
response = response.split("### Response:")[-1].strip()
elif len(response) > len(formatted_prompt):
response = response[len(formatted_prompt):].strip()
# Process JSON if present in response
response = process_json_in_response(response)
return response if response else "❌ No response generated. Please try again with a different prompt."
except Exception as generation_error:
return f"❌ Generation Error: {str(generation_error)}"
def chat_interface(message, history, max_tokens):
"""Chat interface - runs on CPU, calls GPU functions"""
if not message.strip():
return history, ""
# Initialize tokenizer (if needed)
if tokenizer is None:
load_model_and_tokenizer()
try:
# Call GPU function to generate response
response = generate_response_gpu(message, max_tokens)
history.append((message, response))
return history, ""
except Exception as chat_error:
error_msg = f"❌ Chat Error: {str(chat_error)}"
history.append((message, error_msg))
return history, ""
# Load tokenizer at startup
load_model_and_tokenizer()
# Create Gradio application
with gr.Blocks(
title="Robot Task Planning - Llama 3.1 8B",
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1200px;
margin: auto;
}
"""
) as app:
gr.Markdown("""
    # 🤖 Llama 3.1 8B - Robot Task Planning
    This is a fine-tuned version of Meta's Llama 3.1 8B model, specialized for **robot task planning** using the QLoRA technique.
**Capabilities**: Convert natural language robot commands into structured task sequences for excavators, dump trucks, and other construction robots.
**Model**: [YongdongWang/llama3.1-8b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama3.1-8b-lora-qlora-dart-llm)
    ⚡ **Using ZeroGPU**: This Space uses dynamic GPU allocation (NVIDIA H200). The first generation may take longer while a GPU is allocated.
""")
with gr.Row():
with gr.Column(scale=3):
chatbot = gr.Chatbot(
label="Task Planning Results",
height=500,
show_label=True,
container=True,
bubble_full_width=False,
show_copy_button=True
)
msg = gr.Textbox(
label="Robot Command",
placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
lines=2,
max_lines=5,
show_label=True,
container=True
)
with gr.Row():
                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
with gr.Column(scale=1):
gr.Markdown("### βš™οΈ Generation Settings")
max_tokens = gr.Slider(
minimum=50,
maximum=5000,
value=512,
step=10,
label="Max Tokens",
info="Maximum number of tokens to generate"
)
gr.Markdown("""
            ### 📊 Model Status
- **Hardware**: ZeroGPU (Dynamic Nvidia H200)
- **Status**: Ready
- **Note**: First generation allocates GPU resources
""")
# Example conversations
gr.Examples(
examples=[
"Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
"Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
"Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
"Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
"Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
"Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
],
inputs=msg,
label="πŸ’‘ Example Operator Commands"
)
# Event handling
msg.submit(
chat_interface,
inputs=[msg, chatbot, max_tokens],
outputs=[chatbot, msg]
)
send_btn.click(
chat_interface,
inputs=[msg, chatbot, max_tokens],
outputs=[chatbot, msg]
)
clear_btn.click(
lambda: ([], ""),
outputs=[chatbot, msg]
)
if __name__ == "__main__":
app.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
show_error=True
)
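# To run outside a Space: export HUGGINGFACE_TOKEN before launching. The @spaces.GPU
# decorator is expected to act as a no-op when not running on ZeroGPU hardware, so the
# model should load onto whatever local device device_map="auto" selects.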