Spaces:

junaidbaber
/

demo_lowcode_llm

Running

App Files Community

junaidbaber committed on Jan 29

Commit

03b1321

verified ·

1 Parent(s): 0d5774d

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -58

app.py CHANGED Viewed

@@ -12,41 +12,22 @@ def initialize_model():
     if token:
         login(token)
-    # Use a smaller model that's more CPU-friendly
-    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Much smaller model
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    # Configure 4-bit quantization for CPU
     try:
-        # First try with bitsandbytes 4-bit quantization
-        from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-        compute_dtype = getattr(torch, "float16")
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=compute_dtype,
-            bnb_4bit_use_double_quant=False,
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config,
-            device_map="auto",
-            trust_remote_code=True
-        )
-    except:
-        # Fallback to CPU without quantization
-        print("Falling back to CPU without quantization")
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map="cpu",
             trust_remote_code=True,
             low_cpu_mem_usage=True
         )
     # Ensure padding token is defined
     if tokenizer.pad_token is None:
@@ -54,54 +35,70 @@ def initialize_model():
     return model, tokenizer
-def format_conversation(conversation_history):
-    """Format the conversation history into a single string."""
-    formatted = ""
     for turn in conversation_history:
-        formatted += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"
-    return formatted.strip()
 def generate_response(model, tokenizer, prompt, conversation_history):
     """Generate model response"""
-    # Format the entire conversation context
-    context = format_conversation(conversation_history[:-1])
-    if context:
-        full_prompt = f"{context}\nHuman: {prompt}"
-    else:
-        full_prompt = f"Human: {prompt}"
-    # Tokenize input
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
-    # Move inputs to the same device as the model
-    device = next(model.parameters()).device
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Calculate max new tokens
-    input_length = inputs["input_ids"].shape[1]
-    max_model_length = 1024  # Reduced context window for memory efficiency
-    max_new_tokens = min(150, max_model_length - input_length)
-    try:
-        # Generate response with lower temperature for faster generation
         outputs = model.generate(
             inputs["input_ids"],
             attention_mask=inputs["attention_mask"],
             max_new_tokens=max_new_tokens,
-            temperature=0.5,  # Lower temperature for faster, more focused responses
             top_p=0.9,
             pad_token_id=tokenizer.pad_token_id,
             do_sample=True,
-            min_length=10,  # Reduced minimum length
-            no_repeat_ngram_size=3
         )
-        # Decode response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response_parts = response.split("Human: ")
-        model_response = response_parts[-1].split("Assistant: ")[-1].strip()
-        return model_response
     except RuntimeError as e:
         if "out of memory" in str(e):
             torch.cuda.empty_cache()
@@ -128,7 +125,7 @@ def main():
         </style>
     """, unsafe_allow_html=True)
-    st.title("Welcome to LowCode No Code Demo")
     # Initialize session state for chat history
     if "chat_history" not in st.session_state:
@@ -190,7 +187,7 @@ def main():
         st.markdown("""
         ### Model Info
         - Using TinyLlama 1.1B Chat
-        - Optimized for CPU usage
         - Context window: 1024 tokens
         """)

     if token:
         login(token)
+    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     try:
+        # Try with regular CPU mode first (simpler and more reliable)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map="cpu",
             trust_remote_code=True,
             low_cpu_mem_usage=True
         )
+    except Exception as e:
+        print(f"Error loading model: {str(e)}")
+        raise e
     # Ensure padding token is defined
     if tokenizer.pad_token is None:
     return model, tokenizer
def format_prompt(user_input, conversation_history=None):
    """Build a TinyLlama-style chat prompt from past turns and a new message.

    Args:
        user_input: Text of the current user message.
        conversation_history: Optional iterable of past turns, each a mapping
            with the keys ``"user"`` and ``"assistant"``. ``None`` (the
            default) or an empty sequence means there is no history.

    Returns:
        A single string made of a fixed system message, every past
        user/assistant pair, the current user message, and a trailing
        ``<|assistant|>`` tag that leaves the assistant turn open for the
        model to complete.
    """
    # A None default replaces the shared mutable ``[]`` default; callers that
    # pass a list (or nothing at all) get exactly the same prompt as before.
    past_turns = conversation_history if conversation_history is not None else ()

    # NOTE(review): segments are concatenated without newlines after the role
    # tags; confirm this matches the chat template shipped with the tokenizer.
    segments = ["<|system|>You are a helpful AI assistant.</s>"]
    for turn in past_turns:
        segments.append(f"<|user|>{turn['user']}</s>")
        segments.append(f"<|assistant|>{turn['assistant']}</s>")
    segments.append(f"<|user|>{user_input}</s>")
    # Open assistant tag: generation continues from here.
    segments.append("<|assistant|>")
    return "".join(segments)
 def generate_response(model, tokenizer, prompt, conversation_history):
     """Generate model response"""
+    try:
+        # Format prompt using TinyLlama's chat template
+        formatted_prompt = format_prompt(prompt, conversation_history[:-1])
+        # Tokenize input
+        inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)
+        # Move inputs to the same device as the model
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Calculate max new tokens
+        input_length = inputs["input_ids"].shape[1]
+        max_model_length = 1024
+        max_new_tokens = min(150, max_model_length - input_length)
+        # Generate response
         outputs = model.generate(
             inputs["input_ids"],
             attention_mask=inputs["attention_mask"],
             max_new_tokens=max_new_tokens,
+            temperature=0.7,
             top_p=0.9,
             pad_token_id=tokenizer.pad_token_id,
             do_sample=True,
+            min_length=10,
+            no_repeat_ngram_size=3,
+            eos_token_id=tokenizer.encode("</s>")[0]  # Set end token
         )
+        # Decode response and extract only the assistant's message
+        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
+        # Extract only the last assistant response
+        assistant_response = full_response.split("<|assistant|>")[-1].split("</s>")[0].strip()
+        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response."
     except RuntimeError as e:
         if "out of memory" in str(e):
             torch.cuda.empty_cache()
         </style>
     """, unsafe_allow_html=True)
+    st.title("Chat with TinyLlama 🤖")
     # Initialize session state for chat history
     if "chat_history" not in st.session_state:
         st.markdown("""
         ### Model Info
         - Using TinyLlama 1.1B Chat
+        - CPU optimized
         - Context window: 1024 tokens
         """)