Spaces:

junaidbaber
/

demo_lowcode_llm

Running

App Files Files Community

junaidbaber committed on Jan 29

Commit

fccfdf4

verified ·

1 Parent(s): ef628bc

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -57

app.py CHANGED Viewed

@@ -1,63 +1,137 @@
 from huggingface_hub import login
 import os
-token = os.environ.get("hf")
-login(token)
-import streamlit as st
-from transformers import pipeline
-import torch
-# Model ID
-MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-@st.cache_resource
-def load_pipeline():
-    try:
-        st.write("Loading the instruct pipeline...")
-        instruct_pipeline = pipeline(
-            "text-generation",
-            model=MODEL_ID,
-            model_kwargs={"torch_dtype": torch.bfloat16},
-            device_map="auto",
-        )
-        st.write("Pipeline successfully loaded.")
-        return instruct_pipeline
-    except Exception as e:
-        st.error(f"Error loading pipeline: {e}")
-        return None
-# Load the pipeline
-instruct_pipeline = load_pipeline()
-# Streamlit UI
-st.title("Instruction Chatbot")
-st.write("Chat with the instruction-tuned model!")
-if instruct_pipeline is None:
-    st.error("Pipeline failed to load. Please check the configuration.")
-else:
-    # Message-based interaction
-    system_message = st.text_area("System Message", value="You are a helpful assistant.", height=100)
-    user_input = st.text_input("User:", placeholder="Ask a question or provide an instruction...")
-    if st.button("Send"):
-        if user_input.strip():
-            try:
-                messages = [
-                    {"role": "system", "content": system_message},
-                    {"role": "user", "content": user_input},
-                ]
-                # Generate response
-                outputs = instruct_pipeline(
-                    messages,
-                    max_new_tokens=150,  # Limit response length
                 )
-                # Display the generated response
-                response = outputs[0]["generated_text"]
-                st.write(f"**Assistant:** {response}")
-            except Exception as e:
-                st.error(f"Error generating response: {e}")
-        else:
-            st.warning("Please enter a valid message.")

+import streamlit as st
 from huggingface_hub import login
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import torch
 import os
+def initialize_model():
+    """Load the 8-bit quantized Llama 3.1 instruct model and its tokenizer.
+
+    Reads the Hugging Face access token from the ``hf`` environment variable
+    and logs in with it when present, then loads
+    ``meta-llama/Meta-Llama-3.1-8B-Instruct`` with INT8 quantization and
+    ``device_map="auto"``.
+
+    Returns:
+        tuple: ``(model, tokenizer, device)`` where ``device`` is the CUDA
+        device when one is available and the CPU otherwise.
+    """
+    # Only log in when a token is actually configured. Calling login() without
+    # a token falls back to an interactive prompt, which cannot be answered in
+    # a hosted app; without the explicit login, any credentials already present
+    # in the environment are still picked up by from_pretrained().
+    token = os.environ.get("hf")
+    if token:
+        login(token)
+    # Define the model ID and device
+    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Configure INT8 quantization; modules that do not fit on the GPU may be
+    # offloaded to the CPU in fp32.
+    bnb_config = BitsAndBytesConfig(
+        load_in_8bit=True,
+        llm_int8_enable_fp32_cpu_offload=True,
+    )
+    # Load tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=bnb_config,
+        device_map="auto",
+    )
+    # generate() needs a pad token id; reuse EOS when the tokenizer has none.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    return model, tokenizer, device
+def format_conversation(conversation_history):
+    """Render past turns as alternating ``User:`` / ``Assistant:`` lines.
+
+    Each turn is a dict with ``"user"`` and ``"assistant"`` keys. Turns are
+    separated by newlines and surrounding whitespace is stripped from the
+    final text; an empty history yields an empty string.
+    """
+    rendered_turns = (
+        f"User: {entry['user']}\nAssistant: {entry['assistant']}"
+        for entry in conversation_history
+    )
+    return "\n".join(rendered_turns).strip()
+def generate_response(model, tokenizer, device, prompt, conversation_history):
+    """Generate the assistant's reply to ``prompt``.
+
+    Args:
+        model: Causal language model exposing ``generate``.
+        tokenizer: Tokenizer matching ``model``.
+        device: Device the tokenized inputs are moved to.
+        prompt: The user's latest message.
+        conversation_history: List of ``{"user", "assistant"}`` turns whose
+            last entry is the current (still unanswered) turn; it is excluded
+            from the context because ``prompt`` is appended explicitly.
+
+    Returns:
+        str: The newly generated reply text, stripped of surrounding
+        whitespace.
+    """
+    max_model_length = 2048
+    max_response_tokens = 200
+    # Format the earlier turns and finish with an explicit "Assistant:" cue so
+    # the model answers instead of continuing the user's message.
+    context = format_conversation(conversation_history[:-1])
+    if context:
+        full_prompt = f"{context}\nUser: {prompt}\nAssistant:"
+    else:
+        full_prompt = f"User: {prompt}\nAssistant:"
+    # Tokenize input
+    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True).to(device)
+    # Keep only the most recent tokens so the prompt plus the reply fit in the
+    # context budget. Slicing from the left preserves the newest message and
+    # guarantees max_new_tokens stays positive for arbitrarily long prompts.
+    max_prompt_tokens = max_model_length - max_response_tokens
+    input_ids = inputs["input_ids"][:, -max_prompt_tokens:]
+    attention_mask = inputs["attention_mask"][:, -max_prompt_tokens:]
+    input_length = input_ids.shape[1]
+    # Generate response
+    outputs = model.generate(
+        input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=max_response_tokens,
+        temperature=0.7,
+        top_p=0.9,
+        pad_token_id=tokenizer.pad_token_id,
+        do_sample=True,
+        min_length=20,
+        no_repeat_ngram_size=3,
+    )
+    # Decode only the tokens produced after the prompt; decoding the whole
+    # sequence and splitting on role labels echoes the prompt whenever the
+    # model does not emit an "Assistant: " marker itself.
+    new_tokens = outputs[0][input_length:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    # Drop any follow-up user turn the model may have invented.
+    model_response = response.split("\nUser:")[0].strip()
+    return model_response
+@st.cache_resource(show_spinner="Loading the model... This might take a minute...")
+def _load_model_once():
+    """Load the model, tokenizer and device a single time per server process.
+
+    ``st.cache_resource`` shares the returned objects across reruns and across
+    browser sessions, so the weights are not loaded again for every visitor
+    (which storing them in ``st.session_state`` would do).
+    """
+    return initialize_model()
+def main():
+    """Run the Streamlit chat UI: show the history, answer new prompts, trim context."""
+    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
+    st.title("Chat with LLM 🤖")
+    # Initialize session state for chat history
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    # Fetch the shared model (loaded only on the first call, cached afterwards)
+    model, tokenizer, device = _load_model_once()
+    # Display chat messages
+    for message in st.session_state.chat_history:
+        with st.chat_message("user"):
+            st.write(message["user"])
+        with st.chat_message("assistant"):
+            st.write(message["assistant"])
+    # Chat input
+    if prompt := st.chat_input("What would you like to know?"):
+        # Display user message
+        with st.chat_message("user"):
+            st.write(prompt)
+        # Generate and display assistant response
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                # generate_response expects the current turn to be the last
+                # history entry, so add a placeholder before calling it.
+                current_turn = {"user": prompt, "assistant": ""}
+                st.session_state.chat_history.append(current_turn)
+                try:
+                    response = generate_response(
+                        model,
+                        tokenizer,
+                        device,
+                        prompt,
+                        st.session_state.chat_history,
+                    )
+                except Exception:
+                    # Do not leave an unanswered placeholder turn in the
+                    # history when generation fails; the error still propagates.
+                    st.session_state.chat_history.pop()
+                    raise
+                st.write(response)
+                st.session_state.chat_history[-1]["assistant"] = response
+        # Manage context window: keep only the five most recent turns
+        if len(st.session_state.chat_history) > 5:
+            st.session_state.chat_history = st.session_state.chat_history[-5:]
+    # Add a clear chat button
+    if st.sidebar.button("Clear Chat"):
+        st.session_state.chat_history = []
+        st.rerun()
+# Start the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()