import streamlit as st
from transformers import pipeline, AutoTokenizer

def initialize_model():
    """Initialize a small and fast model for CPU"""
    # Use a tiny model optimized for CPU inference (125M parameters); the larger
    # "GEB-AGI/geb-1.3b" checkpoint could be swapped in here at the cost of a
    # slower load and more memory.
    model_id = "facebook/opt-125m"
    
    try:
        # Initialize the pipeline directly - more efficient than loading model separately
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True}
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        
        return pipe, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
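
# Optional alternative (a sketch; main() below does not call it): Streamlit's
# st.cache_resource decorator can keep the loaded pipeline alive across reruns
# in place of the manual st.session_state bookkeeping in main(). The wrapper
# name load_model_cached is illustrative.
@st.cache_resource
def load_model_cached():
    """Load the pipeline and tokenizer once per process and reuse them."""
    return initialize_model()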

def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate model response"""
    try:
        # Format conversation context from completed turns only; the current
        # turn is appended (with an empty assistant reply) before this function
        # is called, so skip it to avoid repeating the prompt.
        context = ""
        for turn in conversation_history[:-1][-3:]:  # last 3 completed turns
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"
        
        # Create the full prompt
        full_prompt = f"{context}Human: {prompt}\nAssistant:"
        
        # Generate response with conservative parameters
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # Limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        )[0]['generated_text']
        
        # Extract only the assistant's reply. The pipeline echoes the prompt,
        # and the model may keep writing further "Human:" turns, so strip the
        # prompt and cut the continuation at the first such turn.
        try:
            if response.startswith(full_prompt):
                continuation = response[len(full_prompt):]
            else:
                continuation = response.split("Assistant:")[-1]
            assistant_response = continuation.split("Human:")[0].strip()
            if not assistant_response:
                return "I apologize, but I couldn't generate a proper response."
            return assistant_response
        except Exception:
            return response.split(prompt)[-1].strip()

    except Exception as e:
        return f"An error occurred: {str(e)}"

def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="πŸ€–")
    
    st.title("πŸ’¬ Quick Chat Assistant")

    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize model (only once)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {str(e)}")
                return

    # Display chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])

    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                current_turn = {"user": prompt, "assistant": ""}
                st.session_state.chat_history.append(current_turn)
                
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history
                )
                
                st.write(response)
                st.session_state.chat_history[-1]["assistant"] = response

        # Keep only last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]

    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()
        
        st.markdown("---")
        st.markdown("""
        ### Chat Info
        - Using OPT-125M model
        - Optimized for quick responses
        - Best for short conversations
        """)

if __name__ == "__main__":
    main()
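
# To run this app locally (assuming the file is saved as app.py and the
# streamlit, transformers, and torch packages are installed):
#
#   streamlit run app.py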