Daemontatox committed
Commit 0b72fd3 · verified · 1 Parent(s): 9e07bfc

Update app.py

Files changed (1):
  1. app.py +139 -102
app.py CHANGED
@@ -1,14 +1,10 @@
 import subprocess
 
-
-
 subprocess.run(
     'pip install flash-attn --no-build-isolation',
     env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
     shell=True
 )
-
-
 import os
 import re
 import time
@@ -59,11 +55,7 @@ Always organize your responses using these tags for clear reasoning structure."""
 
 # UI Configuration
 TITLE = "<h1><center>AI Reasoning Assistant</center></h1>"
-PLACEHOLDER = """
-<center>
-<p>Ask me anything! I'll think through it step by step.</p>
-</center>
-"""
+PLACEHOLDER = "Ask me anything! I'll think through it step by step."
 
 CSS = """
 .duplicate-button {
@@ -99,23 +91,24 @@ h3 {
     color: #0066cc;
     font-weight: bold;
 }
+.chat-area {
+    height: 500px !important;
+    overflow-y: auto !important;
+}
 """
 
 def initialize_model():
     """Initialize the model with appropriate configurations"""
-    # Quantization configuration
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
         bnb_4bit_use_double_quant=True
     )
 
-    # Initialize tokenizer
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
-    # Initialize model
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float16,
@@ -128,7 +121,6 @@ def initialize_model():
 
 def format_text(text):
     """Format text with proper spacing and tag highlighting"""
-    # Add newlines around tags
     tag_patterns = [
         (r'<Thinking>', '\n<Thinking>\n'),
         (r'</Thinking>', '\n</Thinking>\n'),
@@ -144,15 +136,24 @@ def format_text(text):
     for pattern, replacement in tag_patterns:
         formatted = re.sub(pattern, replacement, formatted)
 
-    # Remove extra blank lines
     formatted = '\n'.join(line for line in formatted.split('\n') if line.strip())
 
     return formatted
 
+def format_chat_history(history):
+    """Format chat history for display in text area"""
+    formatted = []
+    for user_msg, assistant_msg in history:
+        formatted.append(f"User: {user_msg}")
+        if assistant_msg:
+            formatted.append(f"Assistant: {assistant_msg}")
+    return "\n\n".join(formatted)
+
 @spaces.GPU()
-def stream_chat(
+def chat_response(
     message: str,
     history: list,
+    chat_display: str,
     system_prompt: str,
     temperature: float = 0.2,
     max_new_tokens: int = 8192,
@@ -160,30 +161,25 @@ def stream_chat(
     top_k: int = 20,
     penalty: float = 1.2,
 ):
-    """Generate streaming chat responses with proper tag handling"""
-    # Format conversation context
+    """Generate chat responses with proper tag handling"""
     conversation = [
         {"role": "system", "content": system_prompt}
     ]
 
-    # Add conversation history
     for prompt, answer in history:
         conversation.extend([
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer}
         ])
 
-    # Add current message
     conversation.append({"role": "user", "content": message})
 
-    # Prepare input for model
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to(model.device)
 
-    # Configure streamer
     streamer = TextIteratorStreamer(
         tokenizer,
         timeout=60.0,
@@ -191,7 +187,6 @@ def stream_chat(
         skip_special_tokens=True
     )
 
-    # Set generation parameters
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
@@ -203,7 +198,6 @@ def stream_chat(
         streamer=streamer,
     )
 
-    # Generate and stream response
     buffer = ""
     current_line = ""
 
@@ -211,6 +205,8 @@ def stream_chat(
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
 
+    history = history + [[message, ""]]
+
     for new_text in streamer:
         buffer += new_text
         current_line += new_text
@@ -219,35 +215,23 @@ def stream_chat(
             lines = current_line.split('\n')
             current_line = lines[-1]
             formatted_buffer = format_text(buffer)
-            yield formatted_buffer
+            history[-1][1] = formatted_buffer
+            chat_display = format_chat_history(history)
+            yield history, chat_display
         else:
-            yield buffer
+            history[-1][1] = buffer
+            chat_display = format_chat_history(history)
+            yield history, chat_display
 
-def create_examples():
-    """Create example queries that demonstrate the system's capabilities"""
-    return [
-        ["Explain how neural networks learn through backpropagation."],
-        ["What are the key differences between classical and quantum computing?"],
-        ["Analyze the environmental impact of renewable energy sources."],
-        ["How does the human memory system work?"],
-        ["Explain the concept of ethical AI and its importance."]
-    ]
+def process_example(example: str) -> tuple:
+    """Process example query and return empty history and updated display"""
+    return [], f"User: {example}\n\n"
 
 def main():
     """Main function to set up and launch the Gradio interface"""
-    # Initialize model and tokenizer
     global model, tokenizer
     model, tokenizer = initialize_model()
 
-    # Create chatbot interface
-    chatbot = gr.Chatbot(
-        height=600,
-        placeholder=PLACEHOLDER,
-        bubble_full_width=False,
-        show_copy_button=True
-    )
-
-    # Create interface
     with gr.Blocks(css=CSS, theme="soft") as demo:
         gr.HTML(TITLE)
         gr.DuplicateButton(
@@ -255,66 +239,119 @@ def main():
             elem_classes="duplicate-button"
         )
 
-        gr.ChatInterface(
-            fn=stream_chat,
-            chatbot=chatbot,
-            fill_height=True,
-            additional_inputs_accordion=gr.Accordion(
-                label="⚙️ Advanced Settings",
-                open=False,
-                render=False
-            ),
-            additional_inputs=[
-                gr.Textbox(
-                    value=DEFAULT_SYSTEM_PROMPT,
-                    label="System Prompt",
-                    lines=5,
-                    render=False,
-                ),
-                gr.Slider(
-                    minimum=0,
-                    maximum=1,
-                    step=0.1,
-                    value=0.2,
-                    label="Temperature",
-                    render=False,
-                ),
-                gr.Slider(
-                    minimum=128,
-                    maximum=32000,
-                    step=128,
-                    value=8192,
-                    label="Max Tokens",
-                    render=False,
-                ),
-                gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    step=0.1,
-                    value=1.0,
-                    label="Top-p",
-                    render=False,
-                ),
-                gr.Slider(
-                    minimum=1,
-                    maximum=100,
-                    step=1,
-                    value=20,
-                    label="Top-k",
-                    render=False,
-                ),
-                gr.Slider(
-                    minimum=1.0,
-                    maximum=2.0,
-                    step=0.1,
-                    value=1.2,
-                    label="Repetition Penalty",
-                    render=False,
-                ),
+        with gr.Row():
+            with gr.Column():
+                chat_history = gr.State([])
+                chat_display = gr.TextArea(
+                    value="",
+                    label="Chat History",
+                    interactive=False,
+                    elem_classes=["chat-area"],
+                )
+
+                message = gr.TextArea(
+                    placeholder=PLACEHOLDER,
+                    label="Your message",
+                    lines=3
+                )
+
+                with gr.Row():
+                    submit = gr.Button("Send")
+                    clear = gr.Button("Clear")
+
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
+                    system_prompt = gr.TextArea(
+                        value=DEFAULT_SYSTEM_PROMPT,
+                        label="System Prompt",
+                        lines=5,
+                    )
+                    temperature = gr.Slider(
+                        minimum=0,
+                        maximum=1,
+                        step=0.1,
+                        value=0.2,
+                        label="Temperature",
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=128,
+                        maximum=32000,
+                        step=128,
+                        value=8192,
+                        label="Max Tokens",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        step=0.1,
+                        value=1.0,
+                        label="Top-p",
+                    )
+                    top_k = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        step=1,
+                        value=20,
+                        label="Top-k",
+                    )
+                    penalty = gr.Slider(
+                        minimum=1.0,
+                        maximum=2.0,
+                        step=0.1,
+                        value=1.2,
+                        label="Repetition Penalty",
+                    )
+
+                examples = gr.Examples(
+                    examples=create_examples(),
+                    inputs=[message],
+                    outputs=[chat_history, chat_display],
+                    fn=process_example,
+                    cache_examples=False,
+                )
+
+        # Set up event handlers
+        submit_click = submit.click(
+            chat_response,
+            inputs=[
+                message,
+                chat_history,
+                chat_display,
+                system_prompt,
+                temperature,
+                max_tokens,
+                top_p,
+                top_k,
+                penalty,
             ],
-            examples=create_examples(),
-            cache_examples=False,
+            outputs=[chat_history, chat_display],
+            show_progress=True,
         )
+
+        message.submit(
+            chat_response,
+            inputs=[
+                message,
+                chat_history,
+                chat_display,
+                system_prompt,
+                temperature,
+                max_tokens,
+                top_p,
+                top_k,
+                penalty,
+            ],
+            outputs=[chat_history, chat_display],
+            show_progress=True,
+        )
+
+        clear.click(
+            lambda: ([], ""),
+            outputs=[chat_history, chat_display],
+            show_progress=True,
+        )
+
+        submit_click.then(lambda: "", outputs=message)
+        message.submit(lambda: "", outputs=message)
 
     return demo
 
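Review note: the new gr.Examples block still references create_examples(), but this commit deletes that function, so building the interface would presumably raise a NameError unless the helper is restored or the call is replaced with an inline list. A minimal compatible helper, reconstructed from the code removed above, would be:

# Reconstructed from the create_examples() deleted in this commit;
# gr.Examples expects a list of [input] rows matching inputs=[message].
def create_examples():
    """Create example queries that demonstrate the system's capabilities"""
    return [
        ["Explain how neural networks learn through backpropagation."],
        ["What are the key differences between classical and quantum computing?"],
        ["Analyze the environmental impact of renewable energy sources."],
        ["How does the human memory system work?"],
        ["Explain the concept of ethical AI and its importance."]
    ]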