# Sphinx Reasoner

import os
import subprocess
import sys

# Install flash-attn at startup, before the model-loading imports below.
# FLASH_ATTENTION_SKIP_CUDA_BUILD is presumably read by the flash-attn build
# to skip compiling its CUDA extension (name-based assumption; confirm
# against the flash-attn setup script).
#
# The variable is merged into a copy of the current environment: passing a
# bare dict as `env` replaces the child's whole environment (PATH, HOME,
# proxy settings, ...) instead of extending it.
#
# The command is an argument list executed without a shell, and pip is run
# through the current interpreter so the package is installed into the
# environment this script is running in. The return code is deliberately not
# checked: the install stays best-effort, as before.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

# Hugging Face access token read from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Model identifier handed to the from_pretrained loaders.
MODEL = "Daemontatox/AetherDrake"

# Page heading; rendered at the top of the app through gr.HTML.
TITLE = "<h1><center>Sphinx Reasoner</center></h1>"

# HTML passed to the Chatbot component as its placeholder.
PLACEHOLDER = """
<center>
<p>Ask me Anything !!</p>
</center>
"""

# Custom stylesheet passed to gr.Blocks: styles the duplicate button and
# keeps whitespace/line breaks and code blocks readable inside messages.
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
    white-space: pre-wrap !important;
}
.message-wrap p {
    margin-bottom: 1em;
    white-space: pre-wrap !important;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""

# NOTE(review): `device` is not referenced anywhere else in this file; model
# placement is decided by device_map="auto" below.
device = "cuda"  # for GPU usage or "cpu" for CPU usage

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# HF_TOKEN is forwarded to both loaders so private or gated repositories can
# be downloaded; it was previously read from the environment but never used.
# When it is None the library uses its default credential lookup.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN)

# NOTE(review): torch_dtype is float16 while bnb_4bit_compute_dtype above is
# bfloat16 -- confirm that this mix is intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
    token=HF_TOKEN,
)

# Ensure `pad_token_id` is set: fall back to the EOS token when the tokenizer
# defines no padding token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def format_text(text):
    """Strip every line of *text* and separate the lines with blank lines.

    The text is split on newline characters, each piece loses its leading
    and trailing whitespace, and the pieces are joined back with two
    newlines, so every original line break becomes a paragraph break.
    """
    stripped_lines = [segment.strip() for segment in text.split('\n')]
    return '\n\n'.join(stripped_lines)

@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 1.0,
    max_new_tokens: int = 8192,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    """Stream the model's reply to *message* as progressively longer strings.

    Args:
        message: Latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        system_prompt: Content of the leading system message.
        temperature: Sampling temperature; a value <= 0 selects greedy
            decoding, in which case no sampling parameters are forwarded.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (forwarded only when sampling).
        top_k: Top-k cutoff (forwarded only when sampling).
        penalty: Repetition penalty; a value <= 0 is not forwarded, so no
            penalty is requested from this function.

    Yields:
        The reply accumulated so far. When the newest chunk contains a
        newline the accumulated text is passed through ``format_text``;
        otherwise it is yielded unchanged.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [{"role": "system", "content": system_prompt}]
    for prompt, answer in history:
        conversation.append({"role": "user", "content": prompt})
        conversation.append({"role": "assistant", "content": answer})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=60.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    do_sample = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids,
        # A single unpadded prompt: every position is a real token. Passing
        # the mask explicitly matters because pad_token_id may equal
        # eos_token_id, so it cannot be inferred from the ids.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )
    if do_sample:
        # Sampling knobs only apply when sampling. UI sliders may deliver
        # whole numbers as ints, so the types are normalised here.
        generate_kwargs.update(
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
        )
    if penalty > 0:
        # The penalty must be a strictly positive float; other values are
        # skipped rather than letting generation fail.
        generate_kwargs["repetition_penalty"] = float(penalty)

    def _generate():
        # Grad mode is thread-local, so no_grad has to be entered inside the
        # worker thread rather than around Thread.start().
        with torch.no_grad():
            model.generate(**generate_kwargs)

    thread = Thread(target=_generate)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # NOTE(review): formatting is applied only on chunks that contain a
        # newline, so consecutive yields alternate between formatted and raw
        # text. This mirrors the previous behaviour -- confirm which form
        # the UI should end on.
        if '\n' in new_text:
            yield format_text(buffer)
        else:
            yield buffer

    # The streamer is exhausted once generation has finished; wait for the
    # worker so it does not outlive this call.
    thread.join()

            
# Chat display component; handed to gr.ChatInterface further down.
chatbot = gr.Chatbot(
    placeholder=PLACEHOLDER,
    height=600,
    show_copy_button=True,
    bubble_full_width=False,
)

# Initial value of the "System Prompt" textbox. It instructs the model to
# structure its answer with <Thinking>, <Critique>, <Revising> and <Final>
# tags (and to define additional tags when useful).
DEFAULT_SYSTEM_PROMPT = """You are an AI expert at providing high-quality answers. Your process involves these steps:
1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
Example:
<Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>
2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:
Accuracy: Is it factually correct and verifiable?
Clarity: Is it easy to understand and free of ambiguity?
Completeness: Does it fully address the user's request?
Improvement: What specific aspects could be better?
Example:
<Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>
3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
Example:
<Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>
4. Final Response: Present your revised answer clearly within <Final> tags.
Example:
<Final> This is the improved response. </Final>
5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
Example:
<Definition> This tag defines a new term introduced in the response. </Definition>
Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization."""

# Page layout: heading, duplicate button, and the chat interface wired to
# stream_chat.
with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )

    # The additional inputs are listed in the order of stream_chat's
    # parameters after (message, history): system_prompt, temperature,
    # max_new_tokens, top_p, top_k, penalty. Keep the two in sync.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(
            label="⚙️ Parameters",
            open=False,
            render=False,
        ),
        additional_inputs=[
            gr.Textbox(
                value=DEFAULT_SYSTEM_PROMPT,
                label="System Prompt",
                lines=5,
                render=False,
            ),
            # A temperature of 0 makes stream_chat disable sampling.
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=32000,
                step=1,
                value=8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            # The repetition penalty must be strictly positive, so the
            # slider starts at 0.1 instead of 0.0 (which generation rejects).
            gr.Slider(
                minimum=0.1,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=[
            ["What is meant by a Singularity?"],
            ["Explain the theory of Relativity"],
            ["Explain your thought process in details"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers?"],
        ],
        cache_examples=False,
    )

# Launch the Blocks app only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    demo.launch()