# Source: Hugging Face Space "Neurocognitive/agentic-RAG" (status: Sleeping).
# File size: 8,263 bytes.

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# --- Model and Tokenizer Loading ---
# It's recommended to load the model and tokenizer once globally
# so they are not reloaded on every prediction.
# Hub identifier of the checkpoint served by this app. It is a plain string
# constant, so it is bound ahead of the guarded section below.
MODEL_NAME = "Vinnnf/Thinkless-1.5B-Warmup"

# Load the model first, then the tokenizer, reporting progress on stdout.
# Any failure is logged and re-raised: the app cannot work without both.
try:
    print(f"Loading model: {MODEL_NAME}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        # Let the library choose the device placement (GPU when one is
        # available, otherwise CPU) and the tensor dtype.
        device_map="auto",
        torch_dtype="auto",
    )
    print("Model loaded successfully.")

    print(f"Loading tokenizer for: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Tokenizer loaded successfully.")
except Exception as load_error:
    # Surface the cause in the logs, then propagate so startup fails loudly.
    print(f"Error loading model or tokenizer: {load_error}")
    raise

# --- Prediction Function ---
def _is_blank(value):
    """Return True when *value* is falsy or a whitespace-only string."""
    return not value or (isinstance(value, str) and not value.strip())


def _error_outputs(message):
    """Build the 4-tuple returned to the UI when a request cannot be served."""
    return message, "", "N/A", "N/A"


def generate_response(instruction_text, prompt_question, think_mode_active, max_tokens):
    """
    Generate a model response for an instruction plus a question.

    The instruction and question are joined with a newline, wrapped in the
    tokenizer's chat template as a single user turn, and suffixed with a
    ``<think>`` or ``<short>`` tag before being passed to ``model.generate``.

    Args:
        instruction_text: Overall instruction for the model; must not be blank.
        prompt_question: The question or task; must not be blank.
        think_mode_active: When truthy the ``<think>`` tag is appended,
            otherwise ``<short>``.
        max_tokens: Upper bound on newly generated tokens. Anything accepted
            by ``int()`` is allowed, but the result must be positive.

    Returns:
        A 4-tuple matching the order of the interface outputs:
        ``(final_input_text, response_text, num_generated_tokens,
        full_prompt_content)``. When the request is invalid or generation
        fails, the tuple is ``(error_message, "", "N/A", "N/A")`` instead.
        Exceptions raised while templating, tokenizing, generating or
        decoding are caught and reported this way rather than propagated.
    """
    if _is_blank(instruction_text) or _is_blank(prompt_question):
        return _error_outputs("Error: Instruction and Prompt Question cannot be empty.")

    # Validate the token budget before any templating/tokenization so that bad
    # input is rejected cheaply and every error path returns the same shape.
    try:
        max_new_tokens_int = int(max_tokens)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None, OverflowError covers float infinity.
        return _error_outputs("Error: Max new tokens must be an integer.")

    if max_new_tokens_int <= 0:
        return _error_outputs("Error: Max new tokens must be a positive integer.")

    try:
        # 1. Combine instruction and question into one user message.
        full_prompt_content = f"{instruction_text}\n{prompt_question}"
        messages = [
            {"role": "user", "content": full_prompt_content}
        ]

        # 2. Render the chat template as text (tokenize=False) because the
        #    mode tag still has to be appended to the rendered string.
        text_from_template = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,  # Ensures the model knows to generate a response
        )

        # 3. Append the tag that selects detailed or concise answering.
        mode_tag = "<think>" if think_mode_active else "<short>"
        final_input_text = f"{text_from_template}{mode_tag}"

        # 4. Tokenize and move the tensors to the model's device.
        # NOTE(review): the templated text may already contain the tokenizer's
        # special tokens; check whether add_special_tokens=False is wanted
        # here before changing it - TODO confirm against the model card.
        model_inputs = tokenizer([final_input_text], return_tensors="pt").to(model.device)

        # 5. Generate.
        print(f"Generating with max_new_tokens: {max_new_tokens_int}")
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens_int,
        )

        # 6. The generated ids include the input ids, so slice the prompt off
        #    and keep only the newly produced tokens.
        input_ids_length = model_inputs.input_ids.shape[1]
        output_only_ids = generated_ids[:, input_ids_length:]
        num_generated_tokens = len(output_only_ids[0])

        # 7. Decode the single sequence in the batch.
        response_text = tokenizer.batch_decode(output_only_ids, skip_special_tokens=True)[0]

        return final_input_text, response_text, num_generated_tokens, full_prompt_content

    except Exception as e:
        print(f"Error during generation: {e}")
        # Return the error message to be displayed in the Gradio UI
        return _error_outputs(f"An error occurred: {str(e)}")

# --- Gradio Interface Definition ---
# Default values from the original script
# Initial text of the "Instruction" box.
DEFAULT_INSTRUCTION: str = "Please reason step by step, and put your final answer within \\boxed{}."
# Initial text of the "Prompt Question" box.
DEFAULT_PROMPT_QUESTION: str = "The arithmetic mean of 7, 2, $x$ and 10 is 9. What is the value of $x$?"
# Whether the think-mode checkbox starts out ticked.
DEFAULT_THINK_MODE: bool = True
# Initial position of the "Max New Tokens" slider.
DEFAULT_MAX_TOKENS: int = 512

# Define input components
# Free-text instruction that is prepended to the question.
instruction_input = gr.Textbox(
    label="Instruction",
    info="The overall instruction for the model (e.g., reasoning style).",
    value=DEFAULT_INSTRUCTION,
    lines=3,
)

# The question or task itself.
prompt_question_input = gr.Textbox(
    label="Prompt Question",
    info="The specific question or task for the model.",
    value=DEFAULT_PROMPT_QUESTION,
    lines=3,
)

# Toggles between the <think> and <short> tags appended to the prompt.
think_mode_checkbox = gr.Checkbox(
    label="Enable Think Mode (<think> tag)",
    info="If checked, adds '<think>' for detailed reasoning. If unchecked, adds '<short>' for concise answers.",
    value=DEFAULT_THINK_MODE,
)

# Generation budget, adjustable from 32 to 4096 in steps of 32.
max_tokens_slider = gr.Slider(
    label="Max New Tokens",
    info="Maximum number of tokens to generate for the response.",
    minimum=32,
    maximum=4096,
    step=32,
    value=DEFAULT_MAX_TOKENS,
)

# Define output components
# All output boxes are read-only (interactive=False).

# The exact text handed to the tokenizer, after templating and tagging.
full_prompt_output = gr.Textbox(
    label="Actual Input to Model (with template and tag)",
    interactive=False,
    show_copy_button=True,
    lines=5,
)

# The decoded text produced by the model.
response_output = gr.Textbox(
    label="Model Response",
    interactive=False,
    show_copy_button=True,
    lines=10,
)

# Count of newly generated tokens.
num_tokens_output = gr.Textbox(
    label="Number of Generated Tokens",
    interactive=False,
)

# The instruction and question as combined before templating.
original_prompt_output = gr.Textbox(
    label="Original User Prompt (Instruction + Question)",
    interactive=False,
    show_copy_button=True,
    lines=3,
)


# Create the Gradio interface
# We pass a list of inputs and outputs to gr.Interface
# The order in the list corresponds to the arguments of the `generate_response` function
app_interface = gr.Interface(
    title="Thinkless Model Interface",
    description=(
        "Interact with the Vinnnf/Thinkless-1.5B-Warmup model. "
        "Provide an instruction and a prompt, choose a thinking mode, and set max tokens. "
        "The model will generate a response based on your input. "
        "Note: Model loading might take a few moments when the app starts."
    ),
    fn=generate_response,
    # Positional order mirrors the parameters of `generate_response`.
    inputs=[
        instruction_input,
        prompt_question_input,
        think_mode_checkbox,
        max_tokens_slider,
    ],
    # Positional order mirrors the tuple returned by `generate_response`.
    outputs=[
        full_prompt_output,
        response_output,
        num_tokens_output,
        original_prompt_output,
    ],
    # Each example row supplies one value per input, in the same order.
    examples=[
        [
            "Please reason step by step, and put your final answer within \\boxed{}.",
            "Sarah has 5 apples. She gives 2 apples to John and then buys 3 more apples. How many apples does Sarah have now?",
            True,
            256,
        ],
        [
            "Provide a concise answer.",
            "What is the capital of France?",
            False,
            64,
        ],
        [
            "Explain the concept of photosynthesis in simple terms.",
            "What is photosynthesis?",
            True,
            512,
        ],
    ],
    # Flagging is switched off; 'auto' would enable it.
    allow_flagging="never",
)

# --- Launch the App ---
if __name__ == "__main__":
    # Run as a script: announce startup, then serve the interface. Locally
    # this starts a server; on Hugging Face Spaces Gradio handles serving.
    print("Starting Gradio app...")
    app_interface.launch()
    # Deployment notes for Hugging Face Spaces: save this file as app.py and
    # list the following packages in requirements.txt:
    #   gradio
    #   transformers
    #   torch
    #   sentencepiece  (often a dependency for tokenizers)
    #   accelerate     (if using device_map="auto" effectively with multiple GPUs/CPU offload)