import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# --- Model and Tokenizer Loading ---
# It's recommended to load the model and tokenizer once globally
# so they are not reloaded on every prediction.
try:
    MODEL_NAME = "Vinnnf/Thinkless-1.5B-Warmup"
    print(f"Loading model: {MODEL_NAME}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype="auto",  # Use "auto", or torch.float16 if a GPU is available and supports it
        device_map="auto"    # Automatically maps to GPU if available, otherwise CPU
    )
    print("Model loaded successfully.")
    print(f"Loading tokenizer for: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Fallback or error handling if model loading fails.
    # For a Gradio app, you might want to display this error in the UI.
    # For now, we let it raise if essential components fail to load.
    raise
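
# Illustrative only, not part of the original script: if you need to force a
# plain CPU float32 load (for example to rule out dtype issues on a CPU-only
# Space), something like the following should work:
#
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
# model.to("cpu")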
# --- Prediction Function ---
def generate_response(instruction_text, prompt_question, think_mode_active, max_tokens):
"""
Generates a response from the language model based on the input.
"""
if not instruction_text or not prompt_question:
return "Error: Instruction and Prompt Question cannot be empty.", "", "N/A", "N/A"
try:
# 1. Combine instruction and prompt question
full_prompt_content = f"{instruction_text}\n{prompt_question}"
# 2. Format for chat model
messages = [
{"role": "user", "content": full_prompt_content}
]
# 3. Apply chat template
# tokenize=False because we add special tags <think>/<short> afterwards
text_from_template = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True # Ensures the model knows to generate a response
)
# 4. Add <think> or <short> tag
if think_mode_active:
final_input_text = f"{text_from_template}<think>"
else:
final_input_text = f"{text_from_template}<short>"
# 5. Tokenize the final input
# Ensure the tokenizer and model are on the same device
model_inputs = tokenizer([final_input_text], return_tensors="pt").to(model.device)
# 6. Generate response
# Ensure max_new_tokens is an integer
try:
max_new_tokens_int = int(max_tokens)
except ValueError:
return "Error: Max new tokens must be an integer.", final_input_text, "N/A", "N/A"
if max_new_tokens_int <= 0:
return "Error: Max new tokens must be a positive integer.", final_input_text, "N/A", "N/A"
print(f"Generating with max_new_tokens: {max_new_tokens_int}")
generated_ids = model.generate(
**model_inputs,
max_new_tokens=max_new_tokens_int,
# Common generation parameters you might want to add:
# temperature=0.7,
# top_k=50,
# top_p=0.95,
# num_return_sequences=1,
# no_repeat_ngram_size=2, # to prevent some repetition
# early_stopping=True
)
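        # Illustrative only, not part of the original script: the sampling-related
        # parameters listed above only take effect when sampling is enabled
        # (do_sample=True); with plain greedy decoding they are ignored.
        # A hedged sketch of what that call could look like:
        #
        # generated_ids = model.generate(
        #     **model_inputs,
        #     max_new_tokens=max_new_tokens_int,
        #     do_sample=True,
        #     temperature=0.7,
        #     top_p=0.95,
        # )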
        # 7. Keep only the newly generated tokens
        # generated_ids includes the input_ids, so we slice them off.
        input_ids_length = model_inputs.input_ids.shape[1]
        output_only_ids = generated_ids[:, input_ids_length:]
        num_generated_tokens = len(output_only_ids[0])

        # 8. Decode the generated tokens
        response_text = tokenizer.batch_decode(output_only_ids, skip_special_tokens=True)[0]

        # For debugging: full generated text including the prompt
        # full_response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        # print(f"Full text (prompt + response): {full_response_text}")
        return final_input_text, response_text, num_generated_tokens, full_prompt_content

    except Exception as e:
        print(f"Error during generation: {e}")
        # Return the error message to be displayed in the Gradio UI
        return f"An error occurred: {str(e)}", "", "N/A", "N/A"
# --- Gradio Interface Definition ---
# Default values from the original script
DEFAULT_INSTRUCTION = "Please reason step by step, and put your final answer within \\boxed{}."
DEFAULT_PROMPT_QUESTION = "The arithmetic mean of 7, 2, $x$ and 10 is 9. What is the value of $x$?"
DEFAULT_THINK_MODE = True
DEFAULT_MAX_TOKENS = 512 # Default value for max_new_tokens
# Define input components
instruction_input = gr.Textbox(
    lines=3,
    label="Instruction",
    value=DEFAULT_INSTRUCTION,
    info="The overall instruction for the model (e.g., reasoning style)."
)
prompt_question_input = gr.Textbox(
    lines=3,
    label="Prompt Question",
    value=DEFAULT_PROMPT_QUESTION,
    info="The specific question or task for the model."
)
think_mode_checkbox = gr.Checkbox(
    label="Enable Think Mode (<think> tag)",
    value=DEFAULT_THINK_MODE,
    info="If checked, adds '<think>' for detailed reasoning; if unchecked, adds '<short>' for concise answers."
)
max_tokens_slider = gr.Slider(
    minimum=32,
    maximum=4096,  # Matches the original script's max_new_tokens
    value=DEFAULT_MAX_TOKENS,
    step=32,
    label="Max New Tokens",
    info="Maximum number of tokens to generate for the response."
)

# Define output components
full_prompt_output = gr.Textbox(
    label="Actual Input to Model (with template and tag)",
    lines=5,
    interactive=False,  # Read-only
    show_copy_button=True
)
response_output = gr.Textbox(
    label="Model Response",
    lines=10,
    interactive=False,  # Read-only
    show_copy_button=True
)
num_tokens_output = gr.Textbox(
    label="Number of Generated Tokens",
    interactive=False  # Read-only
)
original_prompt_output = gr.Textbox(
    label="Original User Prompt (Instruction + Question)",
    lines=3,
    interactive=False,  # Read-only
    show_copy_button=True
)
# Create the Gradio interface
# We pass a list of inputs and outputs to gr.Interface
# The order in the list corresponds to the arguments of the `generate_response` function
app_interface = gr.Interface(
    fn=generate_response,
    inputs=[
        instruction_input,
        prompt_question_input,
        think_mode_checkbox,
        max_tokens_slider
    ],
    outputs=[
        full_prompt_output,
        response_output,
        num_tokens_output,
        original_prompt_output  # Shows the combined instruction + question
    ],
    title="Thinkless Model Interface",
    description=(
        "Interact with the Vinnnf/Thinkless-1.5B-Warmup model. "
        "Provide an instruction and a prompt, choose a thinking mode, and set max tokens. "
        "The model will generate a response based on your input. "
        "Note: Model loading might take a few moments when the app starts."
    ),
    allow_flagging='never',  # or 'auto' if you want to enable flagging
    examples=[
        [
            "Please reason step by step, and put your final answer within \\boxed{}.",
            "Sarah has 5 apples. She gives 2 apples to John and then buys 3 more apples. How many apples does Sarah have now?",
            True,
            256
        ],
        [
            "Provide a concise answer.",
            "What is the capital of France?",
            False,
            64
        ],
        [
            "Explain the concept of photosynthesis in simple terms.",
            "What is photosynthesis?",
            True,
            512
        ]
    ]
)
# --- Launch the App ---
if __name__ == "__main__":
    print("Starting Gradio app...")
    # For Hugging Face Spaces, Gradio automatically handles the server.
    # When running locally, this will start a local server.
    app_interface.launch()
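    # Illustrative only, not part of the original script: when running locally you
    # can also request a temporary public link, e.g.:
    # app_interface.launch(share=True)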
# To share on Hugging Face Spaces, you would typically save this file as app.py
# and ensure your requirements.txt includes:
# gradio
# transformers
# torch
# sentencepiece (often a dependency for tokenizers)
# accelerate (required for device_map="auto"; also used for multi-GPU setups or CPU offload)