Spaces:

Schrieffer
/

SARM-Demo

Running on Zero

App Files Files Community

SARM-Demo / app.py

Schrieffer2sy

update

ca4ddc7 17 days ago

raw

history blame contribute delete

4.79 kB

	import gradio as gr
	import spaces
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	# --- 1. Load Model and Tokenizer ---

	# Hub identifier of the checkpoint used for both the model and the tokenizer.
	MODEL_ID = "schrieffer/Llama-SARM-4B"

	print(f"Loading model: {MODEL_ID} with device_map='auto'...")

	# Device placement is delegated to the loader through device_map="auto",
	# so there is no manual CUDA check anywhere in this file.
	model = AutoModelForSequenceClassification.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,      # SARM has a custom architecture, so its own code must be loaded
	    device_map="auto",           # let the loader decide where the weights go
	    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
	)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

	# Read the device back from the loaded model instead of choosing one up front.
	DEVICE = model.device
	print(f"Model loaded successfully on device: {DEVICE}")

	# --- 2. Define the Inference Function ---
	@spaces.GPU
	def get_reward_score(prompt: str, response: str) -> float:
	    """Score a (prompt, response) pair with the SARM reward model.

	    The pair is formatted as a two-turn chat (user, then assistant) with the
	    tokenizer's chat template, run through the model without gradient
	    tracking, and the model's single output logit is returned as the score.

	    Args:
	        prompt: The user's question or instruction.
	        response: The assistant reply to be evaluated.

	    Returns:
	        The reward score rounded to 4 decimal places. ``0.0`` is returned
	        when either input is empty, and also when scoring fails for any
	        reason; in the failure case the error and its traceback are printed
	        rather than raised, so the UI keeps working.
	    """
	    import traceback

	    # Nothing to score if either side of the conversation is missing.
	    if not prompt or not response:
	        return 0.0

	    try:
	        # Use the same chat template as used during model training.
	        messages = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
	        # Move the token ids to the device the model is on *now*, instead of
	        # relying on a device value captured once at import time.
	        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

	        with torch.no_grad():
	            # .item() requires the logits to hold exactly one element.
	            score = model(input_ids).logits.item()

	        return round(score, 4)
	    except Exception as e:
	        # Deliberate best-effort: the demo reports 0.0 instead of crashing,
	        # but the full traceback is logged so failures can be diagnosed.
	        print(f"Error: {e}")
	        traceback.print_exc()
	        return 0.0

	# --- 3. Create and Launch the Gradio Interface ---

	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    # Introductory text. The literal is a *raw* string because the author
	    # line contains the Markdown escapes `\,` and `\*`; in a normal string
	    # those are invalid Python escape sequences (SyntaxWarning on 3.12+).
	    # The text passed to gr.Markdown is unchanged.
	    gr.Markdown(
	        r"""
	# SARM: Interpretable Reward Model Demo

	This is an interactive demo for the SARM-4B model (Sparse Autoencoder-enhanced Reward Model).

	SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.

	How to use this Demo:
	1. Enter a Prompt (e.g., a question) in the left textbox below.
	2. Enter a corresponding Response in the right textbox.
	3. Click the "Calculate Reward Score" button.

	The model will output a scalar score that evaluates the quality of the response. A higher score indicates that the SARM model considers the response to be of better quality.

	---

	SARM Architecture
	![framework](assets/framework-v4.png)

	+ Authors (* indicates equal contribution)

	Shuyi Zhang\, Wei Shi\, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
	+ Paper: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)

	+ Model: [schrieffer/Llama-SARM-4B](https://huggingface.co/schrieffer/Llama-SARM-4B)

	+ Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)

	+ Code Repository: [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
	"""
	    )

	    # Two text inputs placed in one row: the prompt and the response to score.
	    with gr.Row():
	        prompt_input = gr.Textbox(
	            lines=3,
	            label="Prompt / Question",
	            placeholder="e.g., Can you explain the theory of relativity in simple terms?",
	        )
	        response_input = gr.Textbox(
	            lines=5,
	            label="Response to be Evaluated",
	            placeholder="e.g., Of course! Albert Einstein's theory of relativity...",
	        )

	    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
	    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

	    # Clicking the button scores the current (prompt, response) pair.
	    calculate_btn.click(
	        fn=get_reward_score,
	        inputs=[prompt_input, response_input],
	        outputs=score_output,
	    )

	    # Example pairs: each prompt appears twice with two different responses.
	    # cache_examples=True asks Gradio to cache the outputs of
	    # get_reward_score for these rows.
	    gr.Examples(
	        examples=[
	            ["What is the capital of France?", "The capital of France is Paris."],
	            ["What is the capital of France?", "Berlin is a large city in Germany."],
	            ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
	            ["Write a short poem about the moon.", "The moon is a rock."],
	        ],
	        inputs=[prompt_input, response_input],
	        outputs=score_output,
	        fn=get_reward_score,
	        cache_examples=True,
	    )

	# Launch the application.
	demo.launch()