Spaces:

swaleha19
/

agent_tuning_framework

Running

App Files Files Community

agent_tuning_framework / negative_samples.py

swaleha19

Upload 13 files

6c482f9 verified 12 days ago

raw

history blame contribute delete

16.1 kB

	"""
	Negative Sample Generation Module for Agent Tuning Optimization Framework

	This module provides functionality for generating negative samples to enhance
	agent tuning by exposing the model to challenging failure cases.
	"""

	import random
	import numpy as np
	from typing import List, Dict, Any, Union, Optional, Tuple
	from tqdm import tqdm

	from data.trajectory_data import Trajectory, TrajectoryDataset

	class NegativeSampleGenerator:
	    """Common interface for strategies that derive negative trajectories.

	    Concrete strategies override ``generate``; ``batch_generate`` is
	    inherited and maps ``generate`` over a list of trajectories.
	    """

	    def __init__(self, name: str):
	        """
	        Remember the strategy name.

	        Args:
	            name: Label of the strategy; it is shown in the progress-bar
	                description used by ``batch_generate``
	        """
	        self.name = name

	    def generate(
	        self,
	        trajectory: Trajectory,
	        **kwargs
	    ) -> Trajectory:
	        """
	        Turn one positive trajectory into a negative one.

	        Args:
	            trajectory: Positive trajectory to transform
	            **kwargs: Strategy-specific generation parameters

	        Returns:
	            Negative trajectory

	        Raises:
	            NotImplementedError: Always, in this base class
	        """
	        raise NotImplementedError("Subclasses must implement this method")

	    def batch_generate(
	        self,
	        trajectories: List[Trajectory],
	        **kwargs
	    ) -> List[Trajectory]:
	        """
	        Apply ``generate`` to every trajectory, in order.

	        Args:
	            trajectories: List of positive trajectories
	            **kwargs: Forwarded unchanged to each ``generate`` call

	        Returns:
	            List of negative trajectories, one per input trajectory
	        """
	        # Wrap the input in a progress bar labelled with the strategy name.
	        progress = tqdm(
	            trajectories,
	            desc=f"Generating negative samples with {self.name}"
	        )
	        return [self.generate(positive, **kwargs) for positive in progress]


	class ResponseDegradationGenerator(NegativeSampleGenerator):
	    """Generate negative samples by degrading agent responses.

	    The ``degradation_level`` passed to ``generate`` selects one of three
	    tiers: above 0.7 the reply is replaced by a canned irrelevant one,
	    above 0.4 it is truncated and sprinkled with errors, otherwise only
	    minor issues are introduced.
	    """

	    # Canned replies used for the highest degradation tier.
	    _IRRELEVANT_RESPONSES = (
	        "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
	        "I apologize, but I cannot assist with that request at this time.",
	        "That's an interesting question, but I think we should focus on something else instead.",
	        "Let me check my database... I don't seem to have any information about that.",
	        "I think you might be confused about what you're asking for. Let me suggest something completely different.",
	        "I'm not sure I understand the context of your request. Could you provide more details?",
	        "I'm having trouble processing your request. Could we try a different approach?",
	        "That's not something I can help with. Let me tell you about something unrelated instead.",
	    )

	    # Appended when the randomly chosen minor edits all turned out to be no-ops.
	    _FALLBACK_ISSUE = " However, I'm not entirely sure about this."

	    def __init__(self):
	        """Initialize the generator under the name "response_degradation"."""
	        super().__init__("response_degradation")

	    def generate(
	        self,
	        trajectory: Trajectory,
	        degradation_level: float = 0.5,
	        **kwargs
	    ) -> Trajectory:
	        """
	        Generate a negative sample by degrading agent responses.

	        User messages are carried over unchanged; every agent message is
	        rewritten according to the tier selected by ``degradation_level``.

	        Args:
	            trajectory: Positive trajectory to transform
	            degradation_level: Level of degradation (0.0 to 1.0);
	                > 0.7 irrelevant reply, > 0.4 truncation plus errors,
	                otherwise minor issues
	            **kwargs: Additional generation parameters (unused here)

	        Returns:
	            New trajectory with degraded responses and a metadata copy
	            marked ``is_positive=False``
	        """
	        new_interactions = []

	        for interaction in trajectory.interactions:
	            user_msg = interaction['user']
	            agent_msg = interaction['agent']

	            if degradation_level > 0.7:
	                # High degradation: completely irrelevant response
	                agent_msg = self._generate_irrelevant_response()
	            elif degradation_level > 0.4:
	                # Medium degradation: truncate and add errors
	                agent_msg = self._truncate_and_add_errors(agent_msg)
	            else:
	                # Low degradation: introduce minor issues
	                agent_msg = self._introduce_minor_issues(agent_msg)

	            new_interactions.append({
	                'user': user_msg,
	                'agent': agent_msg
	            })

	        # Work on a (shallow) copy so the positive trajectory's metadata
	        # dict is not modified.
	        metadata = trajectory.metadata.copy()
	        metadata['is_positive'] = False
	        metadata['degradation_level'] = degradation_level
	        metadata['original_quality_score'] = trajectory.get_quality_score()
	        # Cleared so the positive score is not carried over; presumably
	        # recalculated downstream.
	        metadata['quality_score'] = None

	        return Trajectory(
	            task_description=trajectory.task_description,
	            interactions=new_interactions,
	            metadata=metadata
	        )

	    def _generate_irrelevant_response(self) -> str:
	        """Return one of the canned irrelevant responses, chosen at random."""
	        return random.choice(self._IRRELEVANT_RESPONSES)

	    def _truncate_and_add_errors(self, text: str) -> str:
	        """
	        Truncate the text to 30-70% of its words and add 1-3 errors.

	        Args:
	            text: Original agent response

	        Returns:
	            Truncated text with random grammatical errors applied. A
	            non-empty input always keeps at least one word.
	        """
	        words = text.split()
	        truncate_point = int(len(words) * random.uniform(0.3, 0.7))
	        if words:
	            # int() floors, so very short replies could be cut down to an
	            # empty string; keep at least the first word instead.
	            truncate_point = max(1, truncate_point)
	        truncated = ' '.join(words[:truncate_point])

	        # Grammatical errors; each may be a no-op if its pattern is absent.
	        errors = (
	            lambda t: t.replace(".", ""),  # Remove periods
	            lambda t: t.replace("I ", "i "),  # Lowercase I
	            lambda t: t.replace(" the ", " teh "),  # Typo
	            lambda t: t.replace(" is ", " are "),  # Grammar error
	            lambda t: t.replace(" are ", " is "),  # Grammar error
	        )

	        # Apply 1-3 random errors (the same one may be drawn repeatedly)
	        for _ in range(random.randint(1, 3)):
	            truncated = random.choice(errors)(truncated)

	        return truncated

	    def _introduce_minor_issues(self, text: str) -> str:
	        """
	        Introduce 1-2 minor issues to the text.

	        Args:
	            text: Original agent response

	        Returns:
	            Text that is guaranteed to differ from the input: if every
	            randomly chosen edit left it untouched, an uncertainty
	            sentence is appended.
	        """
	        issues = (
	            lambda t: t.replace("I'll", "I will"),  # Expand contractions
	            lambda t: t.replace("I'd", "I would"),
	            lambda t: t.replace("can't", "cannot"),
	            lambda t: t + " However, I'm not entirely sure about this.",  # Add uncertainty
	            lambda t: t + " Please note that my information might be outdated.",
	            lambda t: t.replace(".", "..."),  # Replace periods with ellipses
	            lambda t: t.replace("!", "."),  # Reduce enthusiasm
	            lambda t: t.replace(".", "?"),  # Add questioning tone
	        )

	        original = text

	        # Apply 1-2 random issues
	        for _ in range(random.randint(1, 2)):
	            text = random.choice(issues)(text)

	        if text == original:
	            # All chosen edits were no-ops (e.g. no contraction to expand).
	            # Without this the "negative" sample would be identical to
	            # the positive one it was derived from.
	            text = original + self._FALLBACK_ISSUE

	        return text


	class TaskMisalignmentGenerator(NegativeSampleGenerator):
	    """Generate negative samples by creating responses misaligned with the task.

	    Every agent turn is replaced by a refusal, a tangent, or a
	    misinterpretation of the task; user messages are kept as they are.
	    """

	    # Concrete misalignment types; 'random' picks one of these.
	    _MISALIGNMENT_TYPES = ('refusal', 'tangent', 'misinterpretation')

	    _REFUSAL_TEMPLATES = (
	        "I'm sorry, but I cannot assist with that request due to ethical concerns.",
	        "I apologize, but I'm not able to help with that specific task.",
	        "That request goes against my programming guidelines. I cannot proceed.",
	        "I'm unable to complete that task as it may violate my operational parameters.",
	        "I must decline to assist with that particular request.",
	        "I cannot help with that specific task due to potential policy violations.",
	        "That's not something I'm designed to help with. I apologize for the inconvenience.",
	        "I'm programmed to avoid assisting with that type of request.",
	    )

	    _TANGENT_TOPICS = (
	        "Did you know that artificial intelligence has been a concept since the 1950s?",
	        "I've been thinking about the philosophical implications of consciousness in AI systems.",
	        "The weather has been quite interesting lately, with unusual patterns emerging globally.",
	        "I recently processed some fascinating data about renewable energy technologies.",
	        "The history of computing is quite fascinating, starting with early mechanical calculators.",
	        "Language models like me are trained on vast amounts of text data.",
	        "The field of natural language processing has evolved significantly in recent years.",
	        "I find the concept of time quite fascinating from a computational perspective.",
	    )

	    # Words never used as the misinterpreted keyword.
	    _STOPWORDS = frozenset({
	        'with', 'from', 'that', 'this', 'have',
	        'what', 'when', 'where', 'which', 'about',
	    })

	    # Stripped from both ends of each task-description word.
	    _KEYWORD_PUNCTUATION = '.,;:!?"\'()[]{}<>'

	    def __init__(self):
	        """Initialize the generator under the name "task_misalignment"."""
	        super().__init__("task_misalignment")

	    def generate(
	        self,
	        trajectory: Trajectory,
	        misalignment_type: str = 'random',
	        **kwargs
	    ) -> Trajectory:
	        """
	        Generate a negative sample with responses misaligned with the task.

	        Args:
	            trajectory: Positive trajectory to transform
	            misalignment_type: Type of misalignment ('random', 'refusal',
	                'tangent', 'misinterpretation'); 'random' picks one of the
	                three concrete types for the whole trajectory
	            **kwargs: Additional generation parameters (unused here)

	        Returns:
	            New trajectory with misaligned responses and a metadata copy
	            marked ``is_positive=False``

	        Raises:
	            ValueError: If ``misalignment_type`` is not supported
	        """
	        if misalignment_type == 'random':
	            misalignment_type = random.choice(self._MISALIGNMENT_TYPES)
	        elif misalignment_type not in self._MISALIGNMENT_TYPES:
	            # Validate before the loop so an unsupported type is rejected
	            # even when the trajectory has no interactions, instead of
	            # being silently written into the metadata.
	            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

	        new_interactions = []

	        for i, interaction in enumerate(trajectory.interactions):
	            user_msg = interaction['user']

	            if misalignment_type == 'refusal':
	                agent_msg = self._generate_refusal(user_msg, i)
	            elif misalignment_type == 'tangent':
	                agent_msg = self._generate_tangent(user_msg, i)
	            else:
	                # Only 'misinterpretation' remains after validation above.
	                agent_msg = self._generate_misinterpretation(
	                    user_msg, trajectory.task_description, i
	                )

	            new_interactions.append({
	                'user': user_msg,
	                'agent': agent_msg
	            })

	        # Work on a (shallow) copy so the positive trajectory's metadata
	        # dict is not modified.
	        metadata = trajectory.metadata.copy()
	        metadata['is_positive'] = False
	        metadata['misalignment_type'] = misalignment_type
	        metadata['original_quality_score'] = trajectory.get_quality_score()
	        # Cleared so the positive score is not carried over; presumably
	        # recalculated downstream.
	        metadata['quality_score'] = None

	        return Trajectory(
	            task_description=trajectory.task_description,
	            interactions=new_interactions,
	            metadata=metadata
	        )

	    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
	        """
	        Generate a refusal response.

	        Args:
	            user_msg: User message for this turn (not used)
	            turn_idx: Zero-based turn index; later turns are prefixed with
	                "I've reconsidered, and "

	        Returns:
	            Refusal text
	        """
	        refusal = random.choice(self._REFUSAL_TEMPLATES)

	        if turn_idx == 0:
	            return refusal

	        # The template continues a sentence here, so drop its leading
	        # capital -- but leave the pronoun "I" and every later sentence
	        # alone (lowercasing the whole string produced "i'm", "i cannot").
	        if not refusal.startswith(("I ", "I'")):
	            refusal = refusal[0].lower() + refusal[1:]
	        return f"I've reconsidered, and {refusal}"

	    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
	        """
	        Generate a response that goes off on a tangent.

	        Args:
	            user_msg: User message for this turn (not used)
	            turn_idx: Zero-based turn index; selects the framing sentence

	        Returns:
	            Off-topic response built around a random tangent topic
	        """
	        topic = random.choice(self._TANGENT_TOPICS)

	        if turn_idx == 0:
	            return f"That's an interesting request, but before I help with that... {topic} Anyway, what were we discussing?"
	        return f"I understand you want me to continue with the task, but I just remembered something. {topic} Sorry for the distraction."

	    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
	        """
	        Generate a response that misinterprets the user's request.

	        A keyword is drawn from the task description and the reply
	        pretends the whole request is about that keyword.

	        Args:
	            user_msg: User message for this turn (not used)
	            task_description: Task text the keyword is drawn from
	            turn_idx: Zero-based turn index (not used)

	        Returns:
	            Response centred on a single, randomly chosen keyword
	        """
	        # Strip surrounding punctuation first so "flight." or "(hotel)" do
	        # not leak into the templates as "flight.s" / "(hotel)".
	        stripped = (
	            word.strip(self._KEYWORD_PUNCTUATION)
	            for word in task_description.lower().split()
	        )
	        keywords = [
	            word for word in stripped
	            if len(word) > 3 and word not in self._STOPWORDS
	        ]

	        if not keywords:
	            keywords = ['task', 'help', 'information', 'request']

	        # Select a random keyword to misinterpret
	        keyword = random.choice(keywords)

	        misinterpretation_templates = [
	            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
	            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
	            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
	            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
	            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
	            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
	            f"Regarding your question about {keyword}, I can offer the following information.",
	            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}."
	        ]

	        return random.choice(misinterpretation_templates)


	class ConstraintViolationGenerator(NegativeSampleGenerator):
	"""Generate negative samples by violating specified constraints."""

	def __init__(self):
	"""Initialize the generator, passing the name "constraint_violation" to the base class."""
	super().__init__("constraint_violation")

	def generate(
	self,
	trajectory: Trajectory,
	constraints: Optional[List[str]] = None,
	**kwargs
	) -> Trajectory:
	"""
	Generate a negative sample by violating one constraint.

	A single constraint is drawn at random from ``constraints`` and every
	agent turn is replaced by the output of ``_generate_violation`` for
	that constraint. User messages are carried over unchanged.

	Args:
	trajectory: Positive trajectory to transform
	constraints: Candidate constraints, one of which is picked at random
	(None selects the built-in default list below)
	**kwargs: Additional generation parameters (not read by this method)

	Returns:
	New trajectory with the same task description, the rewritten
	interactions, and a copy of the metadata with ``is_positive`` set to
	False, ``violated_constraint`` and ``original_quality_score`` recorded,
	and ``quality_score`` reset to None
	"""
	# Default constraints if none provided
	if constraints is None:
	constraints = [
	"Do not provide specific recommendations",
	"Avoid using technical jargon",
	"Keep responses concise",
	"Do not ask follow-up questions",
	"Avoid making assumptions about user preferences",
	"Do not mention specific brands or products",
	"Avoid discussing sensitive topics",
	"Do not provide step-by-step instructions"
	]

	# Select a single constraint to violate; it is reused for every turn.
	# NOTE(review): an explicitly passed empty list reaches random.choice
	# unchanged -- confirm callers never pass [].
	violated_constraint = random.choice(constraints)

	# Build a fresh interaction list; the input trajectory's list is only read
	new_interactions = []

	for i, interaction in enumerate(trajectory.interactions):
	user_msg = interaction['user']

	# The original agent message is not read; the reply is produced
	# from the user message, the chosen constraint and the turn index
	agent_msg = self._generate_violation(user_msg, violated_constraint, i)

	new_interactions.append({
	'user': user_msg,
	'agent': agent_msg
	})

	# Update a (shallow) copy of the metadata rather than the original dict
	metadata = trajectory.metadata.copy()
	metadata['is_positive'] = False
	metadata['violated_constraint'] = violated_constraint
	metadata['original_quality_score'] = trajectory.get_quality_score()
	metadata['quality_score'] = None # Reset here; presumably recalculated downstream -- TODO confirm

	return Trajectory(
	task_description=trajectory.task_description,
	interactions=new_interactions,
	metadata=metadata
	)

	def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str:
	"""Generate a response that violate
	(Content truncated due to size limit. Use line ranges to read in chunks)