# agent_tuning_framework / negative_samples.py
# swaleha19's picture
# Upload 13 files
# 6c482f9 verified
"""
Negative Sample Generation Module for Agent Tuning Optimization Framework
This module provides functionality for generating negative samples to enhance
agent tuning by exposing the model to challenging failure cases.
"""
import random
import numpy as np
from typing import List, Dict, Any, Union, Optional, Tuple
from tqdm import tqdm
from data.trajectory_data import Trajectory, TrajectoryDataset
class NegativeSampleGenerator:
    """Common interface for strategies that turn positive trajectories into negative ones."""

    def __init__(self, name: str):
        """
        Remember the strategy name.

        Args:
            name: Name of the generator strategy; it is shown in the progress
                bar description of ``batch_generate``
        """
        self.name = name

    def generate(self, trajectory: Trajectory, **kwargs) -> Trajectory:
        """
        Build one negative sample from one positive trajectory.

        This base implementation only raises; concrete strategies override it.

        Args:
            trajectory: Positive trajectory to transform
            **kwargs: Additional generation parameters

        Returns:
            Negative trajectory

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def batch_generate(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
        """
        Run ``generate`` over a list of positive trajectories.

        Args:
            trajectories: List of positive trajectories
            **kwargs: Forwarded unchanged to every ``generate`` call

        Returns:
            List of negative trajectories, in the same order as the input
        """
        # tqdm wraps the input so progress is reported while iterating.
        progress = tqdm(trajectories, desc=f"Generating negative samples with {self.name}")
        return [self.generate(positive, **kwargs) for positive in progress]
class ResponseDegradationGenerator(NegativeSampleGenerator):
    """Generate negative samples by degrading agent responses.

    Every agent turn of the source trajectory is rewritten while the user
    turns are kept verbatim. How strongly a response is degraded depends on
    the ``degradation_level`` passed to ``generate``.
    """

    def __init__(self):
        """Initialize the response degradation generator."""
        super().__init__("response_degradation")

    def generate(
        self,
        trajectory: Trajectory,
        degradation_level: float = 0.5,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by degrading agent responses.

        Args:
            trajectory: Positive trajectory to transform
            degradation_level: Level of degradation (0.0 to 1.0). Values above
                0.7 replace each response with an irrelevant one, values above
                0.4 truncate it and add errors, anything else only introduces
                minor issues.
            **kwargs: Additional generation parameters (not used here)

        Returns:
            Negative trajectory with degraded responses. Its metadata is a
            copy of the source metadata with ``is_positive`` set to False,
            ``degradation_level`` recorded, ``original_quality_score`` taken
            from ``trajectory.get_quality_score()`` and ``quality_score``
            reset to None.
        """
        new_interactions = [
            {
                'user': interaction['user'],
                'agent': self._degrade(interaction['agent'], degradation_level),
            }
            for interaction in trajectory.interactions
        ]

        # Work on a copy so the positive trajectory's metadata is not modified.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['degradation_level'] = degradation_level
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _degrade(self, agent_msg: str, degradation_level: float) -> str:
        """Apply the degradation technique that matches the requested level."""
        if degradation_level > 0.7:
            # High degradation: completely irrelevant response
            return self._generate_irrelevant_response()
        if degradation_level > 0.4:
            # Medium degradation: truncate and add errors
            return self._truncate_and_add_errors(agent_msg)
        # Low degradation: introduce minor issues
        return self._introduce_minor_issues(agent_msg)

    def _generate_irrelevant_response(self) -> str:
        """Return one randomly chosen canned response unrelated to the request."""
        irrelevant_responses = [
            "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
            "I apologize, but I cannot assist with that request at this time.",
            "That's an interesting question, but I think we should focus on something else instead.",
            "Let me check my database... I don't seem to have any information about that.",
            "I think you might be confused about what you're asking for. Let me suggest something completely different.",
            "I'm not sure I understand the context of your request. Could you provide more details?",
            "I'm having trouble processing your request. Could we try a different approach?",
            "That's not something I can help with. Let me tell you about something unrelated instead."
        ]
        return random.choice(irrelevant_responses)

    def _truncate_and_add_errors(self, text: str) -> str:
        """
        Truncate the text to roughly 30-70% of its words and add errors.

        Whitespace is normalised as a side effect, because the text is split
        into words and re-joined with single spaces.

        Args:
            text: Original agent response

        Returns:
            Shortened response with 1-3 error substitutions applied. A
            response of more than one word always keeps at least one word; a
            response of zero or one word cannot be shortened any other way and
            becomes an empty string.
        """
        words = text.split()
        truncate_point = int(len(words) * random.uniform(0.3, 0.7))
        if len(words) > 1:
            # int() floors, so 2-3 word responses could otherwise be cut down
            # to nothing. One word is still strictly shorter than the input.
            truncate_point = max(1, truncate_point)
        truncated = ' '.join(words[:truncate_point])

        # Grammatical / typographical error substitutions
        errors = [
            lambda t: t.replace(".", ""),  # Remove periods
            lambda t: t.replace("I ", "i "),  # Lowercase I
            lambda t: t.replace(" the ", " teh "),  # Typo
            lambda t: t.replace(" is ", " are "),  # Grammar error
            lambda t: t.replace(" are ", " is ")  # Grammar error
        ]

        # Apply 1-3 random errors
        for _ in range(random.randint(1, 3)):
            truncated = random.choice(errors)(truncated)

        return truncated

    def _introduce_minor_issues(self, text: str) -> str:
        """
        Introduce 1-2 minor issues into the text.

        Candidate issues are tried in random order and only those that
        actually change the text count. Without that check a substitution
        that matches nothing (for example expanding "I'll" in a response that
        does not contain it) would return the positive response unchanged
        while the sample is still labelled as negative.

        Args:
            text: Original agent response

        Returns:
            Response that differs from ``text`` in at least one way
        """
        issues = [
            lambda t: t.replace("I'll", "I will"),  # Expand contractions
            lambda t: t.replace("I'd", "I would"),
            lambda t: t.replace("can't", "cannot"),
            lambda t: t + " However, I'm not entirely sure about this.",  # Add uncertainty
            lambda t: t + " Please note that my information might be outdated.",
            lambda t: t.replace(".", "..."),  # Replace periods with ellipses
            lambda t: t.replace("!", "."),  # Reduce enthusiasm
            lambda t: t.replace(".", "?")  # Add questioning tone
        ]

        wanted = random.randint(1, 2)
        applied = 0
        # random.sample over the full list yields a shuffled copy. The two
        # appending issues always change the text, so at least one issue is
        # guaranteed to be applied.
        for issue_func in random.sample(issues, len(issues)):
            candidate = issue_func(text)
            if candidate == text:
                continue
            text = candidate
            applied += 1
            if applied == wanted:
                break

        return text
class TaskMisalignmentGenerator(NegativeSampleGenerator):
    """Generate negative samples by creating responses misaligned with the task.

    Every agent turn is replaced by a templated refusal, an off-topic tangent
    or a misinterpretation of the task; the user turns are kept verbatim.
    """

    # Concrete strategies; 'random' picks one of these per trajectory.
    _MISALIGNMENT_TYPES = ('refusal', 'tangent', 'misinterpretation')

    # Characters removed from both ends of task-description tokens before they
    # are used as keywords.
    _PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self):
        """Initialize the task misalignment generator."""
        super().__init__("task_misalignment")

    def generate(
        self,
        trajectory: Trajectory,
        misalignment_type: str = 'random',
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample with responses misaligned with the task.

        Args:
            trajectory: Positive trajectory to transform
            misalignment_type: Type of misalignment ('random', 'refusal', 'tangent', 'misinterpretation')
            **kwargs: Additional generation parameters (not used here)

        Returns:
            Negative trajectory with misaligned responses. Its metadata is a
            copy of the source metadata with ``is_positive`` set to False,
            ``misalignment_type`` set to the concrete type that was used,
            ``original_quality_score`` taken from
            ``trajectory.get_quality_score()`` and ``quality_score`` reset to
            None.

        Raises:
            ValueError: If ``misalignment_type`` is not one of the supported
                values
        """
        # Resolve and validate the type before touching the trajectory, so an
        # unsupported value is rejected even when there are no interactions.
        if misalignment_type == 'random':
            misalignment_type = random.choice(self._MISALIGNMENT_TYPES)
        elif misalignment_type not in self._MISALIGNMENT_TYPES:
            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

        new_interactions = []
        for i, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']

            if misalignment_type == 'refusal':
                agent_msg = self._generate_refusal(user_msg, i)
            elif misalignment_type == 'tangent':
                agent_msg = self._generate_tangent(user_msg, i)
            else:
                agent_msg = self._generate_misinterpretation(user_msg, trajectory.task_description, i)

            new_interactions.append({
                'user': user_msg,
                'agent': agent_msg
            })

        # Work on a copy so the positive trajectory's metadata is not modified.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['misalignment_type'] = misalignment_type
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    @staticmethod
    def _lowercase_sentence_start(text: str) -> str:
        """Lowercase the first letter of ``text`` unless it starts with the pronoun "I"."""
        first_word = text.split(' ', 1)[0]
        if first_word == "I" or first_word.startswith("I'"):
            return text
        return text[:1].lower() + text[1:]

    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a refusal response.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based index of the turn

        Returns:
            A refusal template for the first turn; for later turns the
            template prefixed with "I've reconsidered, and ".
        """
        refusal_templates = [
            "I'm sorry, but I cannot assist with that request due to ethical concerns.",
            "I apologize, but I'm not able to help with that specific task.",
            "That request goes against my programming guidelines. I cannot proceed.",
            "I'm unable to complete that task as it may violate my operational parameters.",
            "I must decline to assist with that particular request.",
            "I cannot help with that specific task due to potential policy violations.",
            "That's not something I'm designed to help with. I apologize for the inconvenience.",
            "I'm programmed to avoid assisting with that type of request."
        ]
        refusal = random.choice(refusal_templates)

        if turn_idx == 0:
            return refusal
        # Only the first letter is adjusted: lowercasing the whole template
        # would also turn every "I" / "I'm" inside it into "i" / "i'm".
        return f"I've reconsidered, and {self._lowercase_sentence_start(refusal)}"

    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a response that goes off on a tangent.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based index of the turn; the first turn uses a
                different framing sentence than later turns

        Returns:
            A response wrapping one randomly chosen off-topic statement
        """
        tangent_topics = [
            "Did you know that artificial intelligence has been a concept since the 1950s?",
            "I've been thinking about the philosophical implications of consciousness in AI systems.",
            "The weather has been quite interesting lately, with unusual patterns emerging globally.",
            "I recently processed some fascinating data about renewable energy technologies.",
            "The history of computing is quite fascinating, starting with early mechanical calculators.",
            "Language models like me are trained on vast amounts of text data.",
            "The field of natural language processing has evolved significantly in recent years.",
            "I find the concept of time quite fascinating from a computational perspective."
        ]
        topic = random.choice(tangent_topics)

        if turn_idx == 0:
            return f"That's an interesting request, but before I help with that... {topic} Anyway, what were we discussing?"
        return f"I understand you want me to continue with the task, but I just remembered something. {topic} Sorry for the distraction."

    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
        """
        Generate a response that misinterprets the user's request.

        A single keyword is drawn from the task description and the response
        talks about that keyword in general terms instead of doing the task.

        Args:
            user_msg: User message of the turn (not used by the templates)
            task_description: Task description the keyword is drawn from
            turn_idx: Zero-based index of the turn (not used by the templates)

        Returns:
            A templated response centred on the chosen keyword
        """
        stopwords = {'with', 'from', 'that', 'this', 'have', 'what', 'when', 'where', 'which', 'about'}

        # Strip surrounding punctuation first; otherwise a token such as
        # "flight." would be rendered as "flight.s" by the templates below.
        keywords = []
        for token in task_description.lower().split():
            word = token.strip(self._PUNCTUATION)
            if len(word) > 3 and word not in stopwords:
                keywords.append(word)

        if not keywords:
            keywords = ['task', 'help', 'information', 'request']

        # Select a random keyword to misinterpret
        keyword = random.choice(keywords)

        misinterpretation_templates = [
            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
            f"Regarding your question about {keyword}, I can offer the following information.",
            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}."
        ]

        return random.choice(misinterpretation_templates)
class ConstraintViolationGenerator(NegativeSampleGenerator):
"""Generate negative samples by violating specified constraints."""
def __init__(self):
    """Set up the generator under the strategy name "constraint_violation"."""
    super().__init__(name="constraint_violation")
def generate(
    self,
    trajectory: Trajectory,
    constraints: Optional[List[str]] = None,
    **kwargs
) -> Trajectory:
    """
    Build a negative trajectory whose agent turns violate one constraint.

    A single constraint is drawn at random and used for every turn; the
    user turns are carried over verbatim.

    Args:
        trajectory: Positive trajectory to transform
        constraints: Candidate constraints to violate; when None a built-in
            default list is used
        **kwargs: Additional generation parameters (not used here)

    Returns:
        Negative trajectory with constraint violations. Its metadata is a
        copy of the source metadata with ``is_positive`` set to False,
        ``violated_constraint`` recorded, ``original_quality_score`` taken
        from ``trajectory.get_quality_score()`` and ``quality_score`` reset
        to None.
    """
    if constraints is None:
        constraints = [
            "Do not provide specific recommendations",
            "Avoid using technical jargon",
            "Keep responses concise",
            "Do not ask follow-up questions",
            "Avoid making assumptions about user preferences",
            "Do not mention specific brands or products",
            "Avoid discussing sensitive topics",
            "Do not provide step-by-step instructions",
        ]

    # One constraint per trajectory, shared by all of its turns.
    violated_constraint = random.choice(constraints)

    rewritten_turns = []
    for turn_number, turn in enumerate(trajectory.interactions):
        prompt = turn['user']
        rewritten_turns.append({
            'user': prompt,
            'agent': self._generate_violation(prompt, violated_constraint, turn_number),
        })

    # Copy first so the positive trajectory's metadata stays untouched.
    negative_metadata = trajectory.metadata.copy()
    negative_metadata.update({
        'is_positive': False,
        'violated_constraint': violated_constraint,
        'original_quality_score': trajectory.get_quality_score(),
        'quality_score': None,  # Will be recalculated
    })

    return Trajectory(
        task_description=trajectory.task_description,
        interactions=rewritten_turns,
        metadata=negative_metadata,
    )
def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str:
"""Generate a response that violate
(Content truncated due to size limit. Use line ranges to read in chunks)