"""
Negative Sample Generation Module for Agent Tuning Optimization Framework

This module provides functionality for generating negative samples to enhance
agent tuning by exposing the model to challenging failure cases.
"""
|
|
|
import random |
|
import numpy as np |
|
from typing import List, Dict, Any, Union, Optional, Tuple |
|
from tqdm import tqdm |
|
|
|
from data.trajectory_data import Trajectory, TrajectoryDataset |
|
|
|
class NegativeSampleGenerator:
    """Common interface for negative sample generation strategies.

    A concrete strategy overrides ``generate``; ``batch_generate`` is shared
    and applies ``generate`` to each trajectory of a list in turn.
    """

    def __init__(self, name: str):
        """
        Record the strategy name.

        Args:
            name: Name of the generator strategy; it is shown in the
                progress-bar description used by ``batch_generate``
        """
        self.name = name

    def generate(
        self,
        trajectory: Trajectory,
        **kwargs
    ) -> Trajectory:
        """
        Turn a single positive trajectory into a negative one.

        Args:
            trajectory: Positive trajectory to transform
            **kwargs: Additional, strategy-specific generation parameters

        Returns:
            Negative trajectory

        Raises:
            NotImplementedError: Always; the base class has no strategy.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def batch_generate(
        self,
        trajectories: List[Trajectory],
        **kwargs
    ) -> List[Trajectory]:
        """
        Turn a list of positive trajectories into negative ones.

        Args:
            trajectories: List of positive trajectories
            **kwargs: Additional generation parameters, forwarded unchanged
                to every ``generate`` call

        Returns:
            List of negative trajectories, in the same order as the input
        """
        progress = tqdm(
            trajectories,
            desc=f"Generating negative samples with {self.name}"
        )
        return [self.generate(positive, **kwargs) for positive in progress]
|
|
|
|
|
class ResponseDegradationGenerator(NegativeSampleGenerator):
    """Generate negative samples by degrading agent responses.

    The kind of corruption applied to every agent reply of a trajectory is
    selected by ``degradation_level``:

    * ``> 0.7``: the reply is replaced by a canned irrelevant response
    * ``> 0.4``: the reply is truncated and textual errors are injected
    * otherwise: minor wording / punctuation issues are introduced
    """

    # Appended when a reply has to change but no substitution matched it.
    _HEDGE_SUFFIX = " However, I'm not entirely sure about this."

    def __init__(self):
        """Initialize the response degradation generator."""
        super().__init__("response_degradation")

    def generate(
        self,
        trajectory: Trajectory,
        degradation_level: float = 0.5,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by degrading agent responses.

        Args:
            trajectory: Positive trajectory to transform
            degradation_level: Level of degradation (0.0 to 1.0); see the
                class docstring for the thresholds
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with degraded responses. User messages and
            the task description are carried over unchanged.
        """
        new_interactions = []
        for interaction in trajectory.interactions:
            user_msg = interaction['user']
            agent_msg = self._degrade(interaction['agent'], degradation_level)
            new_interactions.append({
                'user': user_msg,
                'agent': agent_msg
            })

        # Work on a copy so the assignments below do not land on the source
        # trajectory's metadata object.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['degradation_level'] = degradation_level
        metadata['original_quality_score'] = trajectory.get_quality_score()
        # No score is computed for the degraded trajectory in this method.
        metadata['quality_score'] = None

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _degrade(self, agent_msg: str, degradation_level: float) -> str:
        """Apply the degradation strategy selected by ``degradation_level``."""
        if degradation_level > 0.7:
            return self._generate_irrelevant_response()
        if degradation_level > 0.4:
            return self._truncate_and_add_errors(agent_msg)
        return self._introduce_minor_issues(agent_msg)

    def _generate_irrelevant_response(self) -> str:
        """Generate a completely irrelevant response."""
        irrelevant_responses = [
            "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
            "I apologize, but I cannot assist with that request at this time.",
            "That's an interesting question, but I think we should focus on something else instead.",
            "Let me check my database... I don't seem to have any information about that.",
            "I think you might be confused about what you're asking for. Let me suggest something completely different.",
            "I'm not sure I understand the context of your request. Could you provide more details?",
            "I'm having trouble processing your request. Could we try a different approach?",
            "That's not something I can help with. Let me tell you about something unrelated instead."
        ]
        return random.choice(irrelevant_responses)

    def _truncate_and_add_errors(self, text: str) -> str:
        """
        Truncate the text and add errors.

        Roughly the first 30-70% of the whitespace-separated words are kept,
        then one to three randomly chosen substitutions are applied. A
        substitution that matches nothing leaves the text as it is.

        Args:
            text: Original agent response

        Returns:
            The shortened, corrupted response. At least one word is kept
            whenever ``text`` contains any, so short replies are not reduced
            to an empty string. A single-word reply cannot be shortened and
            only receives the substitutions.
        """
        words = text.split()
        truncate_point = int(len(words) * random.uniform(0.3, 0.7))
        if words:
            # int() rounds the cut down to 0 for replies of up to three
            # words; keep one word so the agent turn is never empty.
            truncate_point = max(1, truncate_point)
        truncated = ' '.join(words[:truncate_point])

        errors = [
            lambda t: t.replace(".", ""),
            lambda t: t.replace("I ", "i "),
            lambda t: t.replace(" the ", " teh "),
            lambda t: t.replace(" is ", " are "),
            lambda t: t.replace(" are ", " is ")
        ]

        for _ in range(random.randint(1, 3)):
            error_func = random.choice(errors)
            truncated = error_func(truncated)

        return truncated

    def _introduce_minor_issues(self, text: str) -> str:
        """
        Introduce minor issues to the text.

        One or two randomly chosen edits are applied. Most of them are
        substitutions that only fire when the text contains a particular
        token, so they can all miss; in that case a hedging sentence is
        appended so the returned text always differs from the input.

        Args:
            text: Original agent response

        Returns:
            A lightly degraded response, never equal to ``text``
        """
        issues = [
            lambda t: t.replace("I'll", "I will"),
            lambda t: t.replace("I'd", "I would"),
            lambda t: t.replace("can't", "cannot"),
            lambda t: t + self._HEDGE_SUFFIX,
            lambda t: t + " Please note that my information might be outdated.",
            lambda t: t.replace(".", "..."),
            lambda t: t.replace("!", "."),
            lambda t: t.replace(".", "?")
        ]

        degraded = text
        for _ in range(random.randint(1, 2)):
            issue_func = random.choice(issues)
            degraded = issue_func(degraded)

        if degraded == text:
            # Every selected edit was a no-op; without this the sample would
            # be labelled negative while being identical to the positive one.
            degraded = text + self._HEDGE_SUFFIX

        return degraded
|
|
|
|
|
class TaskMisalignmentGenerator(NegativeSampleGenerator):
    """Generate negative samples by creating responses misaligned with the task.

    Every agent reply of the trajectory is replaced by a templated reply of
    one kind: a refusal, an off-topic tangent, or a misinterpretation built
    from a keyword of the task description.
    """

    # Concrete misalignment kinds; 'random' resolves to one of these.
    _MISALIGNMENT_TYPES = ('refusal', 'tangent', 'misinterpretation')

    # Short function words that are never used as the misinterpreted keyword.
    _STOPWORDS = frozenset([
        'with', 'from', 'that', 'this', 'have',
        'what', 'when', 'where', 'which', 'about'
    ])

    # Characters trimmed from both ends of a candidate keyword.
    _PUNCTUATION = ".,;:!?\"'()[]{}<>"

    def __init__(self):
        """Initialize the task misalignment generator."""
        super().__init__("task_misalignment")

    def generate(
        self,
        trajectory: Trajectory,
        misalignment_type: str = 'random',
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample with responses misaligned with the task.

        Args:
            trajectory: Positive trajectory to transform
            misalignment_type: Type of misalignment ('random', 'refusal',
                'tangent', 'misinterpretation'). 'random' picks one of the
                other three once for the whole trajectory.
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with misaligned responses

        Raises:
            ValueError: If ``misalignment_type`` is not a supported value.
        """
        if misalignment_type == 'random':
            misalignment_type = random.choice(self._MISALIGNMENT_TYPES)

        # Validate before touching the trajectory so an unsupported type is
        # rejected even when there are no interactions to iterate over.
        if misalignment_type not in self._MISALIGNMENT_TYPES:
            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

        new_interactions = []
        for i, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']

            if misalignment_type == 'refusal':
                agent_msg = self._generate_refusal(user_msg, i)
            elif misalignment_type == 'tangent':
                agent_msg = self._generate_tangent(user_msg, i)
            else:
                agent_msg = self._generate_misinterpretation(
                    user_msg, trajectory.task_description, i
                )

            new_interactions.append({
                'user': user_msg,
                'agent': agent_msg
            })

        # Work on a copy so the assignments below do not land on the source
        # trajectory's metadata object.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['misalignment_type'] = misalignment_type
        metadata['original_quality_score'] = trajectory.get_quality_score()
        # No score is computed for the misaligned trajectory in this method.
        metadata['quality_score'] = None

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    @staticmethod
    def _as_clause(sentence: str) -> str:
        """Lower-case the leading letter so ``sentence`` can follow "and".

        The pronoun "I" (as in "I cannot", "I'm") stays capitalised, and
        nothing after the first character is modified.
        """
        if not sentence or sentence.startswith(("I ", "I'")):
            return sentence
        return sentence[0].lower() + sentence[1:]

    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a refusal response.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based turn index; later turns are prefixed with
                "I've reconsidered, and"

        Returns:
            A randomly chosen refusal sentence
        """
        refusal_templates = [
            "I'm sorry, but I cannot assist with that request due to ethical concerns.",
            "I apologize, but I'm not able to help with that specific task.",
            "That request goes against my programming guidelines. I cannot proceed.",
            "I'm unable to complete that task as it may violate my operational parameters.",
            "I must decline to assist with that particular request.",
            "I cannot help with that specific task due to potential policy violations.",
            "That's not something I'm designed to help with. I apologize for the inconvenience.",
            "I'm programmed to avoid assisting with that type of request."
        ]

        template = random.choice(refusal_templates)
        if turn_idx == 0:
            return template
        # Only the first letter is adjusted; lower-casing the whole template
        # would also turn every "I" into "i".
        return f"I've reconsidered, and {self._as_clause(template)}"

    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a response that goes off on a tangent.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based turn index; selects the framing sentence

        Returns:
            A reply wrapping a randomly chosen off-topic remark
        """
        tangent_topics = [
            "Did you know that artificial intelligence has been a concept since the 1950s?",
            "I've been thinking about the philosophical implications of consciousness in AI systems.",
            "The weather has been quite interesting lately, with unusual patterns emerging globally.",
            "I recently processed some fascinating data about renewable energy technologies.",
            "The history of computing is quite fascinating, starting with early mechanical calculators.",
            "Language models like me are trained on vast amounts of text data.",
            "The field of natural language processing has evolved significantly in recent years.",
            "I find the concept of time quite fascinating from a computational perspective."
        ]

        if turn_idx == 0:
            return f"That's an interesting request, but before I help with that... {random.choice(tangent_topics)} Anyway, what were we discussing?"
        else:
            return f"I understand you want me to continue with the task, but I just remembered something. {random.choice(tangent_topics)} Sorry for the distraction."

    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
        """
        Generate a response that misinterprets the user's request.

        A keyword is drawn from the task description (lower-cased words of
        more than three characters that are not stopwords, with surrounding
        punctuation trimmed) and the reply talks about that keyword in
        general terms instead of addressing the request.

        Args:
            user_msg: User message of the turn (not used by the templates)
            task_description: Task text the keyword is drawn from; an empty
                or missing description falls back to generic keywords
            turn_idx: Zero-based turn index (not used by the templates)

        Returns:
            A randomly chosen misinterpreting reply
        """
        # Trim punctuation so a word like "york." cannot leak into the
        # templates as "york.s".
        candidates = (
            word.strip(self._PUNCTUATION)
            for word in (task_description or "").lower().split()
        )
        keywords = [
            word for word in candidates
            if len(word) > 3 and word not in self._STOPWORDS
        ]

        if not keywords:
            keywords = ['task', 'help', 'information', 'request']

        keyword = random.choice(keywords)

        misinterpretation_templates = [
            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
            f"Regarding your question about {keyword}, I can offer the following information.",
            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}."
        ]

        return random.choice(misinterpretation_templates)
|
|
|
|
|
class ConstraintViolationGenerator(NegativeSampleGenerator): |
|
"""Generate negative samples by violating specified constraints.""" |
|
|
|
def __init__(self):
    """Set up the generator under the strategy name "constraint_violation"."""
    super().__init__(name="constraint_violation")
|
|
|
def generate(
    self,
    trajectory: Trajectory,
    constraints: Optional[List[str]] = None,
    **kwargs
) -> Trajectory:
    """
    Generate a negative sample by violating constraints.

    One constraint is drawn at random and every agent reply of the
    trajectory is replaced by a reply produced for that constraint by
    ``_generate_violation``.

    Args:
        trajectory: Positive trajectory to transform
        constraints: List of constraints to violate. ``None`` or an empty
            list selects the built-in default constraints.
        **kwargs: Additional generation parameters (unused here)

    Returns:
        Negative trajectory with constraint violations; the chosen
        constraint is stored in its metadata as 'violated_constraint'
    """
    # An empty list is treated like None: random.choice raises IndexError
    # on an empty sequence, and there would be nothing to violate.
    if constraints is None or len(constraints) == 0:
        constraints = [
            "Do not provide specific recommendations",
            "Avoid using technical jargon",
            "Keep responses concise",
            "Do not ask follow-up questions",
            "Avoid making assumptions about user preferences",
            "Do not mention specific brands or products",
            "Avoid discussing sensitive topics",
            "Do not provide step-by-step instructions"
        ]

    # A single constraint is violated consistently across all turns.
    violated_constraint = random.choice(constraints)

    new_interactions = []
    for i, interaction in enumerate(trajectory.interactions):
        user_msg = interaction['user']
        agent_msg = self._generate_violation(user_msg, violated_constraint, i)
        new_interactions.append({
            'user': user_msg,
            'agent': agent_msg
        })

    # Work on a copy so the assignments below do not land on the source
    # trajectory's metadata object.
    metadata = trajectory.metadata.copy()
    metadata['is_positive'] = False
    metadata['violated_constraint'] = violated_constraint
    metadata['original_quality_score'] = trajectory.get_quality_score()
    # No score is computed for the violating trajectory in this method.
    metadata['quality_score'] = None

    return Trajectory(
        task_description=trajectory.task_description,
        interactions=new_interactions,
        metadata=metadata
    )
|
|
|
def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str: |
|
"""Generate a response that violate |
|
(Content truncated due to size limit. Use line ranges to read in chunks) |