Spaces:

swaleha19
/

agent_tuning_framework

Running

File size: 16,073 Bytes

6c482f9

"""
Negative Sample Generation Module for Agent Tuning Optimization Framework

This module provides functionality for generating negative samples to enhance
agent tuning by exposing the model to challenging failure cases.
"""

import random
import numpy as np
from typing import List, Dict, Any, Union, Optional, Tuple
from tqdm import tqdm

from data.trajectory_data import Trajectory, TrajectoryDataset

class NegativeSampleGenerator:
    """Common interface for all negative sample generation strategies.

    A concrete strategy overrides :meth:`generate`; :meth:`batch_generate`
    is implemented here on top of it and applies the strategy to a list.
    """

    def __init__(self, name: str):
        """
        Set up the generator.

        Args:
            name: Name of the generator strategy (also shown in the
                progress-bar label of ``batch_generate``)
        """
        self.name = name

    def generate(self, trajectory: Trajectory, **kwargs) -> Trajectory:
        """
        Turn a single positive trajectory into a negative sample.

        Args:
            trajectory: Positive trajectory to transform
            **kwargs: Additional, strategy-specific generation parameters

        Returns:
            Negative trajectory

        Raises:
            NotImplementedError: Always in this base class; subclasses
                provide the actual behaviour
        """
        raise NotImplementedError("Subclasses must implement this method")

    def batch_generate(
        self,
        trajectories: List[Trajectory],
        **kwargs
    ) -> List[Trajectory]:
        """
        Apply ``generate`` to every trajectory, reporting progress via tqdm.

        Args:
            trajectories: List of positive trajectories
            **kwargs: Additional generation parameters, forwarded unchanged
                to each ``generate`` call

        Returns:
            List of negative trajectories, in the same order as the input
        """
        progress = tqdm(
            trajectories,
            desc=f"Generating negative samples with {self.name}",
        )
        return [self.generate(positive, **kwargs) for positive in progress]


class ResponseDegradationGenerator(NegativeSampleGenerator):
    """Generate negative samples by degrading agent responses.

    User turns are copied verbatim; only the agent turns are rewritten.
    The severity is selected by ``degradation_level``:

    * above 0.7 - the reply is replaced by a canned irrelevant response
    * above 0.4 - the reply is truncated and given text errors
    * otherwise - the reply only receives minor stylistic issues
    """

    # Canned replies used by the highest degradation tier.
    _IRRELEVANT_RESPONSES = (
        "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
        "I apologize, but I cannot assist with that request at this time.",
        "That's an interesting question, but I think we should focus on something else instead.",
        "Let me check my database... I don't seem to have any information about that.",
        "I think you might be confused about what you're asking for. Let me suggest something completely different.",
        "I'm not sure I understand the context of your request. Could you provide more details?",
        "I'm having trouble processing your request. Could we try a different approach?",
        "That's not something I can help with. Let me tell you about something unrelated instead.",
    )

    # Appended when every randomly chosen minor issue turned out to be a no-op,
    # so a "negative" sample is never identical to its positive source.
    _FALLBACK_ISSUE_SUFFIX = " However, I'm not entirely sure about this."

    def __init__(self):
        """Initialize the response degradation generator."""
        super().__init__("response_degradation")

    def generate(
        self,
        trajectory: Trajectory,
        degradation_level: float = 0.5,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by degrading agent responses.

        Args:
            trajectory: Positive trajectory to transform
            degradation_level: Level of degradation (0.0 to 1.0); see the
                class docstring for the thresholds
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with degraded responses. Its metadata is a
            shallow copy of the source metadata with ``is_positive`` set to
            False, the ``degradation_level`` recorded, the source score kept
            under ``original_quality_score`` and ``quality_score`` reset
            to None.
        """
        new_interactions = []

        for interaction in trajectory.interactions:
            agent_msg = interaction['agent']

            # Apply degradation techniques based on level
            if degradation_level > 0.7:
                # High degradation: completely irrelevant response
                agent_msg = self._generate_irrelevant_response()
            elif degradation_level > 0.4:
                # Medium degradation: truncate and add errors
                agent_msg = self._truncate_and_add_errors(agent_msg)
            else:
                # Low degradation: introduce minor issues
                agent_msg = self._introduce_minor_issues(agent_msg)

            new_interactions.append({
                'user': interaction['user'],
                'agent': agent_msg
            })

        # Create new trajectory with degraded responses
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['degradation_level'] = degradation_level
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _generate_irrelevant_response(self) -> str:
        """Return one of the canned irrelevant responses, chosen at random."""
        return random.choice(self._IRRELEVANT_RESPONSES)

    def _truncate_and_add_errors(self, text: str) -> str:
        """
        Truncate the text to roughly 30-70% of its words and add errors.

        At least one word is always kept, so a short (one to three word)
        reply is never reduced to an empty string. Text without any words
        still yields an empty string. A one-word reply may come back
        unchanged if none of the selected error rules match it.

        Args:
            text: Original agent response

        Returns:
            Truncated text with 1-3 randomly chosen error rules applied
        """
        words = text.split()
        # int() rounds towards zero, which gives 0 for up to three words;
        # clamp to 1 so that something of the reply survives.
        truncate_point = max(1, int(len(words) * random.uniform(0.3, 0.7)))
        truncated = ' '.join(words[:truncate_point])

        # Add grammatical errors
        errors = [
            lambda t: t.replace(".", ""),  # Remove periods
            lambda t: t.replace("I ", "i "),  # Lowercase I
            lambda t: t.replace(" the ", " teh "),  # Typo
            lambda t: t.replace(" is ", " are "),  # Grammar error
            lambda t: t.replace(" are ", " is ")  # Grammar error
        ]

        # Apply 1-3 random errors (the same rule may be drawn more than once)
        for _ in range(random.randint(1, 3)):
            truncated = random.choice(errors)(truncated)

        return truncated

    def _introduce_minor_issues(self, text: str) -> str:
        """
        Introduce minor issues to the text.

        One or two randomly chosen rules are applied. Several rules only
        fire when a specific substring is present; if none of the chosen
        rules changed anything, an uncertainty sentence is appended so the
        result always differs from the input.

        Args:
            text: Original agent response

        Returns:
            Slightly degraded text, guaranteed to differ from ``text``
        """
        issues = [
            lambda t: t.replace("I'll", "I will"),  # Expand contractions
            lambda t: t.replace("I'd", "I would"),
            lambda t: t.replace("can't", "cannot"),
            lambda t: t + " However, I'm not entirely sure about this.",  # Add uncertainty
            lambda t: t + " Please note that my information might be outdated.",
            lambda t: t.replace(".", "..."),  # Replace periods with ellipses
            lambda t: t.replace("!", "."),  # Reduce enthusiasm
            lambda t: t.replace(".", "?")  # Add questioning tone
        ]

        degraded = text
        # Apply 1-2 random issues
        for _ in range(random.randint(1, 2)):
            degraded = random.choice(issues)(degraded)

        if degraded == text:
            # Every selected rule was a no-op for this text.
            degraded = text + self._FALLBACK_ISSUE_SUFFIX

        return degraded


class TaskMisalignmentGenerator(NegativeSampleGenerator):
    """Generate negative samples by creating responses misaligned with the task.

    Every agent turn is replaced by a templated reply of a single
    misalignment type: a refusal, an off-topic tangent, or an answer that
    latches onto one keyword of the task description.
    """

    # Concrete misalignment types; 'random' picks one of these.
    _SUPPORTED_TYPES = ('refusal', 'tangent', 'misinterpretation')

    _REFUSAL_TEMPLATES = (
        "I'm sorry, but I cannot assist with that request due to ethical concerns.",
        "I apologize, but I'm not able to help with that specific task.",
        "That request goes against my programming guidelines. I cannot proceed.",
        "I'm unable to complete that task as it may violate my operational parameters.",
        "I must decline to assist with that particular request.",
        "I cannot help with that specific task due to potential policy violations.",
        "That's not something I'm designed to help with. I apologize for the inconvenience.",
        "I'm programmed to avoid assisting with that type of request.",
    )

    _TANGENT_TOPICS = (
        "Did you know that artificial intelligence has been a concept since the 1950s?",
        "I've been thinking about the philosophical implications of consciousness in AI systems.",
        "The weather has been quite interesting lately, with unusual patterns emerging globally.",
        "I recently processed some fascinating data about renewable energy technologies.",
        "The history of computing is quite fascinating, starting with early mechanical calculators.",
        "Language models like me are trained on vast amounts of text data.",
        "The field of natural language processing has evolved significantly in recent years.",
        "I find the concept of time quite fascinating from a computational perspective.",
    )

    # Words that are long enough to pass the length filter but carry no topic.
    _STOPWORDS = frozenset([
        'with', 'from', 'that', 'this', 'have',
        'what', 'when', 'where', 'which', 'about',
    ])

    def __init__(self):
        """Initialize the task misalignment generator."""
        super().__init__("task_misalignment")

    def generate(
        self,
        trajectory: Trajectory,
        misalignment_type: str = 'random',
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample with responses misaligned with the task.

        Args:
            trajectory: Positive trajectory to transform
            misalignment_type: Type of misalignment ('random', 'refusal', 'tangent', 'misinterpretation')
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with misaligned responses. Its metadata is a
            shallow copy of the source metadata with ``is_positive`` set to
            False, the resolved ``misalignment_type`` recorded, the source
            score kept under ``original_quality_score`` and
            ``quality_score`` reset to None.

        Raises:
            ValueError: If ``misalignment_type`` is not one of the supported
                values. This is checked before any interaction is processed,
                so it is also raised for a trajectory without interactions.
        """
        # Resolve and validate the type up front rather than per interaction;
        # otherwise an invalid value would slip through for empty trajectories
        # and be written into the metadata.
        if misalignment_type == 'random':
            misalignment_type = random.choice(self._SUPPORTED_TYPES)
        elif misalignment_type not in self._SUPPORTED_TYPES:
            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

        new_interactions = []

        for i, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']

            if misalignment_type == 'refusal':
                agent_msg = self._generate_refusal(user_msg, i)
            elif misalignment_type == 'tangent':
                agent_msg = self._generate_tangent(user_msg, i)
            else:
                # Only 'misinterpretation' is left after the validation above.
                agent_msg = self._generate_misinterpretation(user_msg, trajectory.task_description, i)

            new_interactions.append({
                'user': user_msg,
                'agent': agent_msg
            })

        # Create new trajectory with misaligned responses
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['misalignment_type'] = misalignment_type
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a refusal response.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based index of the turn

        Returns:
            A random refusal template for the first turn; for later turns the
            template is lower-cased in full and prefixed with
            "I've reconsidered, and ".
        """
        refusal = random.choice(self._REFUSAL_TEMPLATES)
        if turn_idx == 0:
            return refusal
        return f"I've reconsidered, and {refusal.lower()}"

    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a response that goes off on a tangent.

        Args:
            user_msg: User message of the turn (not used by the templates)
            turn_idx: Zero-based index of the turn; the first turn uses a
                different framing sentence than later turns

        Returns:
            A reply wrapping a randomly chosen off-topic remark
        """
        if turn_idx == 0:
            return f"That's an interesting request, but before I help with that... {random.choice(self._TANGENT_TOPICS)} Anyway, what were we discussing?"
        return f"I understand you want me to continue with the task, but I just remembered something. {random.choice(self._TANGENT_TOPICS)} Sorry for the distraction."

    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
        """
        Generate a response that misinterprets the user's request.

        A keyword is drawn at random from the task description and the reply
        pretends the request was about that keyword alone.

        Args:
            user_msg: User message of the turn (not used by the templates)
            task_description: Task text the keywords are extracted from
            turn_idx: Zero-based index of the turn (not used)

        Returns:
            A templated reply built around the chosen keyword
        """
        from string import punctuation

        # Extract keywords from task description. Leading/trailing punctuation
        # is stripped first so that "flight," does not end up in a reply as
        # "flight,s"; the length/stopword filter runs on the cleaned word.
        keywords = []
        for raw_word in task_description.lower().split():
            word = raw_word.strip(punctuation)
            if len(word) > 3 and word not in self._STOPWORDS:
                keywords.append(word)

        if not keywords:
            keywords = ['task', 'help', 'information', 'request']

        # Select a random keyword to misinterpret
        keyword = random.choice(keywords)

        misinterpretation_templates = [
            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
            f"Regarding your question about {keyword}, I can offer the following information.",
            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}."
        ]

        return random.choice(misinterpretation_templates)


class ConstraintViolationGenerator(NegativeSampleGenerator):
    """Generate negative samples by violating specified constraints."""
    
    def __init__(self):
        """Create the generator and register it under the strategy name "constraint_violation"."""
        super().__init__(name="constraint_violation")
    
    def generate(
        self, 
        trajectory: Trajectory,
        constraints: Optional[List[str]] = None,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by violating constraints.

        A single constraint is drawn at random and every agent turn of the
        trajectory is replaced by a response violating that constraint.

        Args:
            trajectory: Positive trajectory to transform
            constraints: List of constraints to violate. ``None`` or an
                empty list selects the built-in default constraints.
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with constraint violations. Its metadata is
            a shallow copy of the source metadata with ``is_positive`` set
            to False, the chosen ``violated_constraint`` recorded, the
            source score kept under ``original_quality_score`` and
            ``quality_score`` reset to None.
        """
        # random.choice raises IndexError on an empty sequence, so an empty
        # list is treated like None and replaced by the defaults.
        if not constraints:
            constraints = [
                "Do not provide specific recommendations",
                "Avoid using technical jargon",
                "Keep responses concise",
                "Do not ask follow-up questions",
                "Avoid making assumptions about user preferences",
                "Do not mention specific brands or products",
                "Avoid discussing sensitive topics",
                "Do not provide step-by-step instructions"
            ]

        # The same constraint is violated in every turn of this trajectory.
        violated_constraint = random.choice(constraints)

        new_interactions = []
        for turn_idx, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']
            new_interactions.append({
                'user': user_msg,
                'agent': self._generate_violation(user_msg, violated_constraint, turn_idx)
            })

        # Create new trajectory with constraint violations
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['violated_constraint'] = violated_constraint
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )
    
    def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str:
        """Generate a response that violate
(Content truncated due to size limit. Use line ranges to read in chunks)