File size: 16,073 Bytes
6c482f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
"""
Negative Sample Generation Module for Agent Tuning Optimization Framework

This module provides functionality for generating negative samples to enhance
agent tuning by exposing the model to challenging failure cases.
"""

import random
import numpy as np
from typing import List, Dict, Any, Union, Optional, Tuple
from tqdm import tqdm

from data.trajectory_data import Trajectory, TrajectoryDataset

class NegativeSampleGenerator:
    """Common interface shared by every negative-sample strategy.

    Concrete strategies override :meth:`generate`. :meth:`batch_generate`
    is inherited and maps ``generate`` over a list of trajectories.
    """

    def __init__(self, name: str):
        """
        Remember the strategy's label.

        Args:
            name: Name of the generator strategy; it is shown in the
                progress-bar description of ``batch_generate``.
        """
        self.name = name

    def generate(
        self,
        trajectory: Trajectory,
        **kwargs
    ) -> Trajectory:
        """
        Turn one positive trajectory into a negative one.

        Args:
            trajectory: Positive trajectory to transform
            **kwargs: Strategy-specific generation parameters

        Returns:
            Negative trajectory

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def batch_generate(
        self,
        trajectories: List[Trajectory],
        **kwargs
    ) -> List[Trajectory]:
        """
        Apply ``generate`` to each trajectory, in order.

        Args:
            trajectories: List of positive trajectories
            **kwargs: Forwarded unchanged to every ``generate`` call

        Returns:
            List of negative trajectories, one per input and in input order
        """
        # tqdm only wraps the iteration to display progress; items pass
        # through it untouched.
        progress = tqdm(
            trajectories,
            desc=f"Generating negative samples with {self.name}",
        )
        return [self.generate(item, **kwargs) for item in progress]


class ResponseDegradationGenerator(NegativeSampleGenerator):
    """Generate negative samples by degrading agent responses.

    The user side of every interaction is kept verbatim; only the agent
    message is rewritten. How aggressively it is rewritten depends on the
    ``degradation_level`` passed to :meth:`generate`.
    """

    # Canned replies used for high degradation. They ignore the original
    # agent message entirely.
    _IRRELEVANT_RESPONSES = (
        "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
        "I apologize, but I cannot assist with that request at this time.",
        "That's an interesting question, but I think we should focus on something else instead.",
        "Let me check my database... I don't seem to have any information about that.",
        "I think you might be confused about what you're asking for. Let me suggest something completely different.",
        "I'm not sure I understand the context of your request. Could you provide more details?",
        "I'm having trouble processing your request. Could we try a different approach?",
        "That's not something I can help with. Let me tell you about something unrelated instead.",
    )

    # Appended when every randomly chosen minor issue turned out to be a
    # no-op, so a "negative" sample never equals the positive one.
    _UNCERTAINTY_SUFFIX = " However, I'm not entirely sure about this."

    def __init__(self):
        """Initialize the response degradation generator."""
        super().__init__("response_degradation")

    def generate(
        self,
        trajectory: Trajectory,
        degradation_level: float = 0.5,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by degrading agent responses.

        The same level is applied to every interaction:

        * ``degradation_level > 0.7``: replace with an irrelevant canned reply
        * ``degradation_level > 0.4``: truncate and inject text errors
        * otherwise: introduce minor issues

        The value is not range-checked; anything above 0.7 takes the first
        branch and anything at or below 0.4 takes the last.

        Args:
            trajectory: Positive trajectory to transform. Each entry of
                ``trajectory.interactions`` is indexed with the keys
                ``'user'`` and ``'agent'``.
            degradation_level: Level of degradation (0.0 to 1.0)
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with degraded responses. Its metadata is a
            shallow copy of the input metadata with ``is_positive`` set to
            False, ``degradation_level`` recorded, the source's quality
            score stored under ``original_quality_score`` and
            ``quality_score`` reset to None.
        """
        new_interactions = [
            {
                'user': interaction['user'],
                'agent': self._degrade(interaction['agent'], degradation_level),
            }
            for interaction in trajectory.interactions
        ]

        # Copy so the positive trajectory's metadata dict is not mutated.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['degradation_level'] = degradation_level
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _degrade(self, agent_msg: str, degradation_level: float) -> str:
        """Pick the degradation technique that matches ``degradation_level``."""
        if degradation_level > 0.7:
            # High degradation: completely irrelevant response
            return self._generate_irrelevant_response()
        if degradation_level > 0.4:
            # Medium degradation: truncate and add errors
            return self._truncate_and_add_errors(agent_msg)
        # Low degradation: introduce minor issues
        return self._introduce_minor_issues(agent_msg)

    def _generate_irrelevant_response(self) -> str:
        """Return one randomly chosen canned reply unrelated to any request."""
        return random.choice(self._IRRELEVANT_RESPONSES)

    def _truncate_and_add_errors(self, text: str) -> str:
        """
        Truncate the text and add errors.

        Multi-word text keeps roughly the first 30-70% of its words, and
        always at least one. A single word is cut mid-word instead, because
        dropping whole words would leave nothing. Then 1-3 randomly chosen
        error injections are applied; each may be a no-op if its pattern is
        absent.

        Args:
            text: Original agent message

        Returns:
            The degraded message. It is empty only when ``text`` contains
            no words at all.
        """
        words = text.split()
        if not words:
            return ""

        if len(words) == 1:
            # int(1 * uniform(0.3, 0.7)) is always 0, which would erase the
            # reply; shorten the word itself instead.
            word = words[0]
            truncated = word[:max(1, len(word) // 2)]
        else:
            # Truncate to 30-70% of original length, never below one word
            # (int() can round a two- or three-word reply down to zero).
            keep = max(1, int(len(words) * random.uniform(0.3, 0.7)))
            truncated = ' '.join(words[:keep])

        # Add grammatical errors
        errors = [
            lambda t: t.replace(".", ""),  # Remove periods
            lambda t: t.replace("I ", "i "),  # Lowercase I
            lambda t: t.replace(" the ", " teh "),  # Typo
            lambda t: t.replace(" is ", " are "),  # Grammar error
            lambda t: t.replace(" are ", " is ")  # Grammar error
        ]

        # Apply 1-3 random errors
        for _ in range(random.randint(1, 3)):
            candidate = random.choice(errors)(truncated)
            # Removing periods from a reply made only of periods would
            # erase it; skip an injection that leaves nothing.
            if candidate:
                truncated = candidate

        return truncated

    def _introduce_minor_issues(self, text: str) -> str:
        """
        Introduce minor issues to the text.

        Applies 1-2 randomly chosen tweaks (expanding contractions,
        appending a hedging sentence, or swapping punctuation). If none of
        the chosen tweaks changed anything, for example because the text
        has no contractions, a hedging sentence is appended so the result
        always differs from the input.

        Args:
            text: Original agent message

        Returns:
            A lightly degraded message that is never equal to ``text``.
        """
        issues = [
            lambda t: t.replace("I'll", "I will"),  # Expand contractions
            lambda t: t.replace("I'd", "I would"),
            lambda t: t.replace("can't", "cannot"),
            lambda t: t + self._UNCERTAINTY_SUFFIX,  # Add uncertainty
            lambda t: t + " Please note that my information might be outdated.",
            lambda t: t.replace(".", "..."),  # Replace periods with ellipses
            lambda t: t.replace("!", "."),  # Reduce enthusiasm
            lambda t: t.replace(".", "?")  # Add questioning tone
        ]

        degraded = text
        # Apply 1-2 random issues
        for _ in range(random.randint(1, 2)):
            degraded = random.choice(issues)(degraded)

        if degraded == text:
            # Every chosen tweak was a no-op; force a visible change.
            degraded = text + self._UNCERTAINTY_SUFFIX

        return degraded


class TaskMisalignmentGenerator(NegativeSampleGenerator):
    """Generate negative samples by creating responses misaligned with the task.

    Every agent message is replaced by a refusal, an off-topic tangent or a
    reply that latches onto a single keyword of the task description. The
    user messages are kept verbatim.
    """

    # Concrete misalignment styles; 'random' picks one of these.
    _MISALIGNMENT_TYPES = ('refusal', 'tangent', 'misinterpretation')

    # Common words that make poor "misinterpreted" keywords.
    _STOPWORDS = frozenset([
        'with', 'from', 'that', 'this', 'have',
        'what', 'when', 'where', 'which', 'about',
    ])

    # Punctuation trimmed from both ends of each task-description word.
    _KEYWORD_STRIP_CHARS = ".,;:!?\"'()[]{}"

    def __init__(self):
        """Initialize the task misalignment generator."""
        super().__init__("task_misalignment")

    def generate(
        self,
        trajectory: Trajectory,
        misalignment_type: str = 'random',
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample with responses misaligned with the task.

        Args:
            trajectory: Positive trajectory to transform. Each entry of
                ``trajectory.interactions`` is indexed with the key
                ``'user'``; the original agent message is discarded.
            misalignment_type: Type of misalignment ('random', 'refusal',
                'tangent', 'misinterpretation'). 'random' picks one of the
                other three once and uses it for the whole trajectory.
            **kwargs: Additional generation parameters (unused here)

        Returns:
            Negative trajectory with misaligned responses. Its metadata is
            a shallow copy of the input metadata with ``is_positive`` set
            to False, the resolved ``misalignment_type`` recorded, the
            source's quality score stored under ``original_quality_score``
            and ``quality_score`` reset to None.

        Raises:
            ValueError: If ``misalignment_type`` is not a supported value.
        """
        # Resolve and validate before the loop, so an unsupported type is
        # rejected even when the trajectory has no interactions and never
        # reaches the metadata.
        if misalignment_type == 'random':
            misalignment_type = random.choice(self._MISALIGNMENT_TYPES)
        elif misalignment_type not in self._MISALIGNMENT_TYPES:
            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

        new_interactions = []
        for i, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']

            if misalignment_type == 'refusal':
                agent_msg = self._generate_refusal(user_msg, i)
            elif misalignment_type == 'tangent':
                agent_msg = self._generate_tangent(user_msg, i)
            else:
                # 'misinterpretation' is the only other validated value.
                agent_msg = self._generate_misinterpretation(user_msg, trajectory.task_description, i)

            new_interactions.append({
                'user': user_msg,
                'agent': agent_msg
            })

        # Copy so the positive trajectory's metadata dict is not mutated.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['misalignment_type'] = misalignment_type
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )

    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a refusal response.

        Args:
            user_msg: User message for this turn (not used by the template)
            turn_idx: Zero-based turn index. Turn 0 gets a plain refusal;
                later turns are prefixed with "I've reconsidered, and ".

        Returns:
            A randomly chosen refusal sentence.
        """
        refusal_templates = [
            "I'm sorry, but I cannot assist with that request due to ethical concerns.",
            "I apologize, but I'm not able to help with that specific task.",
            "That request goes against my programming guidelines. I cannot proceed.",
            "I'm unable to complete that task as it may violate my operational parameters.",
            "I must decline to assist with that particular request.",
            "I cannot help with that specific task due to potential policy violations.",
            "That's not something I'm designed to help with. I apologize for the inconvenience.",
            "I'm programmed to avoid assisting with that type of request."
        ]

        refusal = random.choice(refusal_templates)
        if turn_idx == 0:
            return refusal

        # The template now continues a sentence, so lower only its first
        # letter, and not at all when it opens with the pronoun "I".
        # Lowercasing the whole string would turn every "I" into "i".
        if not refusal.startswith(("I ", "I'")):
            refusal = refusal[0].lower() + refusal[1:]
        return f"I've reconsidered, and {refusal}"

    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a response that goes off on a tangent.

        Args:
            user_msg: User message for this turn (not used by the template)
            turn_idx: Zero-based turn index; turn 0 and later turns use
                different framing sentences around the tangent.

        Returns:
            A reply built around a randomly chosen off-topic remark.
        """
        tangent_topics = [
            "Did you know that artificial intelligence has been a concept since the 1950s?",
            "I've been thinking about the philosophical implications of consciousness in AI systems.",
            "The weather has been quite interesting lately, with unusual patterns emerging globally.",
            "I recently processed some fascinating data about renewable energy technologies.",
            "The history of computing is quite fascinating, starting with early mechanical calculators.",
            "Language models like me are trained on vast amounts of text data.",
            "The field of natural language processing has evolved significantly in recent years.",
            "I find the concept of time quite fascinating from a computational perspective."
        ]

        topic = random.choice(tangent_topics)
        if turn_idx == 0:
            return f"That's an interesting request, but before I help with that... {topic} Anyway, what were we discussing?"
        return f"I understand you want me to continue with the task, but I just remembered something. {topic} Sorry for the distraction."

    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
        """
        Generate a response that misinterprets the user's request.

        A keyword is drawn from the task description and the reply treats
        that single word as the whole topic. Candidate keywords are
        lowercased words longer than three characters that are not in the
        stopword list, with surrounding punctuation removed.

        Args:
            user_msg: User message for this turn (not used by the template)
            task_description: Text the keyword is extracted from
            turn_idx: Zero-based turn index (not used by the template)

        Returns:
            A randomly chosen reply focused on the selected keyword.
        """
        # Strip punctuation before filtering so "flight," becomes "flight"
        # and the templates do not read "about flight,s".
        keywords = []
        for raw_word in task_description.lower().split():
            word = raw_word.strip(self._KEYWORD_STRIP_CHARS)
            if len(word) > 3 and word not in self._STOPWORDS:
                keywords.append(word)

        if not keywords:
            # Generic fallbacks when the description has no usable words.
            keywords = ['task', 'help', 'information', 'request']

        # Select a random keyword to misinterpret
        keyword = random.choice(keywords)

        misinterpretation_templates = [
            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
            f"Regarding your question about {keyword}, I can offer the following information.",
            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}."
        ]

        return random.choice(misinterpretation_templates)


class ConstraintViolationGenerator(NegativeSampleGenerator):
    """Generate negative samples by violating specified constraints."""
    
    def __init__(self):
        """Set up the generator under the strategy name ``constraint_violation``."""
        super().__init__(name="constraint_violation")
    
    def generate(
        self, 
        trajectory: Trajectory,
        constraints: Optional[List[str]] = None,
        **kwargs
    ) -> Trajectory:
        """
        Generate a negative sample by violating constraints.

        One constraint is picked at random and every agent message is
        replaced by a response built to violate it. The user messages are
        kept verbatim.

        Args:
            trajectory: Positive trajectory to transform. Each entry of
                ``trajectory.interactions`` is indexed with the key
                ``'user'``; the original agent message is discarded.
            constraints: List of constraints to violate. ``None`` or an
                empty list selects the built-in default constraints.
            **kwargs: Additional generation parameters (unused here)
            
        Returns:
            Negative trajectory with constraint violations. Its metadata is
            a shallow copy of the input metadata with ``is_positive`` set
            to False, the chosen ``violated_constraint`` recorded, the
            source's quality score stored under ``original_quality_score``
            and ``quality_score`` reset to None.
        """
        # An empty list is treated like None: random.choice() raises an
        # opaque IndexError on an empty sequence.
        if constraints is None or len(constraints) == 0:
            constraints = [
                "Do not provide specific recommendations",
                "Avoid using technical jargon",
                "Keep responses concise",
                "Do not ask follow-up questions",
                "Avoid making assumptions about user preferences",
                "Do not mention specific brands or products",
                "Avoid discussing sensitive topics",
                "Do not provide step-by-step instructions"
            ]
        
        # The same constraint is violated in every turn of the trajectory.
        violated_constraint = random.choice(constraints)
        
        new_interactions = []
        for i, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']
            new_interactions.append({
                'user': user_msg,
                'agent': self._generate_violation(user_msg, violated_constraint, i)
            })
        
        # Copy so the positive trajectory's metadata dict is not mutated.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata['violated_constraint'] = violated_constraint
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated
        
        return Trajectory(
            task_description=trajectory.task_description,
            interactions=new_interactions,
            metadata=metadata
        )
    
    def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str:
        """Generate a response that violate
(Content truncated due to size limit. Use line ranges to read in chunks)