File size: 4,163 Bytes
d1eeaf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85624c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1eeaf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import logging
import random
import math

def _select_int_docs(instance, passage_num, pos_num):
    """Build the document list for filenames containing '_int'.

    ``instance['positive']`` is treated as a list of groups (each a list of
    documents). One document is taken from every non-empty group; if that
    yields fewer than ``pos_num`` documents, further documents are taken
    round-robin across the groups until ``pos_num`` is reached or the groups
    are exhausted. Remaining slots up to ``passage_num`` are filled from the
    front of ``instance['negative']``.
    """
    # Shuffle copies so the caller's instance is never mutated. Empty groups
    # are dropped because they have no document to contribute.
    groups = []
    for group in instance['positive']:
        shuffled = list(group)
        random.shuffle(shuffled)
        if shuffled:
            groups.append(shuffled)

    # Every group contributes its first document, even if that exceeds pos_num.
    docs = [group[0] for group in groups]

    missing = pos_num - len(docs)
    if missing > 0:
        depth = max((len(group) for group in groups), default=0)
        # Round-robin order: second document of each group, then third, ...
        extras = [
            group[rank]
            for rank in range(1, depth)
            for group in groups
            if len(group) > rank
        ]
        docs.extend(extras[:missing])

    neg_num = passage_num - len(docs)
    if neg_num > 0:
        docs += instance['negative'][:neg_num]
    return docs


def _select_fact_docs(instance, passage_num, neg_num, correct_rate):
    """Build the document list for filenames containing '_fact'.

    Mixes randomly sampled ``instance['positive']`` documents, the leading
    ``neg_num`` documents of ``instance['negative']`` (if that key exists) and
    randomly sampled ``instance['positive_wrong']`` documents. The result is
    then cut down to ``passage_num`` or topped up with further, not yet used,
    negative documents.
    """
    correct_num = max(0, math.ceil(passage_num * correct_rate))
    budget = max(0, passage_num - neg_num)  # slots left once noise is placed
    if correct_rate == 1.0:
        # Only positive and negative documents are used in this mode.
        correct_num = min(correct_num, budget)
        wrong_num = 0
    else:
        wrong_num = max(0, budget - correct_num)

    positives = instance['positive']
    docs = random.sample(positives, min(len(positives), correct_num))

    negatives = instance.get('negative', [])
    neg_used = min(neg_num, len(negatives))
    docs += negatives[:neg_used]

    if wrong_num > 0:
        wrong = instance['positive_wrong']
        docs += random.sample(wrong, min(len(wrong), wrong_num))

    if len(docs) > passage_num:
        # Too many candidates: drop a random subset.
        random.shuffle(docs)
        docs = docs[:passage_num]
    elif len(docs) < passage_num:
        # Top up with negatives that were not used above; slicing from the
        # start again would add the same documents twice.
        remaining = passage_num - len(docs)
        docs += negatives[neg_used:neg_used + remaining]
    return docs


def _select_default_docs(instance, pos_num, neg_num):
    """Build the document list when the filename matches no special case.

    Takes the leading ``pos_num`` positive and ``neg_num`` negative documents.
    Slicing clamps to the available documents on its own.
    """
    # NOTE(review): when one pool is too small the other is not enlarged to
    # compensate, so fewer than passage_num documents may be returned --
    # confirm this is intended.
    positive = instance['positive'][:pos_num]
    negative = instance['negative'][:neg_num]
    logging.info(
        "Using %d positive and %d negative documents as context",
        len(positive),
        len(negative),
    )
    return positive + negative


def process_data(instance: dict, noise_rate: float, passage_num: int,
                 filename: str, correct_rate: float = 0) -> tuple:
    """Assemble a shuffled set of context documents for one instance.

    The selection strategy depends on ``filename``:

    * contains ``'_int'``: one document per positive group, round-robin
      extras, then negatives as filler;
    * contains ``'_fact'``: a mix of positive, negative and
      ``positive_wrong`` documents driven by ``noise_rate`` and
      ``correct_rate``;
    * otherwise: the leading positive and negative documents.

    Args:
        instance: Mapping with at least ``'query'``, ``'answer'`` and
            ``'positive'``; ``'negative'`` and ``'positive_wrong'`` are read
            depending on the branch taken. The mapping is not modified.
        noise_rate: Fraction of ``passage_num`` reserved for negative
            documents (rounded up). Values that would give a negative count
            are clamped to zero.
        passage_num: Target number of documents.
        filename: Name used only to choose the selection strategy.
        correct_rate: Fraction of ``passage_num`` reserved for positive
            documents in the ``'_fact'`` case (rounded up).

    Returns:
        A ``(query, answer, docs)`` tuple where ``docs`` is a new list in
        random order.
    """
    query = instance['query']
    ans = instance['answer']
    logging.info("Query: %s", query)
    logging.info("Answer: %s", ans)

    # Clamp so that out-of-range rates can never produce negative slice
    # bounds or negative sample sizes further down.
    neg_num = max(0, math.ceil(passage_num * noise_rate))
    pos_num = max(0, passage_num - neg_num)

    if '_int' in filename:
        docs = _select_int_docs(instance, passage_num, pos_num)
    elif '_fact' in filename:
        docs = _select_fact_docs(instance, passage_num, neg_num, correct_rate)
    else:
        docs = _select_default_docs(instance, pos_num, neg_num)

    # Shuffle the final document list
    random.shuffle(docs)
    return query, ans, docs