File size: 1,828 Bytes
49f0c5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import jsonlines

# JSONL file holding the classified records that this script reads
input_file = "data/thirdStep_file.jsonl"

# Path that receives the transformed records
output_file = "data/train4465"

# Score cutoff: a label counts as accepted only when its score exceeds this
threshold = 0.5

# One choice option per category; every option carries the same fixed
# "meta" string.
options = [
    {"id": category_id, "text": display_text, "meta": "0.00"}
    for category_id, display_text in (
        ("CapitalRequirements", "Capital Requirements"),
        ("ConsumerProtection", "Consumer Protection"),
        ("RiskManagement", "Risk Management"),
        ("ReportingAndCompliance", "Reporting And Compliance"),
        ("CorporateGovernance", "Corporate Governance"),
    )
]

# Function to process each record
def process_record(record, *, score_threshold=None, category_options=None):
    """Build the output record for one classified input record.

    Args:
        record: Mapping with a "text" entry and a "predicted_labels" entry;
            the latter maps each label name to a score that can be compared
            with the cutoff.
        score_threshold: Optional cutoff. A label is accepted when its score
            is strictly greater than this value. When omitted, the
            module-level ``threshold`` is used.
        category_options: Optional iterable of option dicts, each providing
            "id", "text" and "meta". When omitted, the module-level
            ``options`` list is used.

    Returns:
        A dict with the keys "text", "cats" (the predicted label scores,
        passed through unchanged), "accept" (labels above the cutoff),
        "answer" ("accept" if any label was accepted, otherwise "reject")
        and "options" (a fresh copy of each option dict).

    Raises:
        KeyError: If ``record`` lacks "text" or "predicted_labels", or an
            option lacks "id", "text" or "meta".
    """
    # Fall back to the module-level configuration only when the caller did
    # not supply a value; ``is None`` keeps 0 and [] usable as overrides.
    cutoff = threshold if score_threshold is None else score_threshold
    choices = options if category_options is None else category_options

    text = record["text"]
    predicted_labels = record["predicted_labels"]

    # Strict comparison: a score exactly equal to the cutoff is not accepted.
    accepted_categories = [
        label for label, score in predicted_labels.items() if score > cutoff
    ]

    # Copy each option so the output record never aliases the shared
    # option dicts.
    # NOTE(review): "meta" is copied verbatim and is not derived from the
    # record's scores -- confirm that this is the intended behaviour.
    options_with_meta = [
        {"id": option["id"], "text": option["text"], "meta": option["meta"]}
        for option in choices
    ]

    return {
        "text": text,
        "cats": predicted_labels,
        "accept": accepted_categories,
        "answer": "accept" if accepted_categories else "reject",
        "options": options_with_meta,
    }

# Process input file and write transformed data to output file
# Stream the input one record at a time, writing each transformed record
# out as soon as it has been built.
with jsonlines.open(input_file, "r") as reader:
    with jsonlines.open(output_file, "w") as writer:
        for transformed in map(process_record, reader):
            writer.write(transformed)