# File size: 1,711 Bytes
# 49f0c5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import spacy
from spacy.training import Example
import jsonlines
import random

# Start from a blank English pipeline and append a multi-label text
# categorizer as its last component.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Register every category the classifier is trained on.
for category in (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
):
    textcat.add_label(category)

# Path to the processed data file
processed_data_file = "data/firstStep_file.jsonl"

# Read every record of the JSONL file into memory
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)


def _to_example(record):
    """Build a spaCy training ``Example`` from one JSONL record.

    The record is indexed by ``"text"`` (the raw text, tokenized with
    ``nlp.make_doc``) and ``"label"`` (compared against each category name).
    The label is expanded into a ``cats`` mapping holding one boolean per
    label registered on ``textcat``: ``True`` where the label name equals
    ``record["label"]``, ``False`` otherwise. A label value that matches no
    registered label therefore yields an all-``False`` mapping.

    Deriving the mapping from ``textcat.labels`` keeps the training targets
    in sync with the categories registered on the pipeline component, rather
    than repeating the label names by hand.

    Raises:
        KeyError: If the record has no ``"text"`` or ``"label"`` key.
    """
    text = record["text"]
    label = record["label"]
    cats = {name: label == name for name in textcat.labels}
    return Example.from_dict(nlp.make_doc(text), {"cats": cats})


# Convert processed data to spaCy format
spacy_train_data = [_to_example(record) for record in processed_data]

# Seed the random number generators once, before the model weights are
# created, so that initialization and the per-epoch shuffles are reproducible
# as a whole. Seeding inside the epoch loop instead would leave initialization
# unseeded and make every epoch's shuffle apply the same permutation.
spacy.util.fix_random_seed(1)

# Initialize the model and get the optimizer
optimizer = nlp.initialize()

# Train the text classification model
n_iter = 10
for i in range(n_iter):
    # Present the examples in a new order each epoch.
    random.shuffle(spacy_train_data)
    # Passed to nlp.update for every batch, then reported once per epoch.
    losses = {}
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    print("Iteration:", i, "Losses:", losses)

# Save the trained model
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)