File size: 4,421 Bytes
5285b7f
 
 
 
 
 
 
 
fe9a4a8
5285b7f
 
86f28e8
 
 
3aef37d
 
 
5285b7f
86f28e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3aef37d
 
5285b7f
75ce96c
5285b7f
86f28e8
 
5285b7f
 
75ce96c
 
b24d99b
5285b7f
3aef37d
5285b7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b24d99b
5285b7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import datasets
import numpy as np
import torch
import transformers
from config import epochs, batch_size, learning_rate
from model import tokenizer, multitask_model
from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
import pandas as pd
from datasets import Dataset, DatasetDict
from data_predict import convert_to_stsb_features,convert_to_features

from huggingface_hub import hf_hub_download,snapshot_download


# features_dict = {}
# extra_feature_dict = {}
# sentinews_location = ""

# df_document_croatian_test = pd.read_csv(sentinews_location+"textlabel.tsv", sep="\t")
# df_document_croatian_test = df_document_croatian_test[["content"]]
def predict(texts=None):
    """Build the converted feature sets for the "document" task.

    Args:
        texts: Optional string or iterable of strings to run through the
            model. When omitted, the single demo sentence ``"Volim ti"`` is
            used, which is the input this function was originally
            hard-coded to.

    Returns:
        The result of ``convert_to_features`` for the assembled datasets.
        Code in this file indexes it as ``result["document"]["test"]``.

    Raises:
        ValueError: If ``texts`` is supplied but contains no entries.
    """
    if texts is None:
        texts = ["Volim ti"]
    elif isinstance(texts, str):
        # A bare string would otherwise be split into single characters.
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        raise ValueError("predict() needs at least one text to classify")

    # Only a "test" split is built; no train/valid splits are needed here.
    document = DatasetDict({
        "test": Dataset.from_dict({"content": texts}),
    })

    dataset_dict = {
        "document": document,
    }

    # Echo the first example of every task so the input is visible in logs.
    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset["test"][0])
        print()

    # One conversion function per task name; only "document" is wired up.
    convert_func_dict = {
        "document": convert_to_stsb_features,
    }

    return convert_to_features(dataset_dict, convert_func_dict)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download just the checkpoint file from the Hugging Face Hub.
#model_link = snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")
model_link = hf_hub_download(
    repo_id="FFZG-cleopatra/Croatian-News-Classifier",
    filename="pytorch_model.bin",
)

# NOTE(review): torch.load unpickles the downloaded file, so this trusts the
# remote repository's contents. Consider a safer loading mode if the
# installed torch version offers one -- confirm before changing.
multitask_model.load_state_dict(torch.load(model_link, map_location=device))

# The inputs below are moved to `device`, so the model has to live there as
# well; otherwise a CUDA host fails with a device mismatch.
multitask_model.to(device)
# Inference only: switch off train-time behaviour (e.g. dropout, if present).
multitask_model.eval()

predictions = []
features_dict = predict()

# The task head does not change between examples, so look it up once.
task_model = multitask_model.get_model("document")

# No gradients are needed for prediction.
with torch.no_grad():
    for example in features_dict["document"]["test"]:
        inputs = {key: value.to(device) for key, value in example.items()}

        # Each example is a single sequence; add a leading batch dimension.
        # Calling the module (rather than .forward) keeps hooks working.
        classifier_output = task_model(
            torch.unsqueeze(inputs["input_ids"], 0),
            torch.unsqueeze(inputs["attention_mask"], 0),
        )

        print(tokenizer.decode(inputs["input_ids"], skip_special_tokens=True))
        prediction = torch.max(classifier_output.logits, dim=1)
        predictions.append(prediction.indices.item())

# NOTE(review): the file is named .tsv but to_csv writes comma-separated
# output by default; left unchanged in case downstream readers rely on it.
pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv")


# Trainer used purely as an evaluation harness: do_train is off and only the
# prediction loop below is invoked.
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # Adjust batch size if this doesn't fit on the available GPU
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # load_best_model_at_end is intentionally not set: nothing is trained
        # here, so there is no "best" checkpoint to load, and Transformers
        # releases that validate the flag reject it when the evaluation and
        # save strategies differ (as they do with the defaults used here).
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)
print(features_dict["document"]["test"])

task_names = ["document"]  # "paragraph", "sentence"

# Run the prediction loop once per task and keep each task's output.
tests_dict = {}
for task_name in task_names:
    # Build the eval dataloader once and reuse it for the wrapper and the
    # diagnostic prints instead of constructing it repeatedly.
    eval_dataloader = trainer.get_eval_dataloader(features_dict[task_name]["test"])
    test_dataloader = DataLoaderWithTaskname(task_name, eval_dataloader)
    print(len(eval_dataloader))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))
    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader,
        description=f"Testing: {task_name}",
    )
print(tests_dict)

# Report macro-averaged precision / recall / F1 per task.
for task_name in task_names:
    task_output = tests_dict[task_name]
    if task_output.label_ids is None:
        # Without reference labels the metrics cannot be computed.
        # NOTE(review): the demo input built in predict() only provides a
        # "content" column; whether labels get attached later is not visible
        # here -- confirm against the feature conversion code.
        print("test metrics skipped for {}: no reference labels".format(task_name))
        continue

    # Predicted class = index of the largest score along axis 1.
    predicted_labels = np.argmax(task_output.predictions, axis=1)
    for metric in ["precision", "recall", "f1"]:
        scorer = datasets.load_metric(
            metric,
            name="dev {} {}".format(metric, task_name),
        )
        print(
            "test {} {}:".format(metric, task_name),
            scorer.compute(
                predictions=predicted_labels,
                references=task_output.label_ids,
                average="macro",
            ),
        )
print()