thak123 commited on
Commit
5285b7f
1 Parent(s): bd50f4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -0
app.py CHANGED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import numpy as np
3
+ import torch
4
+ import transformers
5
+ from config import epochs, batch_size, learning_rate
6
+ from model import tokenizer, multitask_model
7
+ from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
8
+ import pandas as pd
9
+ # from data_5_LT23 import features_dict,extra_feature_dict
10
+ from data_predict import convert_to_stsb_features,convert_to_features
11
+
12
# Task registries; features_dict is (re)populated below by convert_to_features.
features_dict = {}
extra_feature_dict = {}
# Directory holding the input TSV; empty string means the current working dir.
sentinews_location = ""

# Load the test split: a TSV expected to contain (at least) a "content"
# column of raw document text. Only that column is kept.
df_document_croatian_test = pd.read_csv(sentinews_location+"textlabel.tsv", sep="\t")
df_document_croatian_test = df_document_croatian_test[["content"]]

# gather everyone if you want to have a single DatasetDict
# BUG FIX: DatasetDict/Dataset were referenced unqualified, but only
# `import datasets` is in scope — that raised a NameError. Qualify them.
# Train/valid splits are intentionally omitted: this script is inference-only.
document = datasets.DatasetDict({
    # "train": datasets.Dataset.from_pandas(df_document_sl_hr_train),
    # "valid": datasets.Dataset.from_pandas(df_document_sl_hr_valid),
    "test": datasets.Dataset.from_pandas(df_document_croatian_test)
})

dataset_dict = {
    "document": document,
}

# Sanity check: print the first test example of each task.
for task_name, dataset in dataset_dict.items():
    print(task_name)
    print(dataset_dict[task_name]["test"][0])
    print()
35
+
36
# Map each task name to its feature-conversion function.
# Extra tasks are kept here, disabled, for easy re-enabling.
task_converters = {
    "document": convert_to_stsb_features,
    # "paragraph": convert_to_stsb_features,
    # "sentence": convert_to_stsb_features,
}

# Convert every split of every task into model-ready (tokenized) features.
features_dict = convert_to_features(dataset_dict, task_converters)

from huggingface_hub import hf_hub_download, snapshot_download

# Pull the pretrained classifier snapshot into the local HF cache.
snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")
49
# multitask_model.from_pretrained(, config="/media/gaurish/angela/projects/CroatianSlovenEnglishBert/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/config.json")
# Load the fine-tuned multitask weights. map_location lets a checkpoint saved
# from a GPU run load on a CPU-only host; the model is moved to `device` after.
multitask_model.load_state_dict(torch.load(
    "/home/gaurishthakkar/projects/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/pytorch_model.bin",
    map_location="cpu"
))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)
# Inference only: switch off dropout/batch-norm updates (the model is
# otherwise left in training mode after load_state_dict).
multitask_model.eval()

# The per-task head is loop-invariant; fetch it once, not per example.
task_model = multitask_model.get_model("document")

predictions = []
with torch.no_grad():  # no gradients needed — saves memory and time
    for batch in features_dict["document"]['test']:
        # Move every tensor in the example onto the model's device.
        for key in batch:
            batch[key] = batch[key].to(device)

        # Each example is a single unbatched sequence; add a batch dimension.
        classifier_output = task_model(
            torch.unsqueeze(batch["input_ids"], 0),
            torch.unsqueeze(batch["attention_mask"], 0),
        )

        print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
        # argmax over the label dimension gives the predicted class index.
        predictions.append(torch.argmax(classifier_output.logits, dim=1).item())

pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv")
70
+
71
+
72
# Evaluation-only configuration: do_train=False, so the training-specific
# knobs (learning rate, save_steps, ...) are effectively inert here.
eval_args = transformers.TrainingArguments(
    learning_rate=learning_rate,
    output_dir="/tmp",
    do_train=False,
    do_eval=True,
    # evaluation_strategy ="steps",
    # num_train_epochs=epochs,
    fp16=True,
    # Adjust batch size if this doesn't fit on the Colab GPU
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=3000,
    # eval_steps=50,
    load_best_model_at_end=True,
)

trainer = MultitaskTrainer(
    model=multitask_model,
    args=eval_args,
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)

print(features_dict["document"]["test"])

# Run the trainer's prediction loop once per task, keeping the raw outputs
# (logits + label ids) for the metric computation below.
tests_dict = {}
for task_name in ["document"]:  # "paragraph", "sentence"
    eval_loader = trainer.get_eval_dataloader(features_dict[task_name]["test"])
    test_dataloader = DataLoaderWithTaskname(task_name, eval_loader)
    print(len(trainer.get_eval_dataloader(features_dict[task_name]["test"])))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))
    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader,
        description=f"Testing: {task_name}",
    )
print(tests_dict)

# Report macro-averaged precision / recall / F1 per task on the test split.
for task_name in ["document"]:  # "paragraph", "sentence"
    for metric in ["precision", "recall", "f1"]:
        scorer = datasets.load_metric(
            metric, name="dev {} {}".format(metric, task_name)
        )
        score = scorer.compute(
            predictions=np.argmax(tests_dict[task_name].predictions, axis=1),
            references=tests_dict[task_name].label_ids,
            average="macro",
        )
        print("test {} {}:".format(metric, task_name), score)
    print()
119
+
120
+