# Scraped page banner (not part of the script): "Spaces: Runtime error".
import os

import datasets
import numpy as np
import pandas as pd
import torch
import transformers

from config import epochs, batch_size, learning_rate
from model import tokenizer, multitask_model
from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
# from data_5_LT23 import features_dict,extra_feature_dict
from data_predict import convert_to_stsb_features, convert_to_features
# Empty placeholders for the two names the disabled data_5_LT23 import used
# to provide; features_dict is reassigned further down in this script.
features_dict, extra_feature_dict = {}, {}

# Directory prefix of the input file; empty means the current directory.
sentinews_location = ""

# Read the tab-separated input and keep only the "content" column.
test_tsv_path = sentinews_location + "textlabel.tsv"
df_document_croatian_test = pd.read_csv(test_tsv_path, sep="\t")[["content"]]
# Gather the splits into a single DatasetDict. Only the "test" split is
# populated (the train/valid entries were disabled in the original script).
# Dataset and DatasetDict are referenced through the already-imported
# `datasets` module: the bare names were never imported and raised NameError.
document = datasets.DatasetDict({
    "test": datasets.Dataset.from_pandas(df_document_croatian_test),
})

# Task name -> DatasetDict; "document" is the only task in this script.
dataset_dict = {
    "document": document,
}

# Sanity check: show the first test example of every task.
for task_name, dataset in dataset_dict.items():
    print(task_name)
    print(dataset["test"][0])
    print()
# Per-task converter handed to convert_to_features. Only the "document" task
# is enabled; the "paragraph" and "sentence" entries were disabled upstream.
convert_func_dict = {"document": convert_to_stsb_features}

# Replace the empty placeholder with the converted features, which are later
# indexed as features_dict[task_name]["test"].
features_dict = convert_to_features(dataset_dict, convert_func_dict)
from huggingface_hub import hf_hub_download, snapshot_download

# Download (or reuse from cache) the published classifier files.
# snapshot_download returns the local directory that holds the snapshot.
snapshot_dir = snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")

# Checkpoint location on the original developer machine. It stays the
# preferred source when present so that local runs behave exactly as before.
local_weights_path = (
    "/home/gaurishthakkar/projects/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/pytorch_model.bin"
)
if os.path.exists(local_weights_path):
    weights_path = local_weights_path
else:
    # Fall back to the downloaded snapshot instead of failing on a missing
    # home-directory path.
    # NOTE(review): assumes the hub repo stores pytorch_model.bin at its
    # root — confirm against the repository layout.
    weights_path = os.path.join(snapshot_dir, "pytorch_model.bin")

# Load onto CPU first so a checkpoint saved from a GPU also loads on a
# CPU-only host; the model is moved to the target device right after.
multitask_model.load_state_dict(torch.load(weights_path, map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)
# Predict a class index for every test document, one example at a time,
# and write the indices to disk.
predictions = []

# The task head does not change between examples, so fetch it once.
task_model = multitask_model.get_model("document")
# Inference only: disable dropout and skip autograd bookkeeping.
task_model.eval()

with torch.no_grad():
    for batch in features_dict["document"]["test"]:
        # Only these two tensors are consumed by the forward pass.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Each example is a single sequence; add a leading batch dimension.
        classifier_output = task_model(
            torch.unsqueeze(input_ids, 0),
            torch.unsqueeze(attention_mask, 0),
        )
        print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))

        # Index of the highest-scoring class for the single example.
        predictions.append(torch.argmax(classifier_output.logits, dim=1).item())

# NOTE(review): the file is named .tsv but is written with pandas' default
# comma separator; left unchanged in case downstream readers rely on it.
pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv")
# Trainer used purely as an evaluation harness: do_train is False and
# train() is never called in this script.
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # fp16 mixed precision needs a CUDA device; this script falls back
        # to CPU when none is available, so enable it only on GPU hosts.
        fp16=torch.cuda.is_available(),
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # No training or checkpoint selection happens here, and enabling
        # this flag requires matching save/evaluation strategies, which
        # this configuration does not set.
        load_best_model_at_end=False,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)
# Run the trainer's prediction loop over the test split of each task and
# collect the outputs by task name.
print(features_dict["document"]["test"])

tests_dict = {}
for task_name in ["document"]:  # "paragraph", "sentence"
    # Build the evaluation dataloader once and reuse it for both the
    # task-tagged wrapper and the diagnostics (it was previously built twice).
    eval_dataloader = trainer.get_eval_dataloader(features_dict[task_name]["test"])
    test_dataloader = DataLoaderWithTaskname(task_name, eval_dataloader)

    # Diagnostics: batch count, collate function in use, wrapped loader size.
    print(len(eval_dataloader))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))

    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader,
        description=f"Testing: {task_name}",
    )
print(tests_dict)
# Report macro-averaged precision, recall and F1 for every evaluated task.
for task_name in ["document", ]:  # "paragraph","sentence"
    task_output = tests_dict[task_name]
    for metric in ["precision", "recall", "f1"]:
        label = "test {} {}:".format(metric, task_name)
        scorer = datasets.load_metric(
            metric,
            name="dev {} {}".format(metric, task_name),
        )
        # The predicted class is the arg-max along axis 1 of the predictions.
        predicted_labels = np.argmax(task_output.predictions, axis=1)
        score = scorer.compute(
            predictions=predicted_labels,
            references=task_output.label_ids,
            average="macro",
        )
        print(label, score)
    print()