import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict
from huggingface_hub import hf_hub_download, snapshot_download

from config import batch_size, epochs, learning_rate
from data_predict import convert_to_features, convert_to_stsb_features
from model import multitask_model, tokenizer
from mtm import DataLoaderWithTaskname, MultitaskTrainer, NLPDataCollator

# features_dict = {}
# extra_feature_dict = {}
# sentinews_location = ""
# df_document_croatian_test = pd.read_csv(sentinews_location + "textlabel.tsv", sep="\t")
# df_document_croatian_test = df_document_croatian_test[["content"]]


def predict():
    # Gather everything into a single DatasetDict if desired.
    document = DatasetDict({
        # "train": Dataset.from_pandas(df_document_sl_hr_train),
        # "valid": Dataset.from_pandas(df_document_sl_hr_valid),
        "test": Dataset.from_dict({"content": ["Volim ti"]}),
    })

    dataset_dict = {
        "document": document,
    }

    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset["test"][0])
        print()

    convert_func_dict = {
        "document": convert_to_stsb_features,
        # "paragraph": convert_to_stsb_features,
        # "sentence": convert_to_stsb_features,
    }

    features_dict = convert_to_features(dataset_dict, convert_func_dict)
    return features_dict


# model_link = snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")
model_link = hf_hub_download(
    repo_id="FFZG-cleopatra/Croatian-News-Classifier",
    filename="pytorch_model.bin",
)
# multitask_model.from_pretrained(, config="/media/gaurish/angela/projects/CroatianSlovenEnglishBert/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/config.json")
multitask_model.load_state_dict(torch.load(model_link, map_location=torch.device("cpu")))

# `device` must be defined (not commented out) before the inference loop below uses it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)
multitask_model.eval()

predictions = []
features_dict = predict()
with torch.no_grad():  # inference only; no gradients needed
    for batch in features_dict["document"]["test"]:
        for key in batch:
            batch[key] = batch[key].to(device)
        task_model = multitask_model.get_model("document")
        classifier_output = task_model(
            torch.unsqueeze(batch["input_ids"], 0),
            torch.unsqueeze(batch["attention_mask"], 0),
        )
        print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
        prediction = torch.max(classifier_output.logits, axis=1)
        predictions.append(prediction.indices.item())

# Write tab-separated output to match the .tsv extension.
pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv", sep="\t")

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # evaluation_strategy="steps",
        # num_train_epochs=epochs,
        # fp16=True,  # Adjust batch size if this doesn't fit on the Colab GPU.
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # eval_steps=50,
        load_best_model_at_end=True,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)

print(features_dict["document"]["test"])

tests_dict = {}
for task_name in ["document"]:  # "paragraph", "sentence"
    test_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(features_dict[task_name]["test"]),
    )
    print(len(trainer.get_eval_dataloader(features_dict[task_name]["test"])))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))
    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader, description=f"Testing: {task_name}"
    )
print(tests_dict)
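
# For readers without access to data_predict: a minimal sketch of what the
# convert_to_stsb_features tokenization step is assumed to do. The function
# below (note the leading underscore) is a hypothetical illustration, not the
# actual implementation imported above; max_length=512 is an assumption.
def _sketch_convert_to_stsb_features(example_batch):
    # Tokenize raw "content" strings into fixed-length input_ids / attention_mask.
    return tokenizer(
        example_batch["content"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
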
#"paragraph","sentence" for metric in ["precision", "recall", "f1"]: print("test {} {}:".format(metric, task_name), datasets.load_metric(metric, name="dev {} {}".format(metric, task_name)).compute( predictions=np.argmax( tests_dict[task_name].predictions, axis=1), references=tests_dict[task_name].label_ids, average="macro" )) print()