In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the MRPC configuration of the GLUE benchmark.
raw_datasets = load_dataset(path="glue", name="mrpc")

# The same checkpoint name is reused below for the tokenizer and the model,
# so both are guaranteed to match.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` fields of ``example`` as a pair.

    Uses the module-level ``tokenizer`` with ``truncation=True``; no padding is
    requested here. Returns whatever the tokenizer call returns.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Collator built from the same tokenizer; padding is left to it because
# tokenize_function does not pad.
data_collator = DataCollatorWithPadding(tokenizer)

In [24]:
from transformers import TrainingArguments

# Training configuration: outputs go to ./test-trainer, a checkpoint is saved
# at the end of every epoch, and results are pushed to the Hub.
# NOTE(review): push_to_hub=True presumably needs Hub credentials; the
# notebook_login() cell appears further down in this notebook, so run it first.
training_args = TrainingArguments(
    output_dir='test-trainer',
    push_to_hub=True,
    save_strategy='epoch',
)

In [26]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import Trainer

# Wire the model, training arguments, datasets and tokenizer into a Trainer.
# The collator built earlier is passed explicitly instead of being left as a
# commented-out argument: it is the same DataCollatorWithPadding the Trainer
# falls back to when it is given a tokenizer, but stating it here means the
# padding behaviour no longer depends on that implicit default.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

In [33]:
# Run fine-tuning with the settings in training_args; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Step,Training Loss
500,0.5281
1000,0.2847


TrainOutput(global_step=1377, training_loss=0.33464579696489055, metrics={'train_runtime': 225.3138, 'train_samples_per_second': 48.839, 'train_steps_per_second': 6.111, 'total_flos': 405114969714960.0, 'train_loss': 0.33464579696489055, 'epoch': 3.0})

In [34]:
# Push the Trainer's results to the Hub using this commit message.
trainer.push_to_hub(commit_message='End of training')

CommitInfo(commit_url='https://huggingface.co/dantedgp/test-trainer/commit/01f53eabc3a6811a185fdb75f9c38f3bd2368327', commit_message='End of training', commit_description='', oid='01f53eabc3a6811a185fdb75f9c38f3bd2368327', pr_url=None, pr_revision=None, pr_num=None)

In [37]:
# Keep a direct handle on the model held by the Trainer, then display it so
# the module structure is printed as the cell output.
trained = trainer.model
trained

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [41]:
# Upload the model on its own to a separate 'helloworld' repo.
trained.push_to_hub(repo_id='helloworld')

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dantedgp/helloworld/commit/1d1921b00fac46f76e9889846966fb6463ba6ad5', commit_message='Upload BertForSequenceClassification', commit_description='', oid='1d1921b00fac46f76e9889846966fb6463ba6ad5', pr_url=None, pr_revision=None, pr_num=None)

In [7]:
# Run inference on the validation split, then show the shapes of the raw
# model outputs and of the reference labels.
validation_split = tokenized_datasets['validation']
predictions = trainer.predict(test_dataset=validation_split)
(predictions.predictions.shape, predictions.label_ids.shape)

((408, 2), (408,))

In [8]:
import numpy as np

# Predicted class = index of the largest value along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [9]:
import evaluate
# Load the GLUE/MRPC metric and score the predicted classes against the
# labels returned by trainer.predict.
metric = evaluate.load(path='glue', config_name='mrpc')
metric.compute(references=predictions.label_ids, predictions=preds)

{'accuracy': 0.8627450980392157, 'f1': 0.9057239057239057}

In [22]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face Hub login widget.
# NOTE(review): this cell sits below the cells that push to the Hub, although
# its execution count (22) is lower than theirs. Consider moving it near the
# top so a fresh top-to-bottom run works.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [47]:
from huggingface_hub import create_repo

In [49]:
# Create the 'dummy-model' repo on the Hub.
# exist_ok=True keeps this cell re-runnable: without it, running the cell a
# second time fails because the repo already exists.
create_repo('dummy-model', exist_ok=True)

RepoUrl('https://huggingface.co/dantedgp/dummy-model', endpoint='https://huggingface.co', repo_type='model', repo_id='dantedgp/dummy-model')

In [57]:
from huggingface_hub import upload_file

# Upload this notebook file to the test-trainer repo, keeping the same file
# name on the Hub side.
# NOTE(review): the traceback recorded under this cell points at
# 'dantedgp/namespace', which does not match the repo_id used here, so the
# saved output looks like it came from an earlier version of the cell.
# Re-run to refresh it.
notebook_name = 'fine_tuning.ipynb'
upload_file(
    repo_id='dantedgp/test-trainer',
    path_in_repo=notebook_name,
    path_or_fileobj=notebook_name,
)

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-66814b92-084543d21a090a0d33651128;0e873452-423f-4b6f-ac06-df2e696ba2fa)

Repository Not Found for url: https://huggingface.co/api/models/dantedgp/namespace/preupload/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.

In [53]:
??upload_file

[1;31mSignature:[0m
[0mupload_file[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mpath_or_fileobj[0m[1;33m:[0m [1;34m'Union[str, Path, bytes, BinaryIO]'[0m[1;33m,[0m[1;33m
[0m    [0mpath_in_repo[0m[1;33m:[0m [1;34m'str'[0m[1;33m,[0m[1;33m
[0m    [0mrepo_id[0m[1;33m:[0m [1;34m'str'[0m[1;33m,[0m[1;33m
[0m    [0mtoken[0m[1;33m:[0m [1;34m'Union[str, bool, None]'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrepo_type[0m[1;33m:[0m [1;34m'Optional[str]'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrevision[0m[1;33m:[0m [1;34m'Optional[str]'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcommit_message[0m[1;33m:[0m [1;34m'Optional[str]'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcommit_description[0m[1;33m:[0m [1;34m'Optional[str]'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcreate_pr[0m[1;33m:[0m [1;34m'Optional[bo