# -*- coding: utf-8 -*-
"""Fine Tuning Number One.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ICULTdmxijXHisMebXX5KmPzxzfZ2TtH
"""

!pip install -q -U datasets transformers[torch] accelerate huggingface_hub

from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

# Load the Wikipedia mask-filling dataset: 512-token chunks for training,
# 4096-token chunks for evaluation.
datasetTrain = load_dataset("rcds/wikipedia-for-mask-filling", "original_512", trust_remote_code=True)
datasetTest = load_dataset("rcds/wikipedia-for-mask-filling", "original_4096", trust_remote_code=True)

# Load the pre-trained tokenizers.
tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

# Tokenize the dataset: pad and truncate every example to 512 tokens.
def tokenize_function_one(examples):
    return tokenizerOne(examples["texts"], padding="max_length", truncation=True, max_length=512)

def tokenize_function_two(examples):
    return tokenizerTwo(examples["texts"], padding="max_length", truncation=True, max_length=512)

# Tokenize and subsample the datasets for model one.
tokenized_datasets_oneTrain = datasetTrain.map(tokenize_function_one, batched=True)
tokenized_datasets_oneTest = datasetTest.map(tokenize_function_one, batched=True)
tokenized_datasets_oneTrain = tokenized_datasets_oneTrain["train"].select(range(10000))
tokenized_datasets_oneTest = tokenized_datasets_oneTest["train"].select(range(2500))

# The collator randomly masks 15% of the tokens in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)

training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)

# Model One: google-bert/bert-base-cased
model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")

trainer_one = Trainer(
    model=model_one,
    args=training_args,
    train_dataset=tokenized_datasets_oneTrain,
    eval_dataset=tokenized_datasets_oneTest,
    data_collator=data_collator,
)

trainer_one.train()

# Get your API token from Hugging Face.
api_token = "redacted"

# Push my fine-tuned BERT model and its tokenizer to the Hub.
model_one.push_to_hub("emma7897/bert_one", token=api_token)
tokenizerOne.push_to_hub("emma7897/bert_one", token=api_token)

# Tokenize and subsample the datasets for model two.
tokenized_datasets_twoTrain = datasetTrain.map(tokenize_function_two, batched=True)
tokenized_datasets_twoTest = datasetTest.map(tokenize_function_two, batched=True)
tokenized_datasets_twoTrain = tokenized_datasets_twoTrain["train"].select(range(10000))
tokenized_datasets_twoTest = tokenized_datasets_twoTest["train"].select(range(2500))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)

training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    warmup_steps=500,
    weight_decay=0.01,
)

# Model Two: distilbert/distilbert-base-cased
model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")

trainer_two = Trainer(
    model=model_two,
    args=training_args,
    train_dataset=tokenized_datasets_twoTrain,
    eval_dataset=tokenized_datasets_twoTest,
    data_collator=data_collator,
)

trainer_two.train()
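# Optional sanity check: a quick way to gauge masked-LM quality for both models.
# Trainer.evaluate() returns the cross-entropy loss on the eval split, and
# exponentiating it gives an approximate perplexity. This is only a sketch; it
# assumes the eval datasets defined above are still in scope and fit on the GPU.
import math

eval_one = trainer_one.evaluate()
eval_two = trainer_two.evaluate()
print(f"BERT eval loss: {eval_one['eval_loss']:.3f}, perplexity: {math.exp(eval_one['eval_loss']):.1f}")
print(f"DistilBERT eval loss: {eval_two['eval_loss']:.3f}, perplexity: {math.exp(eval_two['eval_loss']):.1f}")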
model_two.push_to_hub("emma7897/distilbert_one", token=api_token) tokenizerTwo.push_to_hub("emma7897/distilbert_one", token=api_token)
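# To confirm the uploads worked, the fine-tuned checkpoints can be loaded back
# from the Hub with the fill-mask pipeline. This is a sketch: it assumes the
# pushes above completed and the repos are accessible from this environment.
from transformers import pipeline

fill_bert = pipeline("fill-mask", model="emma7897/bert_one")
fill_distilbert = pipeline("fill-mask", model="emma7897/distilbert_one")

# Each call returns a list of candidate fills; print the top prediction.
print(fill_bert("Paris is the [MASK] of France.")[0])
print(fill_distilbert("Paris is the [MASK] of France.")[0])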