# -*- coding: utf-8 -*-
"""Fine Tuning Numer Two.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iqPWMaXrktOsY2BwZNdQE8c1B4o1trit
"""

# Install the required libraries (Colab already ships with torch).
!pip install -q -U datasets transformers[torch] accelerate huggingface_hub

from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

# Load the children's-stories dataset from the Hugging Face Hub.
dataset = load_dataset("ajibawa-2023/Children-Stories-Collection", trust_remote_code=True)

# Load the tokenizers for the two pre-trained checkpoints being compared.
tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

# Tokenize the "text" column, padding/truncating every example to 512 tokens.
def tokenize_function_one(examples):
    return tokenizerOne(examples["text"], padding="max_length", truncation=True, max_length=512)

def tokenize_function_two(examples):
    return tokenizerTwo(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenizedDatasetOne = dataset.map(tokenize_function_one, batched=True)

# Shuffle once, then carve out 10,000 training and 2,500 held-out examples.
shuffled_dataset = tokenizedDatasetOne["train"].shuffle(seed=42)
tokenized_datasets_oneTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_oneTest = shuffled_dataset.select(range(10000, 12500))

# The collator randomly masks 15% of tokens and builds the MLM labels on the fly.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)

training_args = TrainingArguments(
    "test_trainer_bert",  # separate output dirs so the two runs don't overwrite each other's checkpoints
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)

# Model One: google-bert/bert-base-cased
model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")

trainer_one = Trainer(
    model=model_one,
    args=training_args,
    train_dataset=tokenized_datasets_oneTrain,
    eval_dataset=tokenized_datasets_oneTest,
    data_collator=data_collator,
)

trainer_one.train()

# Supply your own Hugging Face write token here (or call huggingface_hub.login()
# instead of hard-coding it); never commit a real token to a notebook.
api_token = "redacted"

# Push the fine-tuned BERT model and its tokenizer to the Hub.
model_one.push_to_hub("emma7897/bert_two", token=api_token)
tokenizerOne.push_to_hub("emma7897/bert_two", token=api_token)

# Repeat the same pipeline with the DistilBERT tokenizer.
tokenizedDatasetTwo = dataset.map(tokenize_function_two, batched=True)

shuffled_dataset = tokenizedDatasetTwo["train"].shuffle(seed=42)
tokenized_datasets_twoTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_twoTest = shuffled_dataset.select(range(10000, 12500))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)

training_args = TrainingArguments(
    "test_trainer_distilbert",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)

# Model Two: distilbert/distilbert-base-cased
model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")

trainer_two = Trainer(
    model=model_two,
    args=training_args,
    train_dataset=tokenized_datasets_twoTrain,
    eval_dataset=tokenized_datasets_twoTest,
    data_collator=data_collator,
)

trainer_two.train()

# Push the fine-tuned DistilBERT model and its tokenizer to the Hub.
model_two.push_to_hub("emma7897/distilbert_two", token=api_token)
tokenizerTwo.push_to_hub("emma7897/distilbert_two", token=api_token)
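
# Optional follow-up (not part of the original notebook): a minimal sketch for
# reporting a held-out metric after training. It assumes `trainer_one` and
# `trainer_two` from above are still in memory, and converts each masked-LM
# eval loss into a perplexity for easier comparison between the two models.
import math

for name, trainer in [("bert_two", trainer_one), ("distilbert_two", trainer_two)]:
    metrics = trainer.evaluate()  # masked-LM loss on the 2,500 held-out examples
    eval_loss = metrics["eval_loss"]
    print(f"{name}: eval_loss={eval_loss:.4f}, perplexity={math.exp(eval_loss):.2f}")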
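
# Another optional sketch, assuming the pushes above succeeded: reload each
# fine-tuned checkpoint from the Hub and run a quick fill-mask sanity check.
# Pass token=api_token to pipeline() if the repositories are private.
from transformers import pipeline

for repo_id in ["emma7897/bert_two", "emma7897/distilbert_two"]:
    fill = pipeline("fill-mask", model=repo_id)
    # [MASK] is the mask token for both cased BERT and DistilBERT tokenizers.
    print(repo_id, fill("Once upon a time there was a little [MASK].")[0]["token_str"])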