emma7897 committed on
Commit
be80892
1 Parent(s): 36cf581

Upload 3 files

fine_tuning_number_one.py ADDED
@@ -0,0 +1,103 @@
+ # -*- coding: utf-8 -*-
+ """Fine Tuning Number One.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1ICULTdmxijXHisMebXX5KmPzxzfZ2TtH
+ """
+
+
+
+ !pip install datasets
+ !pip install torch
+ !pip install -q -U transformers accelerate
+ !pip install transformers[torch]
+ !pip install accelerate -U
+ !pip install huggingface_hub
+
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
+ from datasets import load_dataset
+
+ # Load the dataset
+ datasetTrain = load_dataset("rcds/wikipedia-for-mask-filling", "original_512", trust_remote_code=True)
+ datasetTest = load_dataset("rcds/wikipedia-for-mask-filling", "original_4096", trust_remote_code=True)
+
+ # Load the pre-trained tokenizers
+ tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
+
+ # Tokenize the dataset
+ def tokenize_function_one(examples):
+     return tokenizerOne(examples["texts"], padding="max_length", truncation=True)
+
+ def tokenize_function_two(examples):
+     return tokenizerTwo(examples["texts"], padding="max_length", truncation=True, max_length=512)
+
+ # Make the datasets
+ tokenized_datasets_oneTrain = datasetTrain.map(tokenize_function_one, batched=True)
+ tokenized_datasets_oneTest = datasetTest.map(tokenize_function_one, batched=True)
+ tokenized_datasets_oneTrain = tokenized_datasets_oneTrain["train"].select(range(10000))
+ tokenized_datasets_oneTest = tokenized_datasets_oneTest["train"].select(range(2500))
+
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)
+ training_args = TrainingArguments(
+     "test_trainer",
+     num_train_epochs=3,
+     per_device_train_batch_size=32,
+     per_device_eval_batch_size=32,
+     warmup_steps=500,
+     weight_decay=0.01,
+ )
+
+ # Model One: google-bert/bert-base-cased
+ model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")
+ trainer_one = Trainer(
+     model=model_one,
+     args=training_args,
+     train_dataset=tokenized_datasets_oneTrain,
+     eval_dataset=tokenized_datasets_oneTest,
+     data_collator=data_collator,
+ )
+ trainer_one.train()
+
+ # Get your API token from Hugging Face.
+ api_token = "redacted"
+
+ from transformers import BertConfig, BertModel
+
+ model_one.push_to_hub("emma7897/bert_one", token=api_token)
+ tokenizerOne.push_to_hub("emma7897/bert_one", token=api_token)
+
+ # Make the datasets
+ tokenized_datasets_twoTrain = datasetTrain.map(tokenize_function_two, batched=True)
+ tokenized_datasets_twoTest = datasetTest.map(tokenize_function_two, batched=True)
+ tokenized_datasets_twoTrain = tokenized_datasets_twoTrain["train"].select(range(10000))
+ tokenized_datasets_twoTest = tokenized_datasets_twoTest["train"].select(range(2500))
+
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)
+ training_args = TrainingArguments(
+     "test_trainer",
+     num_train_epochs=3,
+     per_device_train_batch_size=48,
+     per_device_eval_batch_size=48,
+     warmup_steps=500,
+     weight_decay=0.01,
+ )
+
+ # Model Two: distilbert/distilbert-base-cased
+ model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")
+ trainer_two = Trainer(
+     model=model_two,
+     args=training_args,
+     train_dataset=tokenized_datasets_twoTrain,
+     eval_dataset=tokenized_datasets_twoTest,
+     data_collator=data_collator,
+ )
+ trainer_two.train()
+
+ from transformers import DistilBertConfig, DistilBertModel
+
+ # Push my DistilBERT model to the Hub.
+ model_two.push_to_hub("emma7897/distilbert_one", token=api_token)
+ tokenizerTwo.push_to_hub("emma7897/distilbert_one", token=api_token)
fine_tuning_numer_two.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ """Fine Tuning Numer Two.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1iqPWMaXrktOsY2BwZNdQE8c1B4o1trit
+ """
+
+ !pip install datasets
+ !pip install torch
+ !pip install -q -U transformers accelerate
+ !pip install transformers[torch]
+ !pip install accelerate -U
+ !pip install huggingface_hub
+
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
+ from datasets import load_dataset
+
+ # Load the dataset
+ dataset = load_dataset("ajibawa-2023/Children-Stories-Collection", trust_remote_code=True)
+
+ # Load the pre-trained tokenizers
+ tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
+
+ # Tokenize the dataset
+ def tokenize_function_one(examples):
+     return tokenizerOne(examples["text"], padding="max_length", truncation=True)
+
+ def tokenize_function_two(examples):
+     return tokenizerTwo(examples["text"], padding="max_length", truncation=True, max_length=512)
+
+ tokenizedDatasetOne = dataset.map(tokenize_function_one, batched=True)
+ shuffled_dataset = tokenizedDatasetOne['train'].shuffle(seed=42)
+ tokenized_datasets_oneTrain = shuffled_dataset.select(range(10000))
+ tokenized_datasets_oneTest = shuffled_dataset.select(range(10000, 12500))
+
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)
+ training_args = TrainingArguments(
+     "test_trainer",
+     num_train_epochs=3,
+     per_device_train_batch_size=32,
+     per_device_eval_batch_size=32,
+     warmup_steps=500,
+     weight_decay=0.01,
+ )
+
+ # Model One: google-bert/bert-base-cased
+ model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")
+ trainer_one = Trainer(
+     model=model_one,
+     args=training_args,
+     train_dataset=tokenized_datasets_oneTrain,
+     eval_dataset=tokenized_datasets_oneTest,
+     data_collator=data_collator,
+ )
+ trainer_one.train()
+
+ # Get your API token from Hugging Face.
+ api_token = "redacted"
+
+ from transformers import BertConfig, BertModel
+
+ model_one.push_to_hub("emma7897/bert_two", token=api_token)
+ tokenizerOne.push_to_hub("emma7897/bert_two", token=api_token)
+
+ tokenizedDatasetTwo = dataset.map(tokenize_function_two, batched=True)
+ shuffled_dataset = tokenizedDatasetTwo['train'].shuffle(seed=42)
+ tokenized_datasets_twoTrain = shuffled_dataset.select(range(10000))
+ tokenized_datasets_twoTest = shuffled_dataset.select(range(10000, 12500))
+
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)
+ training_args = TrainingArguments(
+     "test_trainer",
+     num_train_epochs=3,
+     per_device_train_batch_size=32,
+     per_device_eval_batch_size=32,
+     warmup_steps=500,
+     weight_decay=0.01,
+ )
+
+ # Model Two: distilbert/distilbert-base-cased
+ model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")
+ trainer_two = Trainer(
+     model=model_two,
+     args=training_args,
+     train_dataset=tokenized_datasets_twoTrain,
+     eval_dataset=tokenized_datasets_twoTest,
+     data_collator=data_collator,
+ )
+ trainer_two.train()
+
+ from transformers import DistilBertConfig, DistilBertModel
+
+ model_two.push_to_hub("emma7897/distilbert_two", token=api_token)
+ tokenizerTwo.push_to_hub("emma7897/distilbert_two", token=api_token)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ torch
+ nltk
+ datasets
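
The four fine-tuned checkpoints pushed above (emma7897/bert_one, emma7897/distilbert_one, emma7897/bert_two, emma7897/distilbert_two) can be loaded back from the Hub for masked-word prediction. A minimal sketch, assuming the repositories are public and transformers is installed; the example sentence is purely illustrative:

from transformers import pipeline

# Load one of the pushed masked-language models and its tokenizer from the Hub.
fill_mask = pipeline("fill-mask", model="emma7897/bert_one")

# BERT's mask token is [MASK]; the pipeline returns the top predictions for that slot.
print(fill_mask("The capital of France is [MASK]."))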