nicoleathy committed on
Commit
c7a7b12
·
verified ·
1 Parent(s): d071d21

Upload 2 files

Browse files
Files changed (2) hide show
  1. competition/gemma-2-9b.py +128 -0
  2. competition/llama.py +145 -0
competition/gemma-2-9b.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
2
+ from datasets import Dataset
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from peft import get_peft_model, LoraConfig, TaskType
6
+ import evaluate
7
+ import numpy as np
8
+
9
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the textual answer labels onto integer class ids.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # treated as a synonym of "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Size of the classification head, derived from the mapping so the two
# cannot drift apart (5 distinct ids with the mapping above).
num_labels = len(set(label_mapping.values()))

# Labels that are missing from the mapping become NaN here ...
dataset['label'] = dataset['label'].map(label_mapping)

# ... and are dropped. `.copy()` gives an independent frame so the column
# assignments below never write into a view of the original data.
rows_before = len(dataset)
dataset = dataset.dropna(subset=['label']).copy()
dropped_rows = rows_before - len(dataset)
if dropped_rows:
    # Make the data loss visible instead of discarding rows silently.
    print(f"Dropped {dropped_rows} row(s) with missing or unmapped labels")

# `map` produced floats because of the NaNs; the trainer needs integer ids.
dataset['label'] = dataset['label'].astype(int)

# Combine "text" and "puzzle" columns. Missing cells are treated as empty
# strings so a single NaN does not turn the whole combined text into NaN.
dataset['combined_text'] = (
    dataset['text'].fillna('') + " " + dataset['puzzle'].fillna('')
)

# Split the dataset into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets; the shuffled pandas index carries no
# information, so it is not stored as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)
46
+
47
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples read from the 'combined_text' column.

    Every sequence is truncated and padded to exactly 128 tokens, and the
    tokenizer's output is returned unchanged.
    """
    texts = examples['combined_text']
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded
50
+
51
# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch, exposing only the columns the model consumes.
torch_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=torch_columns)
val_dataset.set_format(type='torch', columns=torch_columns)

# Define LoRA configuration: rank-16 adapters on the attention query and
# value projections, for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Gradient checkpointing is switched on in the training arguments below.
# With the base weights frozen by LoRA, the embedding output would not
# require grad, and re-entrant checkpointing would then pass no gradient
# to the adapters inside the checkpointed layers. Making the input
# embeddings require grad keeps the adapters trainable.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# `evaluation_strategy` is deprecated in favour of `eval_strategy` in newer
# transformers releases; use whichever name the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Training arguments
# NOTE(review): confirm fp16 mixed precision is numerically stable for this
# checkpoint on the target GPU; consider bf16 where the hardware supports it.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
    **{eval_strategy_key: "epoch"},
)
94
+
95
# Metrics already loaded through `evaluate`, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Load an `evaluate` metric on first use and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation run.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; the logits are
    # then the first element of the tuple.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # The metric objects are cached so they are not reloaded every epoch.
    precision = _get_metric("precision").compute(
        predictions=predictions, references=labels, average='macro'
    )["precision"]
    recall = _get_metric("recall").compute(
        predictions=predictions, references=labels, average='macro'
    )["recall"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average='macro'
    )["f1"]
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
110
+
111
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model (LoRA adapter weights) and the tokenizer side by side
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')

# Evaluate the model. The metrics are printed because a bare
# `trainer.evaluate()` call in a script discards its return value.
eval_results = trainer.evaluate()
print(eval_results)
competition/llama.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
3
+ from datasets import Dataset
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from peft import get_peft_model, LoraConfig, TaskType
7
+ import evaluate
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the textual answer labels onto integer class ids.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # shares an id with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Size of the classification head, derived from the mapping so the two
# cannot drift apart (5 distinct ids with the mapping above).
num_labels = len(set(label_mapping.values()))

# Labels that are missing from the mapping become NaN here ...
dataset['label'] = dataset['label'].map(label_mapping)

# ... and are dropped. `.copy()` gives an independent frame so the column
# assignments below never write into a view of the original data.
rows_before = len(dataset)
dataset = dataset.dropna(subset=['label']).copy()
dropped_rows = rows_before - len(dataset)
if dropped_rows:
    # Make the data loss visible instead of discarding rows silently.
    print(f"Dropped {dropped_rows} row(s) with missing or unmapped labels")

# `map` produced floats because of the NaNs; the trainer needs integer ids.
dataset['label'] = dataset['label'].astype(int)

# Combine "text" and "puzzle" columns. Missing cells are treated as empty
# strings so a single NaN does not turn the whole combined text into NaN.
dataset['combined_text'] = (
    dataset['text'].fillna('') + " " + dataset['puzzle'].fillna('')
)

# Split the dataset into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets; the shuffled pandas index carries no
# information, so it is not stored as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Load the tokenizer and model
model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

# Fall back to the EOS token for padding when the tokenizer defines no pad
# token. EOS is an existing vocabulary entry, so no token is added and the
# embedding matrix does not need to be resized.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The classification head uses the pad id to locate the last real token of
# each sequence, so the model config must agree with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
57
+
58
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples read from the 'combined_text' column.

    Every sequence is truncated and padded to exactly 128 tokens, and the
    tokenizer's output is returned unchanged.
    """
    texts = examples['combined_text']
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded
61
+
62
# Tokenize both splits in batches, using 4 worker processes.
train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)
val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)

# Set the format for PyTorch, exposing only the columns the model consumes.
torch_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=torch_columns)
val_dataset.set_format(type='torch', columns=torch_columns)

# Define LoRA configuration: rank-16 adapters on the attention query and
# value projections, for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Gradient checkpointing is switched on in the training arguments below.
# With the base weights frozen by LoRA, the embedding output would not
# require grad, and re-entrant checkpointing would then pass no gradient
# to the adapters inside the checkpointed layers. Making the input
# embeddings require grad keeps the adapters trainable.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# `evaluation_strategy` is deprecated in favour of `eval_strategy` in newer
# transformers releases; use whichever name the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=8,  # Increase batch size if memory allows
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.001,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,  # Adjust based on memory constraints
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
    **{eval_strategy_key: "epoch"},
)
105
+
106
# Metrics already loaded through `evaluate`, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Load an `evaluate` metric on first use and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for one evaluation run.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; the logits are
    # then the first element of the tuple.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # The metric objects are cached so they are not reloaded every epoch.
    precision = _get_metric("precision").compute(
        predictions=predictions, references=labels, average="weighted"
    )["precision"]
    recall = _get_metric("recall").compute(
        predictions=predictions, references=labels, average="weighted"
    )["recall"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
121
+
122
# Build the Trainer from the model, arguments, both splits and the metric
# callback prepared earlier in the script.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
trainer.train()

# Persist the model first, then the tokenizer, into the same directory.
save_dir = 'trained_llama_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Run a final evaluation on the validation split and show the metrics.
eval_results = trainer.evaluate()
print(eval_results)
141
+
142
+ # %%
143
+
144
+
145
+