tomaarsen HF staff committed on
Commit
65a5cb1
·
1 Parent(s): 111bd4a

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +94 -0
train.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune a SpanMarker NER model on the BioNLP2004 dataset."""

import os
from pathlib import Path
import shutil

from datasets import load_dataset
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
from span_marker.model_card import SpanMarkerModelCardData

# Set the CodeCarbon log level to "error" through its environment variable.
os.environ["CODECARBON_LOG_LEVEL"] = "error"


def main() -> None:
    """Train, evaluate and save a SpanMarker model.

    Loads the ``tner/bionlp2004`` dataset, initialises a SpanMarker model on
    top of the ``bert-base-uncased`` encoder, trains it with the Trainer,
    saves the test-split metrics, and finally writes the model together with
    a copy of this script to ``<output_dir>/checkpoint-final``.
    """
    # --- Data -------------------------------------------------------------
    # The dataset stores its labels in a "tags" column; rename it so the
    # dataset exposes "tokens" and "ner_tags" columns.
    hub_dataset_id = "tner/bionlp2004"
    pretty_dataset_name = "BioNLP2004"
    raw_dataset = load_dataset(hub_dataset_id)
    dataset = raw_dataset.rename_column("tags", "ner_tags")

    # Label list: "O" followed by a B-/I- pair per entity type, in this order.
    labels = ["O"]
    for entity_type in ("DNA", "protein", "cell_type", "cell_line", "RNA"):
        labels += [f"B-{entity_type}", f"I-{entity_type}"]

    # --- Model ------------------------------------------------------------
    # A SpanMarker model built on a pretrained BERT-style encoder.
    encoder_id = "bert-base-uncased"
    model_id = f"tomaarsen/span-marker-{encoder_id}-bionlp"

    # Variables used for the generated model card.
    card_data = SpanMarkerModelCardData(
        model_id=model_id,
        encoder_id=encoder_id,
        dataset_name=pretty_dataset_name,
        dataset_id=hub_dataset_id,
        license="other",
        language="en",
    )
    model = SpanMarkerModel.from_pretrained(
        encoder_id,
        labels=labels,
        # SpanMarker hyperparameters:
        model_max_length=256,
        marker_max_length=128,
        entity_max_length=8,
        model_card_data=card_data,
    )

    # --- Training configuration --------------------------------------------
    output_dir = Path("models") / model_id
    training_args = TrainingArguments(
        output_dir=output_dir,
        run_name=model_id,
        # Training hyperparameters:
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_ratio=0.1,
        bf16=True,  # Replace `bf16` with `fp16` if your hardware can't use bf16.
        # Logging, evaluation and checkpointing:
        logging_first_step=True,
        logging_steps=50,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=300,
        save_total_limit=2,
        dataloader_num_workers=2,
    )

    # --- Train ---------------------------------------------------------------
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
    )
    trainer.train()

    # --- Evaluate on the test split and persist the metrics ------------------
    test_metrics = trainer.evaluate(dataset["test"], metric_key_prefix="test")
    trainer.save_metrics("test", test_metrics)

    # --- Save the final model next to a copy of this training script ---------
    final_dir = output_dir / "checkpoint-final"
    trainer.save_model(final_dir)
    shutil.copy2(__file__, final_dir / "train.py")


if __name__ == "__main__":
    main()