tomaarsen committed on
Commit 8fdee77
1 Parent(s): fe5f197

Upload train.py with huggingface_hub

Files changed (1)
  1. train.py +85 -0
train.py ADDED
@@ -0,0 +1,85 @@
from pathlib import Path
import shutil

from datasets import load_dataset
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
from span_marker.model_card import SpanMarkerModelCardData
from huggingface_hub import upload_folder, upload_file


def main() -> None:
    # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
    dataset = load_dataset("ljvmiranda921/tlunified-ner")
    labels = dataset["train"].features["ner_tags"].feature.names

    # Initialize a SpanMarker model using a pretrained BERT-style encoder
    encoder_id = "bert-base-multilingual-cased"
    model_id = "tomaarsen/span-marker-mbert-base-tlunified"
    model = SpanMarkerModel.from_pretrained(
        encoder_id,
        labels=labels,
        # SpanMarker hyperparameters:
        model_max_length=256,
        marker_max_length=128,
        entity_max_length=8,
        # Model card variables
        model_card_data=SpanMarkerModelCardData(
            model_id=model_id,
            encoder_id=encoder_id,
            dataset_name="TLUnified",
            license="gpl-3.0",
            language=["tl"],
        ),
    )

    # Prepare the 🤗 transformers training arguments
    output_dir = Path("models") / model_id
    args = TrainingArguments(
        output_dir=output_dir,
        run_name=model_id,
        # Training Hyperparameters:
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_ratio=0.1,
        bf16=True,  # Replace `bf16` with `fp16` if your hardware can't use bf16.
        # Other Training parameters
        logging_first_step=True,
        logging_steps=50,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=400,
        save_total_limit=1,
        dataloader_num_workers=4,
    )

    # Initialize the trainer using our model, training args & dataset, and train
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
    )
    trainer.train()

    # Compute & save the metrics on the test set
    metrics = trainer.evaluate(dataset["test"], metric_key_prefix="test")
    trainer.save_metrics("test", metrics)

    # Save the model & training script locally
    trainer.save_model(output_dir / "checkpoint-final")
    shutil.copy2(__file__, output_dir / "checkpoint-final" / "train.py")

    # Upload everything to the Hub
    model.push_to_hub(model_id, private=True)
    upload_folder(folder_path=output_dir / "runs", path_in_repo="runs", repo_id=model_id)
    upload_file(path_or_fileobj=__file__, path_in_repo="train.py", repo_id=model_id)
    upload_file(path_or_fileobj=output_dir / "all_results.json", path_in_repo="all_results.json", repo_id=model_id)
    upload_file(path_or_fileobj=output_dir / "emissions.csv", path_in_repo="emissions.csv", repo_id=model_id)


if __name__ == "__main__":
    main()
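
Once this script finishes and the artifacts are pushed, the fine-tuned model can be loaded straight from the Hub for inference via the span_marker API. A minimal sketch; the Tagalog example sentence below is invented for illustration, and loading requires an authenticated session while the repo is private:

from span_marker import SpanMarkerModel

# Load the fine-tuned NER model from the Hub
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-tlunified")

# Predict entity spans in a raw sentence; returns a list of dicts with
# "span", "label", "score", and character offsets
entities = model.predict("Dumating si Pangulong Marcos sa Maynila kahapon.")
for entity in entities:
    print(entity["span"], "->", entity["label"])

Because the training script stores the label names and the SpanMarker hyperparameters (model_max_length, marker_max_length, entity_max_length) in the pushed config, no extra arguments are needed at load time.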