import os
import numpy as np
import json
import argparse

# load_data and MyDataset come from the project-local utils_data module.
from utils_data import load_data, MyDataset
from transformers import (
    AutoConfig,
    BertForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data')
    parser.add_argument('--model', type=str, default='bert-base-uncased')
    parser.add_argument('--output_dir', type=str, default='experiments')
    parser.add_argument('--epoch', type=int, default=2)
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--bs', type=int, default=32)
    parser.add_argument('--max_length', type=int, default=512)
    parser.add_argument('--seed', type=int, default=666, help='random seed')
    args = parser.parse_args()
    return args

def main():
    args = parse_args()
    print('====Input Arguments====')
    print(json.dumps(vars(args), indent=2, sort_keys=False))

    # Set the seed before initializing the model, for reproducibility.
    set_seed(args.seed)

    # Load the pretrained model and tokenizer
    config = AutoConfig.from_pretrained(args.model)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = BertForSequenceClassification.from_pretrained(args.model, config=config)
    # Load data
    train_data = load_data(args, "train")
    train_dataset = MyDataset(train_data, tokenizer, args.max_length, is_test=False)
    eval_data = load_data(args, "val")
    eval_dataset = MyDataset(eval_data, tokenizer, args.max_length, is_test=False)
    test_data = load_data(args, "test")
    test_dataset = MyDataset(test_data, tokenizer, args.max_length, is_test=True)
    # Accuracy over argmax predictions, used for both evaluation and logging.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        correct = (preds == p.label_ids).sum().item()
        return {'accuracy': correct / len(preds)}
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        do_train=True,
        do_eval=True,
        do_predict=True,
        logging_strategy="steps",
        save_strategy="epoch",
        learning_rate=args.lr,
        per_device_train_batch_size=args.bs,
        per_device_eval_batch_size=args.bs,
        num_train_epochs=args.epoch,
        report_to="none",
    )
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )
    # Training
    if training_args.do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
    # Prediction on the test set
    if training_args.do_predict:
        predictions = trainer.predict(test_dataset, metric_key_prefix="predict").predictions
        predictions = np.argmax(predictions, axis=1)
        output_predict_file = os.path.join(args.output_dir, "predict_results.txt")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predictions):
                    writer.write(f"{index}\t{item}\n")
if __name__ == "__main__":
    main()
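
# Example invocation (a sketch: the script filename and the layout of --data_root are
# assumptions, since load_data/MyDataset live in the local utils_data module shown only
# by import here; adjust paths and hyperparameters to your setup):
#   python train.py --data_root data --model bert-base-uncased --output_dir experiments \
#       --epoch 2 --lr 2e-5 --bs 32 --max_length 512 --seed 666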