from datasets import load_dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)


def load_data_from_csv(csv_file):
    # The CSV is expected to provide a text column named 'content' plus an
    # integer 'label' column (0 or 1), which Trainer uses as the target.
    dataset = load_dataset("csv", data_files=csv_file)
    return dataset["train"]
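
# Hypothetical sketch of the expected CSV layout (the rows below are
# illustrative examples, not taken from the real dataset):
#
#   content,label
#   "def add(a, b): return a + b",0
#   "os.system(user_input)",1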

def get_model_and_tokenizer():
    # CodeBERT shares RoBERTa's architecture, so the Roberta classes load it
    # directly; num_labels=2 attaches a fresh binary-classification head.
    model_name = "microsoft/codebert-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
    return model, tokenizer

def tokenize_function(examples, tokenizer):
    # With batched=True, `examples` is a dict mapping column names to lists of
    # values. Pad/truncate everything to RoBERTa's 512-token limit.
    return tokenizer(
        examples["content"], truncation=True, padding="max_length", max_length=512
    )

def train_model(dataset, tokenizer, model):
    tokenized_data = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    # Hold out 10% of the data so evaluation does not run on the training set.
    split = tokenized_data.train_test_split(test_size=0.1)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",  # newer transformers releases rename this to eval_strategy
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
    )
    trainer.train()
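
# A minimal inference sketch, not called by the training script below.
# Assumptions: the checkpoint directory name is hypothetical (Trainer writes
# "checkpoint-<step>" folders under output_dir), and the tokenizer files sit
# alongside the model weights because the tokenizer was passed to Trainer.
def classify_snippet(snippet, checkpoint="./results/checkpoint-500"):
    import torch  # already installed as a transformers training dependency

    tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
    model = RobertaForSequenceClassification.from_pretrained(checkpoint)
    model.eval()
    inputs = tokenizer(snippet, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.argmax(dim=-1).item()  # predicted label index (0 or 1)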

if __name__ == "__main__":
    dataset = load_data_from_csv("code_analysis_dataset.csv")
    model, tokenizer = get_model_and_tokenizer()
    train_model(dataset, tokenizer, model)
    print("[SUCCESS] Model trained!")