# pip install transformers
# pip install datasets

from xml.etree.ElementTree import ElementTree

import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Initialization
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)
metric = load_metric("accuracy")  # note: newer versions of `datasets` moved this to the separate `evaluate` package
training_args = TrainingArguments(output_dir="../test_trainer", evaluation_strategy="epoch")


def extract_data(file):
    """Parse a question XML file into a DataFrame with "label" and "text" columns."""
    file_tree = ElementTree()
    file_tree.parse(file)
    root = file_tree.getroot()

    questions = []
    for thread in root:
        for child in thread:
            subject = child.find("RelQSubject").text
            body = child.find("RelQBody").text
            content = "{} {}".format(subject, body)
            tag = child.attrib["RELQ_FACT_LABEL"]
            questions.append((tag, content))

    questions_dataframe = pd.DataFrame(questions, columns=["label", "text"])
    questions_dataframe["label"] = questions_dataframe["label"].replace(
        {"Factual": 0, "Opinion": 1, "Socializing": 2}
    )
    return questions_dataframe


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Load and tokenize the training and development sets
train_df = extract_data("questions_train.xml")
dev_df = extract_data("questions_dev.xml")

train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Tiny subsets (5 train / 1 dev examples) to smoke-test the pipeline quickly
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(5))
small_dev_dataset = tokenized_dev_dataset.shuffle(seed=42).select(range(1))

# Fine-tune and save the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model("saved_model")
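
# --- Hedged inference sketch (not part of the original pipeline) ---
# A minimal example of reloading the checkpoint saved above and classifying a
# single question. It assumes PyTorch is installed and reuses the id-to-label
# mapping applied in extract_data (0=Factual, 1=Opinion, 2=Socializing); the
# example question text is hypothetical.
import torch

id2label = {0: "Factual", 1: "Opinion", 2: "Socializing"}

# Reload the fine-tuned checkpoint from the directory written by save_model()
loaded_model = AutoModelForSequenceClassification.from_pretrained("saved_model")
loaded_model.eval()

example_question = "Is the museum open on Fridays?"  # hypothetical input
inputs = tokenizer(example_question, padding="max_length", truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_label = id2label[logits.argmax(dim=-1).item()]
print("Predicted label:", predicted_label)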