# File size: 2,294 Bytes (revision f2f41c6)
# pip install transformers
# pip install datasets
from datasets import Dataset
from datasets import load_metric
from xml.etree.ElementTree import ElementTree
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Initialization: load the pretrained tokenizer/model pair, the evaluation
# metric and the training configuration once, at import time.
PRETRAINED_CHECKPOINT = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# num_labels=3 matches the three label ids (0, 1, 2) produced by extract_data.
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=3,
)
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against
# the pinned version before upgrading.
metric = load_metric("accuracy")
# NOTE(review): newer `transformers` releases seem to prefer `eval_strategy`
# over `evaluation_strategy` -- verify against the installed version.
training_args = TrainingArguments(
    output_dir="../test_trainer",
    evaluation_strategy="epoch",
)
def extract_data(file):
    """Parse a question XML file into a labelled DataFrame.

    Every grandchild of the document root (root -> thread -> child) is read
    as one question: its ``RELQ_FACT_LABEL`` attribute is the label and the
    texts of its ``RelQSubject`` and ``RelQBody`` sub-elements form the text.

    Args:
        file: Filename or file object accepted by ``ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label`` and ``text``.
        ``label`` has 'Factual', 'Opinion' and 'Socializing' replaced by
        0, 1 and 2; any other value is left unchanged. ``text`` is the
        subject and the body joined by a single space; a missing or empty
        part is skipped. A file without questions yields an empty frame
        that still has both columns.

    Raises:
        KeyError: If a question element has no ``RELQ_FACT_LABEL`` attribute.
    """
    tree = ElementTree()
    tree.parse(file)
    root = tree.getroot()

    questions = []
    for thread in root:
        for child in thread:
            # findtext() returns "" both for a missing element and for an
            # element without text, so an empty body can no longer leak the
            # literal string "None" into the training text.
            parts = (
                child.findtext("RelQSubject", default=""),
                child.findtext("RelQBody", default=""),
            )
            content = " ".join(part for part in parts if part)
            tag = child.attrib["RELQ_FACT_LABEL"]
            questions.append((tag, content))

    # Naming the columns in the constructor also works for zero rows, where
    # assigning `.columns` afterwards would raise a length-mismatch error.
    questions_dataframe = pd.DataFrame(questions, columns=["label", "text"])
    questions_dataframe["label"] = questions_dataframe["label"].replace(
        {'Factual': 0, 'Opinion': 1, 'Socializing': 2}
    )
    return questions_dataframe
def tokenize_function(examples):
    """Tokenize the ``text`` entry of *examples* with the module tokenizer.

    Padding is set to ``"max_length"`` and truncation is enabled; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class of each example is the index of its largest logit
    along the last axis; predictions and labels are then handed to
    ``metric.compute`` and its result is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )
# Parse the train/dev XML files into labelled DataFrames.
train_df = extract_data("questions_train.xml")
dev_df = extract_data("questions_dev.xml")

# Wrap the DataFrames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Shuffle with a fixed seed and keep only a handful of rows
# (5 for training, 1 for evaluation).
shuffled_train_dataset = tokenized_train_dataset.shuffle(seed=42)
small_train_dataset = shuffled_train_dataset.select(range(5))
shuffled_dev_dataset = tokenized_dev_dataset.shuffle(seed=42)
small_dev_dataset = shuffled_dev_dataset.select(range(1))

# Train on the small subsets, then write the resulting model to disk.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)
trainer.train()
trainer.save_model("saved_model")
|