from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)
# Reload dataset and tokenizer
dataset = load_dataset("liar")
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
# Collapse the six LIAR labels into a binary scheme:
# 1 = fake-leaning (pants-fire / false / barely-true), 0 = otherwise
label_names = dataset["train"].features["label"].names

def simplify_label(example):
    name = label_names[example["label"]]
    example["label"] = int(name in ["pants-fire", "false", "barely-true"])
    return example

dataset = dataset.map(simplify_label)
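# Optional sanity check (our addition, not part of the original pipeline):
# inspect the binary label distribution after the mapping.
from collections import Counter
print(Counter(dataset["train"]["label"]))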
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Tokenize the "statement" field (combining it with the speaker/context
# metadata could be tried later for improved performance):
def tokenize(example):
    return tokenizer(example["statement"], truncation=True, padding="max_length", max_length=128)
# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
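# Optional inspection (our addition): after set_format, indexing a single example
# returns torch tensors for the formatted columns, e.g. input_ids of shape (128,).
print({k: tuple(v.shape) for k, v in tokenized_dataset["train"][0].items()})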
# Load the fine-tuned model from the local checkpoint directory
model = AutoModelForSequenceClassification.from_pretrained("models/bert-liar-fake-news")
# Set up Trainer for evaluation
training_args = TrainingArguments(output_dir="./results", per_device_eval_batch_size=8)
trainer = Trainer(model=model, args=training_args)
# Evaluate on the held-out test split
metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics)
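# Note: without a compute_metrics function, trainer.evaluate() reports only
# eval_loss and throughput. A minimal sketch for adding accuracy and F1
# (assumes scikit-learn is installed; this helper is ours, not from the
# original script):
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds)}

trainer = Trainer(model=model, args=training_args, compute_metrics=compute_metrics)
metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics)  # now includes eval_accuracy and eval_f1 alongside eval_loss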