|
import torch |
|
from transformers import ( |
|
DistilBertTokenizer, |
|
DistilBertForSequenceClassification, |
|
Trainer, |
|
) |
|
from datasets import Dataset, load_from_disk |
|
import pandas as pd |
|
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score |
|
from functools import partial |
|
import os |
|
|
|
# Experiment tag: selects which versioned model/dataset artifacts to load.
version = 3

# Prefer the GPU when one is present; otherwise run inference on CPU.
_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ok else "cpu")
print("Device:", device)
|
|
|
|
|
if __name__ == "__main__":

    # Tokenizer must match the base checkpoint the model was fine-tuned from.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # On-disk cache of the tokenized inference split for this version.
    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned classifier for this experiment version.
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)

    if os.path.exists(cleaned_path):
        # Reuse the previously tokenized dataset.
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")

        df = pd.read_csv("./data_3/news_articles.csv")
        # Drop metadata columns not used as model input.
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )

        # Map string labels to ints; unmapped values become NaN and are
        # removed by the isin() filter on the next line.
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]

        # Keep only English-language articles, then drop the helper column.
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # "no title" is the dataset's sentinel for a missing title.
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)

        # Strip Reuters datelines/markers so the model cannot key on them
        # as a trivial signal for the "Real" class.
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )

        # Remove "Featured image via ..." boilerplate sentences.
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )

        # Prepend the title to the body and keep only the model inputs.
        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)

        dataset = Dataset.from_pandas(df)

        def tokenize(tokenizer, examples):
            # Batched tokenization; truncate to DistilBERT's 512-token limit.
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # partial() rather than a closure so the function pickles cleanly
        # for the multiprocess map below.
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")

    # Trainer with default args is only used here as a batched predict loop.
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)

    # Hard class predictions (argmax over logits) for accuracy and F1.
    preds = predictions.predictions.argmax(-1)
    # AUC-ROC requires a continuous score, not hard labels — computing it
    # from argmax degenerates to balanced accuracy. Use the softmax
    # probability of the positive ("Real" = 1) class instead.
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1)[:, 1]

    accuracy = accuracy_score(predictions.label_ids, preds)
    f1 = f1_score(predictions.label_ids, preds)
    auc_roc = roc_auc_score(predictions.label_ids, probs.numpy())

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")
|
|