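"""Run the fine-tuned DistilBERT fake-news classifier on a new dataset.

Loads the best checkpoint from training run version 3, tokenizes
./data_3/news_articles.csv (or reuses a cached tokenized copy on disk),
and reports accuracy, F1, and ROC-AUC for the predictions.
"""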
import torch
from transformers import (
DistilBertTokenizer,
DistilBertForSequenceClassification,
Trainer,
)
from datasets import Dataset, load_from_disk
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from functools import partial
import os
version = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )
    # Load the model
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)
    # Load and prepare the new dataset
    if os.path.exists(cleaned_path):
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")
        # # Load the datasets
        # true_news = pd.read_csv("data_1/True.csv")
        # fake_news = pd.read_csv("data_1/Fake.csv")
        # # Add labels
        # true_news["label"] = 1
        # fake_news["label"] = 0
        # # Combine the datasets
        # df = pd.concat([true_news, fake_news], ignore_index=True)
        df = pd.read_csv("./data_3/news_articles.csv")
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )
        # Map Real to 1 and Fake to 0
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]
        # Drop rows where the language is not 'english'
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)
        # Convert "no title" to empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)
        df.dropna(inplace=True)
        # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )
        # Remove patterns like "Featured image via author name / image place"
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )
        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)
        dataset = Dataset.from_pandas(df)

        def tokenize(tokenizer, examples):
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # Use partial to create a new function that has tokenizer as its first argument
        tokenize_with_tokenizer = partial(tokenize, tokenizer)
        # Use tokenize_with_tokenizer in the map function
        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")
    # Passing the tokenizer gives a collator that pads each prediction batch to
    # a common length, since the map step above padded per chunk, not globally.
    trainer = Trainer(model=model, tokenizer=tokenizer)
    predictions = trainer.predict(dataset)
    preds = predictions.predictions.argmax(-1)
    # predictions.predictions holds the raw logits; ROC-AUC needs the
    # positive-class probability rather than the hard 0/1 prediction.
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1)[:, 1].numpy()
    accuracy = accuracy_score(predictions.label_ids, preds)
    f1 = f1_score(predictions.label_ids, preds)
    auc_roc = roc_auc_score(predictions.label_ids, probs)
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")