# --- IMPORTS ---
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# -------------------------
# --- INPUT ---
input_list = [""" WRITE HERE YOUR FIRST ESSAY """,
""" WRITE HERE YOUR SECOND ESSAY """]
# -------------
# --- USEFUL FUNCTIONS ----
def clean_text(text):
    """
    Removes non-alphabetical characters and stopwords, and lowercases the text.
    Args:
        text (str): The text to be cleaned
    Returns:
        text (str): The cleaned text
    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    # Keep only the non-stopword tokens, then join them back into a string
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)
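# A quick sanity check of clean_text (a sketch; the output assumes NLTK's
# English stopword list, which includes "this", "is", "an" and "with"):
#   clean_text("This is AN example, with 123 numbers!")
#   -> 'example numbers'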
def tokenize_function(dataframe):
    """
    Tokenizes the 'text' field of the dataframe.
    Args:
        dataframe (pandas.DataFrame): The dataframe to be tokenized
    Returns:
        dataframe (pandas.DataFrame): The tokenized dataframe
    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    return tokenizer(dataframe["text"], truncation=True)
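# Sketch of what the tokenizer returns for a single string (the exact fields
# depend on the underlying tokenizer; the values below are illustrative):
#   tokenizer("sample essay", truncation=True)
#   -> {'input_ids': [...], 'token_type_ids': [...], 'attention_mask': [...]}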
def compute_metrics(eval_pred):
    """
    Computes the accuracy, precision, recall and f1 score of the model.
    It is passed to the trainer and its results are reported when evaluating
    the model.
    Args:
        eval_pred (tuple): The predictions and labels of the model
    Returns:
        dict: The accuracy, precision, recall and f1 score of the model
    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
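# Minimal sketch of compute_metrics on dummy data (the logits and labels
# below are made up purely for illustration):
#   logits = np.array([[0.1, 0.9], [0.8, 0.2]])   # argmax -> classes [1, 0]
#   labels = np.array([1, 0])
#   compute_metrics((logits, labels))
#   -> {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}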
# -------------------------
# --- LOADING THE MODEL ---
# Load the tokenizer and the fine-tuned model; the checkpoint is already configured to classify into 2 labels (HUMAN vs AI)
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# -------------------------
# --- DATA PREPROCESSING ---
n_input = len(input_list)
# Now we convert the input to a dataset
df = pd.DataFrame({'text': input_list})
# Get rid of non-alphabetical characters and stopwords, and lowercase the text
df['text'] = df['text'].apply(clean_text)
# Convert the pandas dataframe into a Hugging Face dataset and tokenize it
ds = Dataset.from_pandas(df)
ds_token = ds.map(tokenize_function, batched=True)
# Drop columns that are not necessary and set the dataset format to PyTorch tensors
ds_token = ds_token.remove_columns(["text", "token_type_ids"])
ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
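# At this point each row holds tensors ready for the model, e.g. (shapes
# depend on the essay length; the values are illustrative):
#   ds_token[0] -> {'input_ids': tensor([...]), 'attention_mask': tensor([...])}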
# -------------------------
# --- INSTANTIATING TRAINER ----
# We instantiate a DataCollatorWithPadding so the inputs are padded dynamically, batch by batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Create the training arguments (the output directory is only a placeholder, since we never train here)
training_args = TrainingArguments(output_dir=".")
# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=ds_token,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# -------------------------
# --- PREDICT ---
# We predict and then format the output
predictions = trainer.predict(ds_token)
predictions = torch.from_numpy(predictions.predictions)
predictions = torch.nn.functional.softmax(predictions, dim=-1)
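# Each softmax row now sums to 1 across the two classes: column 0 holds the
# HUMAN probability and column 1 the AI probability, which is the label
# order used by the printing loop below.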
print('\n\n')
for i in range(n_input):
    index = torch.argmax(predictions[i])
    if index == 0:
        print(f'{i+1}: HUMAN with {round(predictions[i][0].item() * 100, 2)}% confidence.')
    else:
        print(f'{i+1}: AI with {round(predictions[i][1].item() * 100, 2)}% confidence.')
# -------------------------