{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from datasets import Dataset, load_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
68451Anoop Bikram Shahi (also Anup Bikram Shahi) (N...0
51106The Olympus scandal was precipitated on 14 Oct...0
102697Leslie Herbert \"Speedy\" Duncan (August 10, 194...0
97245Elaine Rochelle Sisman (born January 20, 1952)...1
276647Typhoon Pamela was a powerful typhoon that str...0
24973Bernard Hartmut Breslauer (1 July 1918 – 14 Au...0
277295Rwamagana is a city and capital of both the Rw...0
63059Sir Lewis Cohen (23 December 1849 – 5 December...1
19267SN 2005gl was a supernova in the barred-spiral...0
239340Sulgrave Manor, Sulgrave, Northamptonshire, En...0
\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "68451 Anoop Bikram Shahi (also Anup Bikram Shahi) (N... 0\n", "51106 The Olympus scandal was precipitated on 14 Oct... 0\n", "102697 Leslie Herbert \"Speedy\" Duncan (August 10, 194... 0\n", "97245 Elaine Rochelle Sisman (born January 20, 1952)... 1\n", "276647 Typhoon Pamela was a powerful typhoon that str... 0\n", "24973 Bernard Hartmut Breslauer (1 July 1918 – 14 Au... 0\n", "277295 Rwamagana is a city and capital of both the Rw... 0\n", "63059 Sir Lewis Cohen (23 December 1849 – 5 December... 1\n", "19267 SN 2005gl was a supernova in the barred-spiral... 0\n", "239340 Sulgrave Manor, Sulgrave, Northamptonshire, En... 0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df= pd.read_csv('data/AI_checker_remade.csv')\n", "\n", "\n", "df.sample(10)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
103386Huang Yu-chun (born 16 November 1939 in Shanto...0
13297John Kane (7 April 1921 – 27 February 2002) wa...1
65718Vasiliev equations are formally consistent gau...0
63724Lauderdale Oval or Lauderdale Sports Ground \\n...1
86145Earl Zebedee Hooker (January 15, 1930 – April ...0
152381Agnes Strickland (18 July 1796 – 8 October 185...1
201492Prinsep may mean any of several notable member...0
159645Jana Herzen is a singer-songwriter with a foc...1
7977Cincy Blues Fest is an annual blues music fest...0
261224Bitti Mohanty, also called Bitihotra Mohanty i...0
\n", "
" ], "text/plain": [ " text label\n", "103386 Huang Yu-chun (born 16 November 1939 in Shanto... 0\n", "13297 John Kane (7 April 1921 – 27 February 2002) wa... 1\n", "65718 Vasiliev equations are formally consistent gau... 0\n", "63724 Lauderdale Oval or Lauderdale Sports Ground \\n... 1\n", "86145 Earl Zebedee Hooker (January 15, 1930 – April ... 0\n", "152381 Agnes Strickland (18 July 1796 – 8 October 185... 1\n", "201492 Prinsep may mean any of several notable member... 0\n", "159645 Jana Herzen is a singer-songwriter with a foc... 1\n", "7977 Cincy Blues Fest is an annual blues music fest... 0\n", "261224 Bitti Mohanty, also called Bitihotra Mohanty i... 0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rename(columns={\n", " \"Paragraph\": \"text\", \n", " \"AI_generated\": \"label\"\n", "}, \n", " inplace=True\n", ")\n", "\n", "df.sample(10)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# df_dict= df.to_dict()\n", "# df_dict" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 300000\n", "})" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Dataset.from_pandas(df)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 300000\n", "})" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_df= Dataset.from_pandas(df)\n", "dataset_df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 300000\n", "})" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_df" ] }, { "cell_type": 
"code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# df_dict.keys()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# dataset_df[:10]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 240000\n", " })\n", " test: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 60000\n", " })\n", "})" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_data= dataset_df.train_test_split(test_size=0.2)\n", "new_data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'text': 'The Hillbrow Tower (formerly JG Strijdom Tower) is a luxury residential skyscraper located in the Hillbrow district of Johannesburg, South Africa. The tower was completed in 2006 and has 48 floors. It is the tallest residential building in Johannesburg and South Africa.\\n\\nThe tower is named after JG Strijdom, the first Prime Minister of South Africa. 
The tower has been controversial since its inception, as it has been accused of being an eyesore and a symbol of inequality in Johannesburg.',\n", " 'label': 1}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_data['test'][10]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "# Tokenizer for the same checkpoint name that the model is loaded from.\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def preprocess_function(examples):\n", "    \"\"\"Tokenize the ``text`` entries of a batch.\n", "\n", "    Applied through ``new_data.map(..., batched=True)``. The texts are passed\n", "    to the module-level ``tokenizer`` with ``truncation=True``; no padding is\n", "    requested here.\n", "    \"\"\"\n", "    batch_texts = examples[\"text\"]\n", "    return tokenizer(batch_texts, truncation=True)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 240000/240000 [01:03<00:00, 3756.57 examples/s]\n", "Map: 100%|██████████| 60000/60000 [00:15<00:00, 3767.88 examples/s]\n" ] } ], "source": [ "# Tokenize both splits batch-wise.\n", "tokenized_df = new_data.map(preprocess_function, batched=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-10-06 10:06:53.343215: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-10-06 10:06:55.238543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], "source": [ "from transformers import DataCollatorWithPadding\n", "\n", "# Padding is left to the collator; the tokenization step only truncates.\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", 
"execution_count": 29, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "accuracy = evaluate.load(\"accuracy\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", " return accuracy.compute(predictions=predictions, references=labels)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "id2label = {0: \"NEGATIVE\", 1: \"POSITIVE\"}\n", "label2id = {\"NEGATIVE\": 0, \"POSITIVE\": 1}" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# from transformers import create_optimizer\n", "# import tensorflow as tf\n", "\n", "# batch_size = 16\n", "# num_epochs = 5\n", "# batches_per_epoch = len(tokenized_df[\"train\"]) // batch_size\n", "# total_train_steps = int(batches_per_epoch * num_epochs)\n", "# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# # from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "\n", "# # model = AutoModelForSequenceClassification.from_pretrained(\n", "# # \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n", "# # )\n", "\n", "# from transformers import TFAutoModelForSequenceClassification\n", "\n", "# model = TFAutoModelForSequenceClassification.from_pretrained(\n", "# \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n", "# )" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and 
are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "\n", "# Sequence-classification model from the same checkpoint name as the tokenizer;\n", "# the number of labels follows the id2label mapping (two entries).\n", "model = AutoModelForSequenceClassification.from_pretrained(\n", "    \"distilbert-base-uncased\",\n", "    num_labels=len(id2label),\n", "    id2label=id2label,\n", "    label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# tf_train_set = model.prepare_tf_dataset(\n", "# tokenized_df[\"train\"],\n", "# shuffle=True,\n", "# batch_size=16,\n", "# collate_fn=data_collator,\n", "# )\n", "\n", "# tf_validation_set = model.prepare_tf_dataset(\n", "# tokenized_df[\"test\"],\n", "# shuffle=False,\n", "# batch_size=16,\n", "# collate_fn=data_collator,\n", "# )" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# import tensorflow as tf\n", "\n", "# model.compile(optimizer=optimizer) # No loss argument!" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [30000/30000 1:25:18, Epoch 2/2]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracy
10.0178000.0749630.984383
20.0016000.0392180.993867

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=30000, training_loss=0.018804504087567328, metrics={'train_runtime': 5119.9395, 'train_samples_per_second': 93.751, 'train_steps_per_second': 5.859, 'total_flos': 4.559380077797894e+16, 'train_loss': 0.018804504087567328, 'epoch': 2.0})" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_args = TrainingArguments(\n", " output_dir=\"my_awesome_model\",\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=16,\n", " num_train_epochs=2,\n", " weight_decay=0.01,\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " load_best_model_at_end=True,\n", " # push_to_hub=True,\n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_df[\"train\"],\n", " eval_dataset=tokenized_df[\"test\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", ")\n", "\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# from transformers.keras_callbacks import KerasMetricCallback\n", "\n", "# metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# callbacks = [metric_callback]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('models/tokenizer_v1/tokenizer_config.json',\n", " 'models/tokenizer_v1/special_tokens_map.json',\n", " 'models/tokenizer_v1/vocab.txt',\n", " 'models/tokenizer_v1/added_tokens.json',\n", " 'models/tokenizer_v1/tokenizer.json')" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"tokenizer.save_pretrained(\"models/tokenizer_v1\")\n", "model.save_pretrained(\"models/trained_model_v1\")\n", "\n", "tokenizer.save_pretrained(\"models/trained_model_v11\")\n", "model.save_pretrained(\"models/trained_model_v11\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }