{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"fh_udV1UwMAI","executionInfo":{"status":"ok","timestamp":1683371603134,"user_tz":-120,"elapsed":3355,"user":{"displayName":"Rachel","userId":"14306854771685514192"}}},"outputs":[],"source":["# importing\n","import pandas as pd\n","import numpy as np\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.metrics import classification_report\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.model_selection import train_test_split\n","import nltk"]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/gdrive')\n","%cd gdrive/MyDrive/NLP_Project"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mQRfWLf7xser","executionInfo":{"status":"ok","timestamp":1683371625856,"user_tz":-120,"elapsed":22729,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"022758a4-467c-40ef-91c5-2c8a3abe605c"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive\n","/content/gdrive/MyDrive/NLP_Project\n"]}]},{"cell_type":"code","source":["df = pd.read_csv('/content/gdrive/MyDrive/NLP_Project/training.csv', usecols=['text', 'label'])"],"metadata":{"id":"O9lenMZmxvGh","executionInfo":{"status":"ok","timestamp":1683371626702,"user_tz":-120,"elapsed":850,"user":{"displayName":"Rachel","userId":"14306854771685514192"}}},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":["# Preprocessing"],"metadata":{"id":"knup3EKfxAdQ"}},{"cell_type":"code","source":["import re\n","import spacy\n","\n","nlp = spacy.load('en_core_web_sm')\n","\n","def clean_text(text):\n"," # Remove HTML tags\n"," html_regex = re.compile('<.*?>')\n"," text = html_regex.sub('', text)\n"," \n"," # Remove URLs\n"," 
url_regex = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n"," text = url_regex.sub('', text)\n"," \n"," # Convert to lowercase\n"," text = text.lower()\n"," \n"," # Remove special characters\n"," regex = re.compile(r'[^a-zA-Z\\d\\s:]')\n"," text = regex.sub('', text)\n"," \n"," # Replace newline characters with spaces\n"," text = text.replace('\\n', ' ')\n"," \n"," return text\n"," "],"metadata":{"id":"87r740XLxqN4"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df['text'].apply(clean_text)"],"metadata":{"id":"pT0nAsZvy25M","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683367680207,"user_tz":-120,"elapsed":4,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"055c45b8-bb34-4d17-9266-769e56c3148b"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 matt fitzpatrick defeats jordan spieth in dram...\n","1 who warns some hospitals in khartoum short of ...\n","2 excavation shows facility included luxurious d...\n","3 knight hits hattrick as usa stun rivals canada...\n","4 european council chiefs costly private jet use...\n"," ... 
\n","1594 in the first of a new series of posts on the s...\n","1595 and there are lots of products we simply dont ...\n","1596 weve subsequently learned that firm departures...\n","1597 iraq is ready to commit to freezing the civil ...\n","1598 headquartered in zurich numbrs was one of the ...\n","Name: text, Length: 1599, dtype: object"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["# check how evenly distributed the two class labels (human vs. llm) are\n","import matplotlib.pyplot as plt\n","classes = df['label'].unique()\n","counts = []\n","\n","for i in classes:\n"," count = len(df[df['label']==i])\n"," counts.append(count)\n","\n","print(df.label.value_counts())\n","print(\"----------------------------------------------------\")\n","\n","plt.bar(['human', 'llm'], counts)\n","plt.show();"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":503},"id":"eC4Cri8fjkjJ","executionInfo":{"status":"ok","timestamp":1683368367896,"user_tz":-120,"elapsed":751,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"2c8ac9f2-09dd-4870-f090-468f542d7604"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["llm 917\n","human 916\n","Name: label, dtype: int64\n","----------------------------------------------------\n"]},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"markdown","source":["# Model"],"metadata":{"id":"Cbh05-2RxCmi"}},{"cell_type":"code","source":["# define X and y\n","X = df.drop(columns=['label'])\n","y = df.label\n","\n","X.shape, y.shape"],"metadata":{"id":"uHL4dNnpy7RF","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683368376050,"user_tz":-120,"elapsed":2,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"85c17ff9-1052-471c-9f4f-dadbcd3af049"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((1833, 1), (1833,))"]},"metadata":{},"execution_count":30}]},{"cell_type":"code","source":["# split the train and test data, 67:33 (test_size=.33)\n","from sklearn.model_selection import train_test_split\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)\n","\n","# check it's split correctly\n","print(X_train.shape, y_train.shape)\n","print(X_test.shape, y_test.shape)"],"metadata":{"id":"OneP3I8wzAca","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683368376508,"user_tz":-120,"elapsed":6,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"f0e149c6-7a0d-4037-d407-52ab629bf335"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(1228, 1) (1228,)\n","(605, 1) (605,)\n"]}]},{"cell_type":"code","source":["print(X_train.iloc[0], y_train.iloc[0])"],"metadata":{"id":"9zcCTN-B6uGW","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683368376509,"user_tz":-120,"elapsed":5,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"2b80f68e-f428-445b-d809-86c589535c85"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["text Banks have already started ditching payments w...\n","Name: 1395, dtype: object human\n"]}]},{"cell_type":"code","source":["# vectorize 
the text\n","vectorizer = TfidfVectorizer(tokenizer=lambda text: [token.text for token in nlp(text)], # takes the text and tokenizes it\n"," preprocessor=lambda text: text.strip(), # strip leading/trailing whitespace from each raw document before it is tokenized\n"," ngram_range=(1, 2), # unigrams and bigrams\n"," min_df=5) # each unigram/bigram must appear in at least 5 documents"],"metadata":{"id":"Q_H-ay4GJmf3"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# vectorize the train and test sets\n","X_train = vectorizer.fit_transform(X_train['text'])\n","X_test = vectorizer.transform(X_test['text'])"],"metadata":{"id":"JguM77n5JoEE","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683368531736,"user_tz":-120,"elapsed":154452,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"7c67acbe-2dd5-410d-81a3-0815672687ef"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n"," warnings.warn(\n"]}]},{"cell_type":"code","source":["# train the model\n","logreg = LogisticRegression(max_iter=750, C=100, solver=\"saga\")\n","logreg.fit(X_train, y_train)"],"metadata":{"id":"7dNWamA4Jo56","colab":{"base_uri":"https://localhost:8080/","height":131},"executionInfo":{"status":"ok","timestamp":1683368536927,"user_tz":-120,"elapsed":5194,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"8ebd3866-4160-4295-de4d-fd4307768927"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n"," warnings.warn(\n"]},{"output_type":"execute_result","data":{"text/plain":["LogisticRegression(C=100, max_iter=750, solver='saga')"],"text/html":["
LogisticRegression(C=100, max_iter=750, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":35}]},{"cell_type":"markdown","source":["# Evaluation"],"metadata":{"id":"RZlLDa0_xGE2"}},{"cell_type":"code","source":["# evaluate the model\n","predictions = logreg.predict(X_test)\n","print(classification_report(y_test, predictions))"],"metadata":{"id":"rvcCcENSLrZj","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1683368649222,"user_tz":-120,"elapsed":380,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"21e478ff-977d-49f9-9f92-ec3ec5b5d584"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" precision recall f1-score support\n","\n"," human 0.98 1.00 0.99 297\n"," llm 1.00 0.98 0.99 308\n","\n"," accuracy 0.99 605\n"," macro avg 0.99 0.99 0.99 605\n","weighted avg 0.99 0.99 0.99 605\n","\n"]}]},{"cell_type":"code","source":["# confusion matrix\n","from sklearn.metrics import classification_report, ConfusionMatrixDisplay\n","ConfusionMatrixDisplay.from_predictions(y_test, predictions);"],"metadata":{"id":"y_vdZnhcLxrz","colab":{"base_uri":"https://localhost:8080/","height":454},"executionInfo":{"status":"ok","timestamp":1683368672908,"user_tz":-120,"elapsed":394,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"351334f4-edd1-497f-82ac-5756fc9324de"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"markdown","source":["# Save the model and create a directory"],"metadata":{"id":"3KSpFuq3xIqK"}},{"cell_type":"code","source":["!pip install transformers"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8J9NQlEvxLU4","executionInfo":{"status":"ok","timestamp":1683371748676,"user_tz":-120,"elapsed":7308,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"d020932e-d0c5-4fdb-caa7-3fec77f14f4a"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n","Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from 
huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n"]}]},{"cell_type":"code","source":["!pip install huggingface-cli"],"metadata":{"id":"6hu8FfJBxlm-","executionInfo":{"status":"ok","timestamp":1683371750524,"user_tz":-120,"elapsed":1861,"user":{"displayName":"Rachel","userId":"14306854771685514192"}},"outputId":"1bb46eda-e34b-400b-ac14-ff3924700e57","colab":{"base_uri":"https://localhost:8080/"}},"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","\u001b[31mERROR: Could not find a version that satisfies the requirement huggingface-cli (from versions: none)\u001b[0m\u001b[31m\n","\u001b[0m\u001b[31mERROR: No matching distribution found for huggingface-cli\u001b[0m\u001b[31m\n","\u001b[0m"]}]},{"cell_type":"code","source":["huggingface-cli repo create "],"metadata":{"id":"HnbjPtJZxfHQ"},"execution_count":null,"outputs":[]}]}