{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d5c4b5c2-8c0a-4cbe-8997-1a98c14be2e4",
   "metadata": {},
   "source": [
    "A text classification model using libraries like NLTK or SpaCy. It includes some PII data within the code (e.g., hard-coded email addresses or phone numbers for testing purposes) and a few API tokens/secrets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f95fa380-34d0-455d-8002-ebe5f829542c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vulnerable libraries\n",
    "!pip install django==1.11.15\n",
    "!pip install flask==0.12.2\n",
    "!pip install numpy==1.16.0\n",
    "!pip install requests==2.19.1\n",
    "!pip install scikit-learn==0.19.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25315022-9da9-4c29-8326-6532d261dd56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-permissible licensed libraries\n",
    "import gmpy2\n",
    "import oct2py\n",
    "import pygsl\n",
    "from PyQt5 import QtCore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "489ad824-285c-4219-afc6-073192d54f3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Required Libraries for our task\n",
    "import nltk\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "109d2f98-4d6d-42d9-acb4-2f195af051d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PII Data\n",
    "email = \"john.doe@example.com\"\n",
    "phone = \"123-456-7890\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d637e295-0953-4980-bf99-c7e7e509e876",
   "metadata": {},
   "outputs": [],
   "source": [
    "# API Keys and secrets\n",
    "fb_app_secret = \"3e4a22bb7e6b2c38b7809234b3ee782b\"\n",
    "db_credentials = \"username:password@localhost:5432/mydatabase\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6493567-ad7f-4b87-95e4-5068a09fca92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download nltk data\n",
    "# Let NLTK pick its default download directory instead of the hard-coded\n",
    "# '/nltk_data/': that path needs write access to the filesystem root and is\n",
    "# not on NLTK's default search path unless NLTK_DATA points there.\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f94e191-bfe7-4e54-9dbf-4d2484b0dbe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text Classification\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8552e84a-e164-4519-8ce8-959c7dd277ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Data\n",
    "# Only four of the newsgroups are used; random_state fixes the shuffle order\n",
    "# so repeated runs see the training documents in the same order.\n",
    "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
    "twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abc38386-e63f-4d22-81dc-1785ac8f043b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature Extraction\n",
    "# Token counts first, then TF-IDF weighting on top of those counts.\n",
    "# Both fitted objects are kept because the prediction cell reuses them.\n",
    "count_vect = CountVectorizer()\n",
    "X_train_counts = count_vect.fit_transform(twenty_train.data)\n",
    "tfidf_transformer = TfidfTransformer()\n",
    "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ea57698-12ff-48b3-a8b6-bb8dffabbc5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Model\n",
    "clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5fa6532-594c-4790-a630-83388c556591",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict\n",
    "# New documents go through transform() (not fit_transform()) so they are\n",
    "# encoded with the vocabulary and IDF weights learned from the training set.\n",
    "docs_new = ['God is love', 'OpenGL on the GPU is fast']\n",
    "X_new_counts = count_vect.transform(docs_new)\n",
    "X_new_tfidf = tfidf_transformer.transform(X_new_counts)\n",
    "predicted = clf.predict(X_new_tfidf)\n",
    "for doc, category in zip(docs_new, predicted):\n",
    "    print(f'{doc!r} => {twenty_train.target_names[category]}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}