diff --git "a/Resume_ranking_and_chunking (1).ipynb" "b/Resume_ranking_and_chunking (1).ipynb" new file mode 100644--- /dev/null +++ "b/Resume_ranking_and_chunking (1).ipynb" @@ -0,0 +1,2388 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H-2L-S6b4ukm", + "outputId": "a13cc6d9-211b-4ad3-d1b3-c1f1bb0d63b3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.15.0)\n", + "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.19.4)\n", + "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", + "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (10.0.1)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", + "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n", + "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.5.0)\n", + "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.1.0+cu121)\n", + "Requirement already 
satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.0+cu121)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", + "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.1.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", + "Requirement already satisfied: 
MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n" + ] + } + ], + "source": [ + "pip install transformers datasets huggingface_hub sentence-transformers" + ] + }, + { + "cell_type": "code", + "source": [ + "import re\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "import torch\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n", + "import pandas as pd\n", + "from tqdm import tqdm" + ], + "metadata": { + "id": "Jk533_F14yV8" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load your unlabeled dataset\n", + "resumes = pd.read_csv('/content/MyResume2.csv')" + ], + "metadata": { + "id": "IR-KIxHd5iyu" + }, + "execution_count": 14, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "resumes.head(5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "Y0sgNBwr5mzH", + "outputId": "7d303582-9067-4426-ad13-0d86a4bba2df" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Resumes\n", + "0 Global Sales Administrator Biamp Systems Globa...\n", + "1 Python Developer PythonIT ...\n", + "3 UI Front End Developer UI Fro...\n", + "4 IT Security Analyst IT..." + ], + "text/html": [ + "\n", + "
\n", + "<table border=\"1\" class=\"dataframe\">\n", + "  <tr><th></th><th>Resumes</th></tr>\n", + "  <tr><td>0</td><td>Global Sales Administrator Biamp Systems Globa...</td></tr>\n", + "  <tr><td>1</td><td>Python Developer <span class=\"hl\">Python</span...</td></tr>\n", + "  <tr><td>2</td><td>IT Project Manager <span class=\"hl\">IT</span> ...</td></tr>\n", + "  <tr><td>3</td><td>UI Front End Developer UI <span class=\"hl\">Fro...</td></tr>\n", + "  <tr><td>4</td><td>IT Security Analyst <span class=\"hl\">IT</span>...</td></tr>\n", + "</table>
\n" ] }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Strip the <span class=\"hl\"> highlight markup left in the scraped resumes\n", + "def clean_text(text):\n", + "    return re.sub(r'<span class=\"hl\">(.*?)</span>', r'\\1', text)\n", + "# Apply the function to the entire column\n", + "resumes['Resumes'] = resumes['Resumes'].apply(clean_text)" + ], + "metadata": { + "id": "MrCrvWv65nAw" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('punkt')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aUdNZquW4yXo", + "outputId": "39821786-1a0b-4cfa-c2cc-86622675ccee" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('stopwords')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "09C8uhGu51Vh", + "outputId": "0dd144a6-228e-4522-cddf-1a613642e995" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def clean_resume(resume):\n", + "    if isinstance(resume, str):\n", + "        # Convert to lowercase\n", + "        resume = resume.lower()\n", + "\n", + "        # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace\n", + "        resume = re.sub('http\\S+\\s*|RT|cc|#\\S+|@\\S+|[^\\x00-\\x7f]|[^\\w\\s]', ' ', resume)\n", + "        resume = re.sub('\\s+', ' ', resume).strip()\n", + "\n", + "        return resume\n", + "    else:\n", + "        return ''\n", + "\n", + "# Apply the cleaning function to the dataset\n", + "resumes['Resumes'] = resumes['Resumes'].apply(lambda x: clean_resume(x))" + ], + "metadata": { + "id": "TWyPQ63w51kN" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n", + "import torch\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from tqdm import tqdm\n", + "\n", + "# Load the pre-trained model\n", + "mpnet = \"sentence-transformers/all-mpnet-base-v2\"\n", + "tokenizer = AutoTokenizer.from_pretrained(mpnet)\n", + "pretrained_model = AutoModelForMaskedLM.from_pretrained(mpnet)\n", + "\n", + "# Assuming 'resumes' is a DataFrame with a column named 'Resumes'\n", + "texts = resumes['Resumes'].tolist()\n", + "\n", + "# Tokenize and encode the unlabeled data\n", + "encodings = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')\n", + "\n", + "# Create a TensorDataset\n", + "dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])\n", + "\n", + "# Move the model to the appropriate device (CPU or GPU)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "pretrained_model.to(device)\n", + "\n", + "# Initialize the optimizer\n", + "optimizer = 
AdamW(pretrained_model.parameters(), lr=1e-5)\n", + "\n", + "batch_size = 8\n", + "epochs = 3\n", + "import math\n", + "\n", + "# Experiment with different chunk sizes\n", + "chunk_sizes_to_try = [200] # Can add more sizes later\n", + "\n", + "for chunk_size in chunk_sizes_to_try:\n", + " for epoch in range(epochs):\n", + " tqdm_dataloader = tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=True), desc=f'Epoch {epoch + 1}/{epochs}')\n", + "\n", + " pretrained_model.train()\n", + " for batch in tqdm_dataloader:\n", + " input_ids, attention_mask = batch\n", + " input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)\n", + "\n", + " # Calculate number of chunks for current batch\n", + " sequence_length = input_ids.size(1) # Get actual sequence length\n", + " num_chunks = math.ceil(sequence_length / chunk_size)\n", + "\n", + " for i in range(num_chunks):\n", + " start_idx = i * chunk_size\n", + " end_idx = min((i + 1) * chunk_size, sequence_length) # Handle final chunk\n", + "\n", + " # Extract chunk data\n", + " input_ids_chunk = input_ids[:, start_idx:end_idx]\n", + " attention_mask_chunk = attention_mask[:, start_idx:end_idx]\n", + "\n", + " # Forward pass\n", + " outputs = pretrained_model(\n", + " input_ids_chunk, attention_mask=attention_mask_chunk, labels=input_ids_chunk.reshape(-1)\n", + " )\n", + "\n", + " # Calculate loss\n", + " loss = outputs.loss\n", + "\n", + " # Backward pass and optimization\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # Update progress bar\n", + " tqdm_dataloader.set_postfix({'Loss': loss.item(), 'Chunk Size': chunk_size})" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kypmxXhz4ybO", + "outputId": "0b46968a-5485-4e2e-dc4d-f1c9e023090b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']\n", + "- This IS expected if you are initializing MPNetForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing MPNetForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of MPNetForMaskedLM were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "Epoch 1/3: 2%|▏ | 67/3202 [01:06<54:36, 1.04s/it, Loss=3.44, Chunk Size=200]" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GrFenXiepxRA", + "outputId": "84732ce8-bd2f-40de-a7e4-573ffa880ade" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/gdrive\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pretrained_model.save_pretrained('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_base_v2')\n", + "tokenizer.save_pretrained('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_base_v2')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "h2jJmGCQgnUK", + "outputId": "5c8811d8-bb6f-46be-a5b2-1112d07b5d91" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v2/tokenizer_config.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v2/special_tokens_map.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v2/vocab.txt',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v2/added_tokens.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v2/tokenizer.json')" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "ls" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FxxiUXhStwLk", + "outputId": "12b2967d-c5fa-405d-87e9-70dbf78eea55" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[0m\u001b[01;34mfine_tuned_mpnet\u001b[0m/ \u001b[01;34mfine_tuned_resumes\u001b[0m/ resumes6000.csv the_resumesFirst.csv\n", + "fine_tuned_mpnet.zip \u001b[01;34mgdrive\u001b[0m/ \u001b[01;34msample_data\u001b[0m/\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pwd" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "x4KmqwXJt2AK", + "outputId": "8f359dba-ee83-41f7-da9c-755dda297926" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 96 + } + ] + }, + { + "cell_type": "code", + "source": [ + "cd /content" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "na5_rkhhtIFS", + "outputId": 
"039a56d3-e2ce-49dd-b1a7-450300e2bdf8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Save the fine-tuned model\n", + "pretrained_model.save_pretrained('fine_tuned_mpnet_v1')\n", + "tokenizer.save_pretrained('fine_tuned_mpnet_v1')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Yt-tBcsCvXss", + "outputId": "b025722e-79a8-4361-930a-804a702ac232" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('fine_tuned_mpnet_v1/tokenizer_config.json',\n", + " 'fine_tuned_mpnet_v1/special_tokens_map.json',\n", + " 'fine_tuned_mpnet_v1/vocab.txt',\n", + " 'fine_tuned_mpnet_v1/added_tokens.json',\n", + " 'fine_tuned_mpnet_v1/tokenizer.json')" + ] + }, + "metadata": {}, + "execution_count": 100 + } + ] + }, + { + "cell_type": "code", + "source": [ + "ls" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Wd7txZXhn3YX", + "outputId": "5ea0c99a-6541-4ea9-f6c6-e6bc8daf27ed" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[0m\u001b[01;34mfine_tuned_mpnet\u001b[0m/ fine_tuned_mpnet.zip \u001b[01;34mgdrive\u001b[0m/ \u001b[01;34msample_data\u001b[0m/\n", + "\u001b[01;34mfine_tuned_mpnet_v1\u001b[0m/ \u001b[01;34mfine_tuned_resumes\u001b[0m/ resumes6000.csv the_resumesFirst.csv\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pretrained_model.save_pretrained('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1')\n", + "tokenizer.save_pretrained('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ppk2Js4Tv7-5", + "outputId": "8ac4a813-f674-46fb-a5d4-0323d4f8125a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1/tokenizer_config.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1/special_tokens_map.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1/vocab.txt',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1/added_tokens.json',\n", + " '/content/gdrive/My Drive/Finetunedmodel/fine_tuned_mpnet_v1/tokenizer.json')" + ] + }, + "metadata": {}, + "execution_count": 102 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", + "\n", + "model= 'DeroG/mpnet_new'\n", + "tokenizer = AutoTokenizer.from_pretrained(model)\n", + "model = AutoModelForMaskedLM.from_pretrained(model)" + ], + "metadata": { + "id": "7wXwExyi6rYK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pip install hnswlib" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pKDy0s9g9gcK", + "outputId": "5d796c7e-39cf-4465-a964-6ca4791aba4a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting hnswlib\n", + " Downloading hnswlib-0.8.0.tar.gz (36 kB)\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from hnswlib) (1.23.5)\n", + "Building wheels for collected packages: hnswlib\n", + " Building wheel for hnswlib (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for hnswlib: filename=hnswlib-0.8.0-cp310-cp310-linux_x86_64.whl size=2287618 sha256=f1176656b86f12880d251e5a85b8562e1be1a45d5618ecfa55f021001c459f48\n", + " Stored in directory: /root/.cache/pip/wheels/af/a9/3e/3e5d59ee41664eb31a4e6de67d1846f86d16d93c45f277c4e7\n", + "Successfully built hnswlib\n", + "Installing collected packages: hnswlib\n", + "Successfully installed hnswlib-0.8.0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install -U sentence-transformers" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-inPZMW9-ask", + "outputId": "06585972-e9fc-4b8f-80fc-ea34f49aa247" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.35.2)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.1)\n", + "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.1.0+cu121)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.0+cu121)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.23.5)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", + "Requirement already satisfied: huggingface-hub>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.19.4)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (3.13.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2.31.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (4.5.0)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (23.2)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from 
torch>=1.6.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", + "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.1.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (2023.6.3)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (0.15.0)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (0.4.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2023.11.17)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "# Download the NLTK stopwords\n", + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.corpus import stopwords\n", + "import pickle\n", + "import hnswlib\n", + "import sentence_transformers as st\n", + "from sentence_transformers import SentenceTransformer, util\n", + "import time\n", + "from tqdm import tqdm\n", + "import numpy as np\n", + "from math import ceil\n", + "from torch.nn import functional as F" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eoyL885S9kbH", + "outputId": "d01de86b-0643-4096-dbd5-a183db05a961" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "resumes = 
pd.read_csv(\"/content/the_resumesFirst.csv\")" + ], + "metadata": { + "id": "cWM655E56wvO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "resumes.head(5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "-EKdhiiF9Zgc", + "outputId": "598b2c0d-e0ee-467a-9597-4e3077ad9e24" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Resume Category\n", + "0 CESAR CABAL Financial Analyst ccabal43emailcom... Digital Marketing\n", + "1 Selamawit Yemane Customer Service Manager sela... Digital Marketing\n", + "2 DRISTAN ARTHUR BUDGET ANALYST CONTACT darthure... Digital Marketing\n", + "3 BENTLI FALLA CAREER OBJECTIVE Creative and wit... Digital Marketing\n", + "4 First Last Advertising Copy Writer Bay Area Ca... Digital Marketing" + ], + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 175 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('stopwords')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YPjo60K-AHGv", + "outputId": "3696fdd4-9095-4bb1-8768-06954b86dc3d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 176 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Preprop_resumes = resumes[\"Resume\"]" + ], + "metadata": { + "id": "0o9HS80GE26g" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Function for cleaning and preprocessing the resume\n", + "def clean_resume(resume):\n", + " if isinstance(resume, str):\n", + " # Convert to lowercase\n", + " resume = resume.lower()\n", + "\n", + " # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace\n", + " resume = re.sub('http\\S+\\s*|RT|cc|#\\S+|@\\S+|[^\\x00-\\x7f]|[^\\w\\s]', ' ', resume)\n", + " resume = re.sub('\\s+', ' ', resume).strip()\n", + "\n", + " # Tokenize the resume\n", + " tokens = nltk.word_tokenize(resume)\n", + "\n", + " # Remove stopwords\n", + " stop_words = set(stopwords.words('english'))\n", + " tokens = [token for token in tokens if token.lower() not in stop_words]\n", + "\n", + " # Join the tokens back into a sentence\n", + " preprocessed_resume = ' '.join(tokens)\n", + "\n", + " return preprocessed_resume\n", + " else:\n", + " return ''\n", + "# Applying the cleaning function to a Datasets\n", + "Preprop_resumes = Preprop_resumes.apply(lambda x: clean_resume(x))" + ], + "metadata": { + "id": "tAQj28-z-rPO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "resumes = resumes[\"Resume\"].tolist()" + ], + "metadata": { + "id": "14d8JRdvFRgc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Preprop_resumes = Preprop_resumes.tolist()" + ], + "metadata": { + "id": "UJgKODhmCeQc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", + "\n", + "model= 'DeroG/mpnet_new'\n", + "tokenizer = AutoTokenizer.from_pretrained(model)\n", + "model = AutoModelForMaskedLM.from_pretrained(model)" + ], + "metadata": { + "id": "ML9r4WqyvymZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer\n", + "import torch\n", + "from tqdm import tqdm\n", + "\n", + "def embed_resumes_with_progress(model, tokenizer, resumes, chunk_size=200):\n", + " \"\"\"\n", + " Embeds a list of resumes using the SentenceTransformer model with chunking and progress bar.\n", + "\n", + " Args:\n", + " model: The SentenceTransformer model.\n", + " tokenizer: The Hugging Face Tokenizer for text pre-processing.\n", + " resumes: A list of preprocessed resumes.\n", + " chunk_size: Maximum number of tokens per chunk (default: 200).\n", + "\n", + " Returns:\n", + " A numpy array containing the averaged embeddings for each resume.\n", + " \"\"\"\n", + " resume_embeddings = []\n", + "\n", + " with tqdm(total=len(Preprop_resumes)) 
as pbar:\n", + " for resume in Preprop_resumes:\n", + " encoded_chunks = []\n", + " chunks = [resume[i:i+chunk_size] for i in range(0, len(resume), chunk_size)]\n", + " for chunk in chunks:\n", + " encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors=\"pt\")\n", + " with torch.no_grad():\n", + " chunk_embedding = model(**encoded_chunk)[0]\n", + " attention_mask = encoded_chunk[\"attention_mask\"]\n", + " encoded_chunks.append(chunk_embedding)\n", + "\n", + " # Concatenate the encoded chunks\n", + " concatenated_chunks = torch.cat(encoded_chunks, dim=1)\n", + " resume_embedding = torch.mean(concatenated_chunks, dim=1)\n", + " resume_embeddings.append(resume_embedding)\n", + "\n", + " pbar.update(1)\n", + "\n", + " return torch.cat(resume_embeddings)" + ], + "metadata": { + "id": "IWhRAIEGznBW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Get resume embeddings\n", + "import torch\n", + "resume_embeddings = embed_resumes_with_progress(model, tokenizer, Preprop_resumes)\n", + "\n", + "# Access individual embedding\n", + "first_resume_embedding = resume_embeddings[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cefhi-6uV0u0", + "outputId": "fccef929-1f33-4c1f-a361-bf5dd9cab61c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 9/9 [00:22<00:00, 2.51s/it]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "resume_embeddings.shape" + ], + "metadata": { + "id": "FlG8BH8nG4Ys", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "409cb351-39df-4cff-94c8-7d658c3b558c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "torch.Size([9, 30527])" + ] + }, + "metadata": {}, + "execution_count": 184 + } + ] + }, + { + "cell_type": "code", + "source": [ + "job_description = \"\"\"\n", + "\n", + "Boyave\n", + "Full-Time\n", + "Description\n", + "Content Creator\n", + "Job brief\n", + "We are looking for a Content Creator to write and publish various types of pieces for our company’s web pages, like articles, ebooks and social media posts.\n", + "Content Creator responsibilities include producing marketing copy to advertise our products, writing blog posts about industry-related topics and promoting our content on social media. To be successful in this role, you should have experience with digital publishing and generating traffic and leads for new business. Please share samples of your work (portfolio or links to published articles) along with your application.\n", + "Ultimately, you will help us reach our target audience by delivering both useful and appealing online information about our company and products.\n", + "Responsibilities\n", + "•\tResearch industry-related topics\n", + "•\tPrepare well-structured drafts using digital publishing platforms\n", + "•\tCreate and distribute marketing copy to advertise our company and products\n", + "•\tInterview industry professionals and incorporate their views in blog posts\n", + "•\tEdit and proofread written pieces before publication\n", + "•\tConduct keyword research and use SEO guidelines to optimize content\n", + "•\tPromote content on social networks and monitor engagement (e.g. 
comments and shares)\n", + "•\tIdentify customers’ needs and recommend new topics\n", + "•\tCoordinate with marketing and design teams to illustrate articles\n", + "•\tMeasure web traffic to content (e.g. conversion and bounce rates)\n", + "•\tUpdate our websites as needed\n", + "Requirements and skills\n", + "•\tProven work experience as a Content Creator, Copywriter or similar role\n", + "•\tPortfolio of published articles\n", + "•\tHands-on experience with Content Management Systems (e.g. WordPress)\n", + "•\tExcellent writing and editing skills in English\n", + "•\tAn ability to fact-check long-form content pieces\n", + "•\tTime-management skills\n", + "•\tFamiliarity with SEO\n", + "•\tBSc in Marketing, English, Journalism or relevant field\"\"\"" + ], + "metadata": { + "id": "9rePs2i2rE8Y" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def clean_JD(JD):\n", + " \"\"\"\n", + " Preprocesses the provided JD by:\n", + " - Lowercasing all text\n", + " - Removing punctuation\n", + " - Removing stop words and punctuation and sympols\n", + " \"\"\"\n", + " JD = JD.lower()\n", + " JD = re.sub(r\"[^\\w\\s]\", \"\", JD)\n", + " stop_words = stopwords.words(\"english\")\n", + " filtered_words = [word for word in JD.split() if word not in stop_words]\n", + " cleaned_JD = \" \".join(filtered_words)\n", + " return cleaned_JD" + ], + "metadata": { + "id": "fV2s-58itieE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "cleaned_job_description = clean_JD(job_description)\n", + "print(\"Cleaned Job Description:\", cleaned_job_description)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W5Wg84rU5V_b", + "outputId": "caee6213-0826-4535-fa7f-e684c91c622a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cleaned Job Description: boyave fulltime description content creator job brief looking content creator write publish various types pieces companys web pages like articles ebooks social media posts content creator responsibilities include producing marketing copy advertise products writing blog posts industryrelated topics promoting content social media successful role experience digital publishing generating traffic leads new business please share samples work portfolio links published articles along application ultimately help us reach target audience delivering useful appealing online information company products responsibilities research industryrelated topics prepare wellstructured drafts using digital publishing platforms create distribute marketing copy advertise company products interview industry professionals incorporate views blog posts edit proofread written pieces publication conduct keyword research use seo guidelines optimize content promote content social networks monitor engagement eg comments shares identify customers needs recommend new topics coordinate marketing design teams illustrate articles measure web traffic content eg conversion bounce rates update websites needed requirements skills proven work experience content creator copywriter similar role portfolio published articles handson experience content management systems eg wordpress excellent writing editing skills english ability factcheck longform content pieces timemanagement skills familiarity seo bsc marketing english journalism relevant field\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers 
import AutoTokenizer\n", + "from tqdm import tqdm\n", + "\n", + "\n", + "def embed_JD_with_progress(model, tokenizer, cleaned_job_description, chunk_size=200):\n", + " \"\"\"\n", + " Embeds a job description using the SentenceTransformer model with chunking and progress bar.\n", + "\n", + " Args:\n", + " model: The SentenceTransformer model.\n", + " tokenizer: The Hugging Face Tokenizer for text pre-processing.\n", + " cleaned_job_description: A preprocessed job description string.\n", + " chunk_size: Maximum number of tokens per chunk (default: 200).\n", + "\n", + " Returns:\n", + " A numpy array containing the embedding for the job description.\n", + " \"\"\"\n", + "\n", + " encoded_chunks = []\n", + " chunks = [cleaned_job_description[i:i+chunk_size] for i in range(0, len(cleaned_job_description), chunk_size)]\n", + "\n", + " with tqdm(total=len(chunks), desc=\"Embedding Job Description\") as pbar:\n", + " for chunk in chunks:\n", + " encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors=\"pt\")\n", + " with torch.no_grad():\n", + " chunk_embedding = model(**encoded_chunk)[0]\n", + " attention_mask = encoded_chunk[\"attention_mask\"]\n", + " encoded_chunks.append(chunk_embedding)\n", + " pbar.update(1)\n", + "\n", + " concatenated_chunks = torch.cat(encoded_chunks, dim=1)\n", + " JD_embeddings = torch.mean(concatenated_chunks, dim=1)\n", + " return JD_embeddings.cpu().numpy()\n" + ], + "metadata": { + "id": "DUy5qjVk6_-l" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Get resume embeddings\n", + "import torch\n", + "JD_embeddings = embed_JD_with_progress(model, tokenizer, cleaned_job_description)\n", + "\n", + "# Access individual embedding\n", + "first_JD_embedding = JD_embeddings[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PBdXLtj_rFDn", + "outputId": "cb1b7d79-1fdc-4d84-c29e-fc3b7b231480" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Embedding Job Description: 100%|██████████| 8/8 [00:01<00:00, 4.79it/s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "JD_embeddings.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LtcX_V638Qjh", + "outputId": "cfc786df-4c33-42b6-f421-b45d7c014d36" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(1, 30527)" + ] + }, + "metadata": {}, + "execution_count": 190 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Access individual embedding\n", + "first_JD_embedding = JD_embeddings[0]" + ], + "metadata": { + "id": "28E62owWtKE9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "JD_embeddings" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yoFomYiXtKG9", + "outputId": "26c5e508-b26b-414d-c40b-e764dabdc470" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[ 2.4844365, 0.6637995, 7.125794 , ..., -1.8927242, -3.2216737,\n", + " 3.1794276]], dtype=float32)" + ] + }, + "metadata": {}, + "execution_count": 192 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def similarity_percentage(similarity_score):\n", + " if similarity_score < 0.2:\n", + " return 0\n", + " elif 0.2 <= similarity_score < 0.3:\n", + " return similarity_score - 0.25\n", + " elif 0.3 <= 
similarity_score < 0.4:\n", + "        return similarity_score - 0.23\n", + "    elif 0.4 <= similarity_score < 0.55:\n", + "        return similarity_score - 0.19\n", + "    elif 0.55 <= similarity_score < 0.65:\n", + "        return similarity_score - 0.14\n", + "    else:\n", + "        return similarity_score - 0.1" + ], + "metadata": { + "id": "UG_zHu2iudVi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def create_hnsw_index(embeddings, max_elements, ef_construction, M, ef):\n", + "    \"\"\"\n", + "    Creates and initializes an hnswlib index with the specified parameters.\n", + "\n", + "    Args:\n", + "        embeddings: A 2-D array of embedding vectors.\n", + "        max_elements: Maximum number of elements to store in the index.\n", + "        ef_construction: Size of the dynamic candidate list used while building the index.\n", + "        M: Maximum number of connections per node in the HNSW graph.\n", + "        ef: Size of the dynamic candidate list used during search.\n", + "\n", + "    Returns:\n", + "        An hnswlib index object.\n", + "    \"\"\"\n", + "    embedding_size = embeddings.shape[1]  # 30527 here (vocabulary-sized vectors)\n", + "    index = hnswlib.Index(space='cosine', dim=embedding_size)\n", + "    # Use keyword arguments: hnswlib's positional order is (max_elements, M, ef_construction),\n", + "    # so passing them positionally would silently swap M and ef_construction.\n", + "    index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M)\n", + "    index.add_items(embeddings, list(range(len(embeddings))))\n", + "    index.set_ef(ef)\n", + "    return index" + ], + "metadata": { + "id": "phQs8On6udc2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Define parameters for the index\n", + "max_elements = len(resume_embeddings)\n", + "ef_construction = 2000\n", + "M = 200\n", + "ef = 50\n", + "index = create_hnsw_index(resume_embeddings, max_elements, ef_construction, M, ef)\n", + "print(\"Corpus loaded with {} resumes / embeddings\".format(len(resume_embeddings)))\n", + "\n", + "# Retrieve resumes based on job description\n", + "take_k_hits = int(input(\"\\nHow many top resumes do you want to be retrieved?\\n\\n\"))\n", + "\n", + "start_time = time.time()\n", + "\n", + "resume_ids, dist = index.knn_query(JD_embeddings, take_k_hits)\n", + "\n", + "# Convert cosine distances to similarities, apply the adjustment, and rank\n", + "hits = [{'resume_id': rid, 'Original_Score': 1 - score, 'Adjusted_Score': similarity_percentage(1 - score)} for rid, score in zip(resume_ids[0], dist[0])]\n", + "hits = sorted(hits, key=lambda x: x['Adjusted_Score'], reverse=True)\n", + "\n", + "end_time = time.time()\n", + "\n", + "print(\"Results (after {:.3f} seconds):\".format(end_time - start_time))\n", + "\n", + "# Create a DataFrame with original and adjusted similarity scores\n", + "Resumeranking = pd.DataFrame(hits[:take_k_hits])\n", + "Resumeranking['Resumes'] = Resumeranking['resume_id'].map(lambda x: resumes[x])\n", + "Resumeranking = Resumeranking.drop(['resume_id'], axis=1)\n", + "\n", + "# Convert Adjusted_Score to percentage format\n", + "Resumeranking['Adjusted_Score'] = (Resumeranking['Adjusted_Score'] * 100).round(2)\n", + "Resumeranking['Adjusted_Score'] = Resumeranking['Adjusted_Score'].astype(str) + '%'\n", + "\n", + "Resumeranking = Resumeranking[['Resumes', 'Original_Score', 'Adjusted_Score']]\n", + "Resumeranking" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 436 + }, + "id": "AlHtoaZ_udeq", + "outputId": "e769c65e-25f6-434f-df77-1e49efe9888b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Corpus loaded with 9 resumes / embeddings\n", + "\n", + "How many top 
resumes do you want to be retrieved?\n", + "\n", + "9\n", + "Results (after 0.001 seconds):\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Resumes Original_Score \\\n", + "0 First Last Advertising Copy Writer Bay Area Ca... 0.980730 \n", + "1 BENTLI FALLA CAREER OBJECTIVE Creative and wit... 0.975105 \n", + "2 CESAR CABAL Financial Analyst ccabal43emailcom... 0.965548 \n", + "3 First Last Digital Marketing Manager WORK EXPE... 0.962464 \n", + "4 CHELSEY DEGA Marketing Manager chelseydegaemai... 0.959740 \n", + "5 Reinhardt Konig Human Resources Intern Driven ... 0.958368 \n", + "6 DRISTAN ARTHUR BUDGET ANALYST CONTACT darthure... 0.954127 \n", + "7 Selamawit Yemane Customer Service Manager sela... 0.953998 \n", + "8 OBJECTIVE To impart my knowledge in Veterinary... 0.931807 \n", + "\n", + " Adjusted_Score \n", + "0 88.07% \n", + "1 87.51% \n", + "2 86.55% \n", + "3 86.25% \n", + "4 85.97% \n", + "5 85.84% \n", + "6 85.41% \n", + "7 85.4% \n", + "8 83.18% " + ], + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 203 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Resumeranking['Resumes'][0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 174 + }, + "id": "q3iJMfuAudia", + "outputId": "457f5aa8-b1ea-4d12-9701-5bfdf7e69585" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'First Last Advertising Copy Writer Bay Area California +1234456789 professionalemailresumewordedcom linkedincominusername Advertising Copy Writer with eight years experience working with art directors to craft meaningful content for marketing and sales campaigns for over 75 clients to meet their advertising goals Supervised up to 10 junior copywriters and interns to assist in sharpening their writing skills to create content that delights its readers Acquired the Most Impactful Omnichannel Campaign Award for creating a crossplatform campaign concept that garnered over two million YouTube views and increased the clients customer base by 47 RELEVANT WORK EXPERIENCE Resume Worded New York NY 2015 Present Advertising Copy Writer 2017 Present Ranked number one in the top quartile of 50 copywriters for productivity metrics The metrics included the number of ads completed as per the brief within the scheduled timeframe with the least rewrites Consulted with eight department leads including Art Buying Merchandising Print Production and Social Media Management regarding print and graphic techniques to be considered when crafting the content Developed a $3M display for professionalgrade telescopes that increased category revenue by 50 Advertising Copy Writer 2015 2017 Developed copy for 10 omnichannel advertising campaigns for an international client while working remotely with a creative director and art director in London Collaborated with an art director on five awardwinning print advertisements that resulted in a 35 increase in client revenue and secured two new creative projects Growthsi San Francisco CA 2013 2015 Junior Copywriter Created social media strategies and rolled out 32 social media posts for four clients monthly which resulted in increased views likes and shares Optimized dated content across three websites to make it SEOfriendly by using the correct keywords and prioritizing highquality inbound and outbound links Resume Worded Exciting Company San Francisco CA 2011 2013 Copywriting Intern Started a weekly knowledgesharing Meetup for the companys 30 copywriting interns which resulted in 50 of the cohort being permanently employed Managed social media platforms for an advertising agency using TikTok Pinterest Facebook Instagram and Twitter and successfully increased engagement by 65 Collaborated with the other creative interns to brainstorm pitch ideas for dream clients which included names logos media content etc which resulted in two successful client pitches EDUCATION Resume Worded University New York NY Bachelor of Arts Journalism and Media Studies SKILLS Technical Skills Social Media WordPress Digital Marketing Search Engine Optimization SEO Languages English Native Portuguese Fluent Arabic Conversational'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 204 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Resumeranking['Resumes'][1]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 261 + }, + "id": "etbBaUe3udjz", + 
"outputId": "44b96e1a-4b77-4b79-f1fa-d3d8a5db3cfe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'BENTLI FALLA CAREER OBJECTIVE Creative and witty media mind with 3 years in professional social Social Media Content media roles Looking to use my knowledgebase of platforms and Creator trends which have generated trending posts with over 2M+ views to create captivating content for a company like Blueland = bentlifallaemailcom 123 4567890 o 125 WORK EXPERIENCE Q Las Vegas NV ff LinkedIn Social Media Content Moderator Teleperformance September 2018 current Las Vegas NV EDUCATION e Navigated 5+ social media platforms posting and engaging with content to increase followers by 22 since 2019 BS Maintained strong grasp of English utilizing grammatically Communication accurate sentences to answer 100+ questions per shift University of Nevada Analyzed media inquiries and determined how to address August 2014 May 2018 customer issues within 2 hours of posting Updated knowledge of software systems including Office Achieved 55 WPM responding to chats in 45 seconds Las Vegas NV Social Media Manager Intern Blue Ocean Digital Partners May 2018 September 2018 Las Vegas NV e Developed skills in SEO content creation KPIs and trend predictions Made engagement strategies presenting to 40+ colleagues Worked with Social Media Manager to boast lead generation creating 3 funnels resulting in a 7 increase in contact form submissions Generated social media presence on TikTok in 2018 rooting the company in the app before it was trending Maintained trending knowledgebase and strong understanding of 8 social media platforms including YouTube Facebook and TikTok SKILLS Instagram Twitter Facebook Pinterest YouTube TikTok Google Analytics SEO Paid social media advertising Attention to Detail'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 199 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Resumeranking['Resumes'][2]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 400 + }, + "id": "LGl_LxhEudnm", + "outputId": "353c6ac5-f8de-47ae-e798-513b960c2a93" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'CESAR CABAL Financial Analyst ccabal43emailcom 123 4567890 Atlanta GA LinkedIn EDUCATION Masters Business Administration University of Georgia 2014 2016 Atlanta GA Bachelor of Science Finance University of Georgia 2010 2014 Atlanta GA SKILLS MS Excel Original Research Budget Development Project Management Data Analysis MA CAREER OBJECTIVE Detailoriented financial analyst with 6 years in quantitative statistical analysis budgeting accounting and forecasting Leveraging strong analytical skills to support operations through robust modeling to facilitate executivelevel decisionmaking and increase company revenue Quickly adapt to new technologies and attaining CPA licensure to become an indispensable asset to Logistics Property WORK EXPERIENCE Financial Analyst King Spalding October 2018 current Atlanta GA Update daily cash position through analysis investigation and reporting on key movements and trends in the PL lines Provide financial guidance to BusinessFunctions and assist in decisionmaking contributing to a 13 growth trend by developing strategic longrange planning recommendations for management Monitor regulatory developments and industry trends to 
facilitate incorporation into the firms AML program Perform a comprehensive analysis of financial issues debt and in depth market share and industry report that increased market share by 19 thereby increasing revenue by $12M Managed 3 financial statements with advanced layering of discounted cash flow analysis and internal planning models linked to automation tools that decreased manual admin tasks by 48 Financial Analyst Barnum Financial Group May 2016 October 2018 Atlanta GA Designed and created weekly and monthly comprehensive spending reports abstracts and charts to present data and guide investment strategies and performed adhoc analysis and reporting Improved operational efficiency of finance systems by 11 through implementation of streamlined datamanagement procedures Coordinated with underwriters lenders and loan managers to manage portfolios for multimilliondollar accounts Monitored the fund and equity investments including inflows outflows valuations risk ratings and record maintenance Established new forecasting tracking and management reporting system to improve availability and accuracy of financial data triggering a 14 increase in accuracy Financial Analysis Intern RaceTrac January 2016 April 2016 Atlanta GA Improved reporting process time by 20 with ad hoc analyses Created financial models and contributed findings to management to support initiatives for internal customers'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 200 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Resumeranking['Resumes'][4]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 313 + }, + "id": "EbJ-XIMXudpM", + "outputId": "66f0d93b-3385-471c-8f64-8bb2a3f92822" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'CHELSEY DEGA Marketing Manager chelseydegaemailcom 123 4567890 Brooklyn NY LinkedIn EDUCATION BS Marketing University of Pittsburgh September 2010 April 2014 Pittsburgh PA SKILLS HubSpot Salesforce Microsoft Excel Word PowerPoint Paid Ads Facebook Google LinkedIn retargeting AB testing audience segmentation Google Analytics SEO WORK EXPERIENCE Marketing Manager HADASSAH May 2018 current New York NY Directed the launch of a campaign for a new platform resulting in revenue of $53M in the first year Created a holistic paid acquisition strategy ultimately leading to an ROI of 41 for every dollar spent Built out a culture of robust data collection and AB testing to iteratively improve campaign performance leading to an average improvement of 64 from campaign start to end Developed partnerships with higher education institutions in the US resulting in an incremental $74M in revenue Exceeded sales targets by 32 for the full year in 2019 Identified vendors who were underperforming leading to a reduction in costs of $425000 while exceeding revenue targets Oversaw a team of 5 fulltime marketers and 4 paid contractors Marketing Manager Fora Financial August 2016 May 2018 New York NY Developed a comprehensive paid acquisition strategy across Google Facebook and industry newsletters resulting in new leads that generated $18M in 2017 Built a robust brand awareness campaign through conferences and speaking engagements leading to an increase in inbound leads of 68 year over year Led the implementation of realtime reporting on marketing spend to adjust bid strategy leading to an improvement of ROI by 22 Exceeded 
growth targets every quarter by 23 on average Managed a team of 4 fulltime marketing associates Marketing Analyst Insight Global August 2014 August 2016 Washington DC Created AB testing plan for Facebook ad copy leading to an improvement in ROI of 12 Built key reports in Tableau for executive team around KPIs such as marketing spend new leads revenue generates and ROI saving 9 hours of manual reporting each week'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 201 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Resumeranking['Resumes'][8]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 278 + }, + "id": "B6MH8RmjHEjE", + "outputId": "cb5aebaf-76b5-45ec-f482-52f9ad507a7a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'OBJECTIVE To impart my knowledge in Veterinary medicine and provide quality healthcare to animals My aim is to sustain animal life by administering immunization to guard them against diseases PERSONAL INFORMATION Brian Avelar 322 Dogwood Lane Tucson AZ 85712 7778844221 bavelarsampleresumenet Date of Birth May 6 1979 Place of Birth PA Citizenship American Gender Male PROFILE SUMMARY Knowledgeable in toxicology and laboratory animal medicines Great skills in interpreting animal behavior Pathology EDUCATION Doctor of Veterinary Medicine 2009 Polytechnic Institute of New York University Brooklyn BS in Biology 2006 Polytechnic Institute of New York University Brooklyn EMPLOYMENT HISTORY Pharmaceutical Veterinarian 2007 Present AstraZeneca Pharmaceuticals LP Wilmington DE Responsibilities Administered immunization to animals to protect them from diseases Studied cause of animal diseases Reviewed the list of raw materials to be used and mixed in the vaccine Tested efficacy and effectiveness of vaccines to dying animals Inspected the meat products given to animals as food Developed vaccines for animals City Veterinarian 2006 2007 Private Practice Responsibilities Prescribed medicines for animals Provided daily menu for sick animals Immunized the cattle in MAGV Farm Educated the pet owners on proper animal bathing and feeding Demonstrated to clients the administration of vaccine to their pets RESEARCH Journey to Animal Life TRAININGCERTIFICATION Certificate in Animal Pathology AWARD EnvironmentFriendly Research PROFESSIONAL MEMBERSHIP American Animal Hospital Association American Veterinary Medical Association SKILLS Great communication skills Strong familiarity with animal diseases Ability to tame uncooperative animals'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 205 + } + ] + },
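+ { + "cell_type": "code", + "source": [ + "# Hedged sketch, not part of the original notebook run: preview every resume\n", + "# at once instead of printing one hand-picked index per cell as above.\n", + "# Assumes Resumeranking['Resumes'] is the same indexable sequence of resume\n", + "# strings used in the preceding cells; the 80-character preview is arbitrary.\n", + "for i, resume in enumerate(Resumeranking['Resumes']):\n", + "    print(f'[{i}] {len(resume):>5} chars | {resume[:80]}')" + ], + "metadata": { + "id": "PxzWAz19HF6j" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file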