DeroG committed on
Commit
73964a3
1 Parent(s): 9c47bcd

Upload 7 files

Browse files
Untitled153.ipynb ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 21,
22
+ "metadata": {
23
+ "colab": {
24
+ "base_uri": "https://localhost:8080/"
25
+ },
26
+ "id": "H-2L-S6b4ukm",
27
+ "outputId": "12789315-f584-4d98-afd4-2bd35d0453d9"
28
+ },
29
+ "outputs": [
30
+ {
31
+ "output_type": "stream",
32
+ "name": "stdout",
33
+ "text": [
34
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n",
35
+ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.15.0)\n",
36
+ "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.19.4)\n",
37
+ "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n",
38
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
39
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n",
40
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n",
41
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
42
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
43
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
44
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n",
45
+ "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n",
46
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n",
47
+ "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (10.0.1)\n",
48
+ "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
49
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n",
50
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n",
51
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
52
+ "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n",
53
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n",
54
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.1)\n",
55
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.5.0)\n",
56
+ "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.1.0+cu121)\n",
57
+ "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.0+cu121)\n",
58
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n",
59
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n",
60
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n",
61
+ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n",
62
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n",
63
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n",
64
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
65
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n",
66
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
67
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
68
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
69
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
70
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
71
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n",
72
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n",
73
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.2.1)\n",
74
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n",
75
+ "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.1.0)\n",
76
+ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.7)\n",
77
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n",
78
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
79
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n",
80
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n",
81
+ "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n",
82
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
83
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n",
84
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n"
85
+ ]
86
+ }
87
+ ],
88
+ "source": [
89
+ "pip install transformers datasets huggingface_hub sentence-transformers"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "source": [
95
+ "import re\n",
96
+ "import nltk\n",
97
+ "from nltk.corpus import stopwords\n",
98
+ "import torch\n",
99
+ "from torch.utils.data import DataLoader, TensorDataset\n",
100
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n",
101
+ "import pandas as pd\n",
102
+ "from tqdm import tqdm"
103
+ ],
104
+ "metadata": {
105
+ "id": "Jk533_F14yV8"
106
+ },
107
+ "execution_count": 22,
108
+ "outputs": []
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "source": [
113
+ "# Load your unlabeled dataset\n",
114
+ "resumes = pd.read_csv('/content/resumes6000.csv')"
115
+ ],
116
+ "metadata": {
117
+ "id": "IR-KIxHd5iyu"
118
+ },
119
+ "execution_count": 23,
120
+ "outputs": []
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "source": [
125
+ "resumes.head(5)"
126
+ ],
127
+ "metadata": {
128
+ "colab": {
129
+ "base_uri": "https://localhost:8080/",
130
+ "height": 206
131
+ },
132
+ "id": "Y0sgNBwr5mzH",
133
+ "outputId": "9728d843-eef7-4719-c9ed-418155127788"
134
+ },
135
+ "execution_count": 24,
136
+ "outputs": [
137
+ {
138
+ "output_type": "execute_result",
139
+ "data": {
140
+ "text/plain": [
141
+ " Resumes\n",
142
+ "0 Global Sales Administrator Biamp Systems Globa...\n",
143
+ "1 Python Developer - Sprint 8 years of experien...\n",
144
+ "2 IT Project Manager - Scrum Master of Digital ...\n",
145
+ "3 UI Front End Developer UI <span class=\"hl\">Fro...\n",
146
+ "4 IT Security Analyst Camp Hill, PA Work Experie..."
147
+ ],
148
+ "text/html": [
149
+ "\n",
150
+ " <div id=\"df-4103e9a9-d2f4-4f6d-a97a-5a5ae9a6a217\" class=\"colab-df-container\">\n",
151
+ " <div>\n",
152
+ "<style scoped>\n",
153
+ " .dataframe tbody tr th:only-of-type {\n",
154
+ " vertical-align: middle;\n",
155
+ " }\n",
156
+ "\n",
157
+ " .dataframe tbody tr th {\n",
158
+ " vertical-align: top;\n",
159
+ " }\n",
160
+ "\n",
161
+ " .dataframe thead th {\n",
162
+ " text-align: right;\n",
163
+ " }\n",
164
+ "</style>\n",
165
+ "<table border=\"1\" class=\"dataframe\">\n",
166
+ " <thead>\n",
167
+ " <tr style=\"text-align: right;\">\n",
168
+ " <th></th>\n",
169
+ " <th>Resumes</th>\n",
170
+ " </tr>\n",
171
+ " </thead>\n",
172
+ " <tbody>\n",
173
+ " <tr>\n",
174
+ " <th>0</th>\n",
175
+ " <td>Global Sales Administrator Biamp Systems Globa...</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>1</th>\n",
179
+ " <td>Python Developer - Sprint 8 years of experien...</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>2</th>\n",
183
+ " <td>IT Project Manager - Scrum Master of Digital ...</td>\n",
184
+ " </tr>\n",
185
+ " <tr>\n",
186
+ " <th>3</th>\n",
187
+ " <td>UI Front End Developer UI &lt;span class=\"hl\"&gt;Fro...</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>4</th>\n",
191
+ " <td>IT Security Analyst Camp Hill, PA Work Experie...</td>\n",
192
+ " </tr>\n",
193
+ " </tbody>\n",
194
+ "</table>\n",
195
+ "</div>\n",
196
+ " <div class=\"colab-df-buttons\">\n",
197
+ "\n",
198
+ " <div class=\"colab-df-container\">\n",
199
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4103e9a9-d2f4-4f6d-a97a-5a5ae9a6a217')\"\n",
200
+ " title=\"Convert this dataframe to an interactive table.\"\n",
201
+ " style=\"display:none;\">\n",
202
+ "\n",
203
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
204
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
205
+ " </svg>\n",
206
+ " </button>\n",
207
+ "\n",
208
+ " <style>\n",
209
+ " .colab-df-container {\n",
210
+ " display:flex;\n",
211
+ " gap: 12px;\n",
212
+ " }\n",
213
+ "\n",
214
+ " .colab-df-convert {\n",
215
+ " background-color: #E8F0FE;\n",
216
+ " border: none;\n",
217
+ " border-radius: 50%;\n",
218
+ " cursor: pointer;\n",
219
+ " display: none;\n",
220
+ " fill: #1967D2;\n",
221
+ " height: 32px;\n",
222
+ " padding: 0 0 0 0;\n",
223
+ " width: 32px;\n",
224
+ " }\n",
225
+ "\n",
226
+ " .colab-df-convert:hover {\n",
227
+ " background-color: #E2EBFA;\n",
228
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
229
+ " fill: #174EA6;\n",
230
+ " }\n",
231
+ "\n",
232
+ " .colab-df-buttons div {\n",
233
+ " margin-bottom: 4px;\n",
234
+ " }\n",
235
+ "\n",
236
+ " [theme=dark] .colab-df-convert {\n",
237
+ " background-color: #3B4455;\n",
238
+ " fill: #D2E3FC;\n",
239
+ " }\n",
240
+ "\n",
241
+ " [theme=dark] .colab-df-convert:hover {\n",
242
+ " background-color: #434B5C;\n",
243
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
244
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
245
+ " fill: #FFFFFF;\n",
246
+ " }\n",
247
+ " </style>\n",
248
+ "\n",
249
+ " <script>\n",
250
+ " const buttonEl =\n",
251
+ " document.querySelector('#df-4103e9a9-d2f4-4f6d-a97a-5a5ae9a6a217 button.colab-df-convert');\n",
252
+ " buttonEl.style.display =\n",
253
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
254
+ "\n",
255
+ " async function convertToInteractive(key) {\n",
256
+ " const element = document.querySelector('#df-4103e9a9-d2f4-4f6d-a97a-5a5ae9a6a217');\n",
257
+ " const dataTable =\n",
258
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
259
+ " [key], {});\n",
260
+ " if (!dataTable) return;\n",
261
+ "\n",
262
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
263
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
264
+ " + ' to learn more about interactive tables.';\n",
265
+ " element.innerHTML = '';\n",
266
+ " dataTable['output_type'] = 'display_data';\n",
267
+ " await google.colab.output.renderOutput(dataTable, element);\n",
268
+ " const docLink = document.createElement('div');\n",
269
+ " docLink.innerHTML = docLinkHtml;\n",
270
+ " element.appendChild(docLink);\n",
271
+ " }\n",
272
+ " </script>\n",
273
+ " </div>\n",
274
+ "\n",
275
+ "\n",
276
+ "<div id=\"df-a376dc72-fa58-4744-913c-c4534b40ab5d\">\n",
277
+ " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-a376dc72-fa58-4744-913c-c4534b40ab5d')\"\n",
278
+ " title=\"Suggest charts\"\n",
279
+ " style=\"display:none;\">\n",
280
+ "\n",
281
+ "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
282
+ " width=\"24px\">\n",
283
+ " <g>\n",
284
+ " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
285
+ " </g>\n",
286
+ "</svg>\n",
287
+ " </button>\n",
288
+ "\n",
289
+ "<style>\n",
290
+ " .colab-df-quickchart {\n",
291
+ " --bg-color: #E8F0FE;\n",
292
+ " --fill-color: #1967D2;\n",
293
+ " --hover-bg-color: #E2EBFA;\n",
294
+ " --hover-fill-color: #174EA6;\n",
295
+ " --disabled-fill-color: #AAA;\n",
296
+ " --disabled-bg-color: #DDD;\n",
297
+ " }\n",
298
+ "\n",
299
+ " [theme=dark] .colab-df-quickchart {\n",
300
+ " --bg-color: #3B4455;\n",
301
+ " --fill-color: #D2E3FC;\n",
302
+ " --hover-bg-color: #434B5C;\n",
303
+ " --hover-fill-color: #FFFFFF;\n",
304
+ " --disabled-bg-color: #3B4455;\n",
305
+ " --disabled-fill-color: #666;\n",
306
+ " }\n",
307
+ "\n",
308
+ " .colab-df-quickchart {\n",
309
+ " background-color: var(--bg-color);\n",
310
+ " border: none;\n",
311
+ " border-radius: 50%;\n",
312
+ " cursor: pointer;\n",
313
+ " display: none;\n",
314
+ " fill: var(--fill-color);\n",
315
+ " height: 32px;\n",
316
+ " padding: 0;\n",
317
+ " width: 32px;\n",
318
+ " }\n",
319
+ "\n",
320
+ " .colab-df-quickchart:hover {\n",
321
+ " background-color: var(--hover-bg-color);\n",
322
+ " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
323
+ " fill: var(--button-hover-fill-color);\n",
324
+ " }\n",
325
+ "\n",
326
+ " .colab-df-quickchart-complete:disabled,\n",
327
+ " .colab-df-quickchart-complete:disabled:hover {\n",
328
+ " background-color: var(--disabled-bg-color);\n",
329
+ " fill: var(--disabled-fill-color);\n",
330
+ " box-shadow: none;\n",
331
+ " }\n",
332
+ "\n",
333
+ " .colab-df-spinner {\n",
334
+ " border: 2px solid var(--fill-color);\n",
335
+ " border-color: transparent;\n",
336
+ " border-bottom-color: var(--fill-color);\n",
337
+ " animation:\n",
338
+ " spin 1s steps(1) infinite;\n",
339
+ " }\n",
340
+ "\n",
341
+ " @keyframes spin {\n",
342
+ " 0% {\n",
343
+ " border-color: transparent;\n",
344
+ " border-bottom-color: var(--fill-color);\n",
345
+ " border-left-color: var(--fill-color);\n",
346
+ " }\n",
347
+ " 20% {\n",
348
+ " border-color: transparent;\n",
349
+ " border-left-color: var(--fill-color);\n",
350
+ " border-top-color: var(--fill-color);\n",
351
+ " }\n",
352
+ " 30% {\n",
353
+ " border-color: transparent;\n",
354
+ " border-left-color: var(--fill-color);\n",
355
+ " border-top-color: var(--fill-color);\n",
356
+ " border-right-color: var(--fill-color);\n",
357
+ " }\n",
358
+ " 40% {\n",
359
+ " border-color: transparent;\n",
360
+ " border-right-color: var(--fill-color);\n",
361
+ " border-top-color: var(--fill-color);\n",
362
+ " }\n",
363
+ " 60% {\n",
364
+ " border-color: transparent;\n",
365
+ " border-right-color: var(--fill-color);\n",
366
+ " }\n",
367
+ " 80% {\n",
368
+ " border-color: transparent;\n",
369
+ " border-right-color: var(--fill-color);\n",
370
+ " border-bottom-color: var(--fill-color);\n",
371
+ " }\n",
372
+ " 90% {\n",
373
+ " border-color: transparent;\n",
374
+ " border-bottom-color: var(--fill-color);\n",
375
+ " }\n",
376
+ " }\n",
377
+ "</style>\n",
378
+ "\n",
379
+ " <script>\n",
380
+ " async function quickchart(key) {\n",
381
+ " const quickchartButtonEl =\n",
382
+ " document.querySelector('#' + key + ' button');\n",
383
+ " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
384
+ " quickchartButtonEl.classList.add('colab-df-spinner');\n",
385
+ " try {\n",
386
+ " const charts = await google.colab.kernel.invokeFunction(\n",
387
+ " 'suggestCharts', [key], {});\n",
388
+ " } catch (error) {\n",
389
+ " console.error('Error during call to suggestCharts:', error);\n",
390
+ " }\n",
391
+ " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
392
+ " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
393
+ " }\n",
394
+ " (() => {\n",
395
+ " let quickchartButtonEl =\n",
396
+ " document.querySelector('#df-a376dc72-fa58-4744-913c-c4534b40ab5d button');\n",
397
+ " quickchartButtonEl.style.display =\n",
398
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
399
+ " })();\n",
400
+ " </script>\n",
401
+ "</div>\n",
402
+ "\n",
403
+ " </div>\n",
404
+ " </div>\n"
405
+ ]
406
+ },
407
+ "metadata": {},
408
+ "execution_count": 24
409
+ }
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "source": [
415
+ "# Define the function for cleaning text\n",
416
+ "def clean_text(text):\n",
417
+ " return re.sub(r\"<span class=\\\"hl\\\">(.*?)</span>\", r\"\\1\", text)\n",
418
+ "# Apply the function to the entire column\n",
419
+ "resumes['Resumes'] = resumes['Resumes'].apply(clean_text)"
420
+ ],
421
+ "metadata": {
422
+ "id": "MrCrvWv65nAw"
423
+ },
424
+ "execution_count": 26,
425
+ "outputs": []
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "source": [
430
+ " import nltk\n",
431
+ " nltk.download('punkt')"
432
+ ],
433
+ "metadata": {
434
+ "colab": {
435
+ "base_uri": "https://localhost:8080/"
436
+ },
437
+ "id": "aUdNZquW4yXo",
438
+ "outputId": "254067bd-9b4e-4e98-b8a0-9c661e6955f3"
439
+ },
440
+ "execution_count": 27,
441
+ "outputs": [
442
+ {
443
+ "output_type": "stream",
444
+ "name": "stderr",
445
+ "text": [
446
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
447
+ "[nltk_data] Package punkt is already up-to-date!\n"
448
+ ]
449
+ },
450
+ {
451
+ "output_type": "execute_result",
452
+ "data": {
453
+ "text/plain": [
454
+ "True"
455
+ ]
456
+ },
457
+ "metadata": {},
458
+ "execution_count": 27
459
+ }
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "source": [
465
+ "import nltk\n",
466
+ "nltk.download('stopwords')"
467
+ ],
468
+ "metadata": {
469
+ "colab": {
470
+ "base_uri": "https://localhost:8080/"
471
+ },
472
+ "id": "09C8uhGu51Vh",
473
+ "outputId": "3cd7a9af-293f-4c3c-a073-92fe26c49bd5"
474
+ },
475
+ "execution_count": 28,
476
+ "outputs": [
477
+ {
478
+ "output_type": "stream",
479
+ "name": "stderr",
480
+ "text": [
481
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
482
+ "[nltk_data] Package stopwords is already up-to-date!\n"
483
+ ]
484
+ },
485
+ {
486
+ "output_type": "execute_result",
487
+ "data": {
488
+ "text/plain": [
489
+ "True"
490
+ ]
491
+ },
492
+ "metadata": {},
493
+ "execution_count": 28
494
+ }
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "source": [
500
+ "# Function for cleaning and preprocessing the resume\n",
501
+ "def clean_resume(resume):\n",
502
+ " if isinstance(resume, str):\n",
503
+ " # Convert to lowercase\n",
504
+ " resume = resume.lower()\n",
505
+ "\n",
506
+ " # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace\n",
507
+ " resume = re.sub('http\\S+\\s*|RT|cc|#\\S+|@\\S+|[^\\x00-\\x7f]|[^\\w\\s]', ' ', resume)\n",
508
+ " resume = re.sub('\\s+', ' ', resume).strip()\n",
509
+ "\n",
510
+ " # Tokenize the resume\n",
511
+ " tokens = nltk.word_tokenize(resume)\n",
512
+ "\n",
513
+ " # Remove stopwords\n",
514
+ " stop_words = set(stopwords.words('english'))\n",
515
+ " tokens = [token for token in tokens if token.lower() not in stop_words]\n",
516
+ "\n",
517
+ " # Join the tokens back into a sentence\n",
518
+ " preprocessed_resume = ' '.join(tokens)\n",
519
+ "\n",
520
+ " return preprocessed_resume\n",
521
+ " else:\n",
522
+ " return ''\n",
523
+ "# Apply the cleaning function to the 'Resumes' column\n",
524
+ "resumes['Resumes'] = resumes['Resumes'].apply(lambda x: clean_resume(x))"
525
+ ],
526
+ "metadata": {
527
+ "id": "TWyPQ63w51kN"
528
+ },
529
+ "execution_count": 30,
530
+ "outputs": []
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "source": [
535
+ "import pandas as pd\n",
536
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n",
537
+ "import torch\n",
538
+ "from torch.utils.data import DataLoader, TensorDataset\n",
539
+ "from tqdm import tqdm\n",
540
+ "\n",
541
+ "# Load the pre-trained model\n",
542
+ "mpnet = \"sentence-transformers/all-mpnet-base-v2\"\n",
543
+ "tokenizer = AutoTokenizer.from_pretrained(mpnet)\n",
544
+ "pretrained_model = AutoModelForMaskedLM.from_pretrained(mpnet)\n",
545
+ "\n",
546
+ "# Assuming 'resumes' is a DataFrame with a column named 'Resumes'\n",
547
+ "texts = resumes['Resumes'].tolist()\n",
548
+ "\n",
549
+ "# Tokenize and encode the unlabeled data\n",
550
+ "encodings = tokenizer(texts, padding=True, truncation = True, return_tensors='pt')\n",
551
+ "\n",
552
+ "# Create a TensorDataset\n",
553
+ "dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])\n",
554
+ "\n",
555
+ "# Move the model to the appropriate device (CPU or GPU)\n",
556
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
557
+ "pretrained_model.to(device)\n",
558
+ "\n",
559
+ "# Initialize the optimizer\n",
560
+ "optimizer = AdamW(pretrained_model.parameters(), lr=2e-5)\n",
561
+ "\n",
562
+ "batch_size = 8\n",
563
+ "epochs = 3\n",
564
+ "import math\n",
565
+ "\n",
566
+ "# Experiment with different chunk sizes\n",
567
+ "chunk_sizes_to_try = [200] # Can add more sizes later\n",
568
+ "\n",
569
+ "for chunk_size in chunk_sizes_to_try:\n",
570
+ " for epoch in range(epochs):\n",
571
+ " tqdm_dataloader = tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=True), desc=f'Epoch {epoch + 1}/{epochs}')\n",
572
+ "\n",
573
+ " pretrained_model.train()\n",
574
+ " for batch in tqdm_dataloader:\n",
575
+ " input_ids, attention_mask = batch\n",
576
+ " input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)\n",
577
+ "\n",
578
+ " # Calculate number of chunks for current batch\n",
579
+ " sequence_length = input_ids.size(1) # Get actual sequence length\n",
580
+ " num_chunks = math.ceil(sequence_length / chunk_size)\n",
581
+ "\n",
582
+ " for i in range(num_chunks):\n",
583
+ " start_idx = i * chunk_size\n",
584
+ " end_idx = min((i + 1) * chunk_size, sequence_length) # Handle final chunk\n",
585
+ "\n",
586
+ " # Extract chunk data\n",
587
+ " input_ids_chunk = input_ids[:, start_idx:end_idx]\n",
588
+ " attention_mask_chunk = attention_mask[:, start_idx:end_idx]\n",
589
+ "\n",
590
+ " # Forward pass\n",
591
+ " outputs = pretrained_model(\n",
592
+ " input_ids_chunk, attention_mask=attention_mask_chunk, labels=input_ids_chunk.reshape(-1)\n",
593
+ " )\n",
594
+ "\n",
595
+ " # Calculate loss\n",
596
+ " loss = outputs.loss\n",
597
+ "\n",
598
+ " # Backward pass and optimization\n",
599
+ " optimizer.zero_grad()\n",
600
+ " loss.backward()\n",
601
+ " optimizer.step()\n",
602
+ "\n",
603
+ " # Update progress bar\n",
604
+ " tqdm_dataloader.set_postfix({'Loss': loss.item(), 'Chunk Size': chunk_size})"
605
+ ],
606
+ "metadata": {
607
+ "colab": {
608
+ "base_uri": "https://localhost:8080/"
609
+ },
610
+ "id": "kypmxXhz4ybO",
611
+ "outputId": "a142f965-498a-4f33-ffbb-028f88f27d51"
612
+ },
613
+ "execution_count": 43,
614
+ "outputs": [
615
+ {
616
+ "output_type": "stream",
617
+ "name": "stderr",
618
+ "text": [
619
+ "Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']\n",
620
+ "- This IS expected if you are initializing MPNetForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
621
+ "- This IS NOT expected if you are initializing MPNetForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
622
+ "Some weights of MPNetForMaskedLM were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']\n",
623
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
624
+ "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
625
+ " warnings.warn(\n",
626
+ "Epoch 1/3: 100%|██████████| 750/750 [11:46<00:00, 1.06it/s, Loss=0.057, Chunk Size=200]\n",
627
+ "Epoch 2/3: 100%|██████████| 750/750 [11:47<00:00, 1.06it/s, Loss=0.0571, Chunk Size=200]\n",
628
+ "Epoch 3/3: 100%|██████████| 750/750 [11:47<00:00, 1.06it/s, Loss=0.0464, Chunk Size=200]\n"
629
+ ]
630
+ }
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "source": [
636
+ "# Save the fine-tuned model\n",
637
+ "pretrained_model.save_pretrained('fine_tuned_mpnet')\n",
638
+ "tokenizer.save_pretrained('fine_tuned_mpnet')"
639
+ ],
640
+ "metadata": {
641
+ "colab": {
642
+ "base_uri": "https://localhost:8080/"
643
+ },
644
+ "id": "U-mZPfa8Sipl",
645
+ "outputId": "fc93a178-aaf4-415b-f8e2-bba93a832052"
646
+ },
647
+ "execution_count": 44,
648
+ "outputs": [
649
+ {
650
+ "output_type": "execute_result",
651
+ "data": {
652
+ "text/plain": [
653
+ "('fine_tuned_mpnet/tokenizer_config.json',\n",
654
+ " 'fine_tuned_mpnet/special_tokens_map.json',\n",
655
+ " 'fine_tuned_mpnet/vocab.txt',\n",
656
+ " 'fine_tuned_mpnet/added_tokens.json',\n",
657
+ " 'fine_tuned_mpnet/tokenizer.json')"
658
+ ]
659
+ },
660
+ "metadata": {},
661
+ "execution_count": 44
662
+ }
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "source": [],
668
+ "metadata": {
669
+ "id": "fnD7hsloTA1i"
670
+ },
671
+ "execution_count": null,
672
+ "outputs": []
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "source": [],
677
+ "metadata": {
678
+ "id": "LEUEojrfTBB0"
679
+ },
680
+ "execution_count": null,
681
+ "outputs": []
682
+ }
683
+ ]
684
+ }
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sentence-transformers/all-mpnet-base-v2",
3
+ "architectures": [
4
+ "MPNetForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "mpnet",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "relative_attention_num_buckets": 32,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.35.2",
23
+ "vocab_size": 30527
24
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae792036540d52db6ef280faae80757c247ef848bc4ae66ff8ad5effc4ad232a
3
+ size 438097372
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "104": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30526": {
44
+ "content": "<mask>",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": true,
54
+ "cls_token": "<s>",
55
+ "do_lower_case": true,
56
+ "eos_token": "</s>",
57
+ "mask_token": "<mask>",
58
+ "max_length": 128,
59
+ "model_max_length": 512,
60
+ "pad_to_multiple_of": null,
61
+ "pad_token": "<pad>",
62
+ "pad_token_type_id": 0,
63
+ "padding_side": "right",
64
+ "sep_token": "</s>",
65
+ "stride": 0,
66
+ "strip_accents": null,
67
+ "tokenize_chinese_chars": true,
68
+ "tokenizer_class": "MPNetTokenizer",
69
+ "truncation_side": "right",
70
+ "truncation_strategy": "longest_first",
71
+ "unk_token": "[UNK]"
72
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff