Pietro Lesci committed on
Commit
ebbb0ba
1 Parent(s): c7908b4

remove dev

Browse files
Files changed (4) hide show
  1. Dockerfile +0 -30
  2. Makefile +0 -42
  3. notebooks/wordifier_nb.ipynb +0 -794
  4. pytest.ini +0 -4
Dockerfile DELETED
@@ -1,30 +0,0 @@
1
- ###############################################################################
2
- # main
3
- ###############################################################################
4
-
5
- FROM continuumio/miniconda3:4.8.2 AS main
6
-
7
- # RUN apt-get -y update && \
8
- # apt-get -y install build-essential
9
- RUN conda update -n base -c defaults conda
10
-
11
- # chown changes owner from root owner (1000) to the first user inside the env (100)
12
- # COPY --chown=1000:100 requirements.txt /opt/requirements.txt
13
- # RUN conda install --force-reinstall -y -q --name base -c conda-forge --file /opt/requirements.txt
14
- RUN conda install --force-reinstall -y -q --name base pip
15
-
16
- COPY . /var/app/
17
- # WORKDIR /var/dev
18
- WORKDIR /var/app
19
- RUN pip install -r dev-requirements.txt
20
- CMD streamlit run ./app.py
21
-
22
- ###############################################################################
23
- # test
24
- ###############################################################################
25
-
26
- FROM main AS test
27
- COPY . /var/dev/
28
- WORKDIR /var/dev
29
- # add unit test instruction here: RUN xxxxxx
30
- # add integration test instruction here: RUN xxxxx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Makefile DELETED
@@ -1,42 +0,0 @@
1
- .PHONY: help build dev integration-test push
2
- .DEFAULT_GOAL := help
3
-
4
- # Docker image build info
5
- PROJECT:=wordify
6
- BUILD_TAG?=v0.1
7
-
8
- ALL_IMAGES:=src
9
-
10
- help:
11
- # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
12
- @echo "python starter project"
13
- @echo "====================="
14
- @echo "Replace % with a directory name (e.g., make build/python-example)"
15
- @echo
16
- @grep -E '^[a-zA-Z0-9_%/-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
17
-
18
- ########################################################
19
- ## Local development
20
- ########################################################
21
-
22
- dev: ARGS?=/bin/bash
23
- dev: DARGS?=-v "${CURDIR}":/var/dev
24
- dev: ## run a foreground container
25
- docker run -it --rm -p 8501:8501 $(DARGS) $(PROJECT):${BUILD_TAG} $(ARGS)
26
-
27
-
28
- notebook: ARGS?=jupyter lab
29
- notebook: DARGS?=-v "${CURDIR}":/var/dev -p 8888:8888 ##notebook is served on http://0.0.0.0:8888 by default; if port 8888 is already in use, map a different host port, e.g. 8899:8888
30
- notebook: ## run a foreground container
31
- docker run -it --rm $(DARGS) $(PROJECT) $(ARGS) \
32
- --ip=0.0.0.0 \
33
- --allow-root \
34
- --NotebookApp.token="" \
35
- --NotebookApp.password=""
36
-
37
- build: DARGS?=
38
- build: ## build the latest image for a project
39
- docker build $(DARGS) --build-arg BUILD_TAG=${BUILD_TAG} --rm --force-rm -t $(PROJECT):${BUILD_TAG} .
40
-
41
- run:
42
- docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 8501:8501 $(PROJECT):${BUILD_TAG}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/wordifier_nb.ipynb DELETED
@@ -1,794 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 65,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import sys\n",
10
- "sys.path.insert(0, \"..\")\n",
11
- "import vaex\n",
12
- "from vaex.ml import LabelEncoder\n",
13
- "import spacy\n",
14
- "import pandas as pd\n",
15
- "from tqdm import tqdm\n",
16
- "import os\n",
17
- "import multiprocessing as mp\n",
18
- "from src.preprocessing import PreprocessingPipeline, encode\n",
19
- "from src.wordifier import ModelConfigs\n",
20
- "from sklearn.pipeline import Pipeline\n",
21
- "from sklearn.linear_model import LogisticRegression\n",
22
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23
- "import numpy as np"
24
- ]
25
- },
26
- {
27
- "cell_type": "code",
28
- "execution_count": 67,
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "pipe = PreprocessingPipeline(\n",
33
- " language=\"English\",\n",
34
- " pre_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
35
- " lemmatization_step=list(PreprocessingPipeline.lemmatization_component().keys())[1],\n",
36
- " post_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
37
- ")"
38
- ]
39
- },
40
- {
41
- "cell_type": "code",
42
- "execution_count": 68,
43
- "metadata": {},
44
- "outputs": [],
45
- "source": [
46
- "def fn(t):\n",
47
- " return pipe.post(pipe.lemma(pipe.nlp(pipe.pre(t))))"
48
- ]
49
- },
50
- {
51
- "cell_type": "code",
52
- "execution_count": 69,
53
- "metadata": {},
54
- "outputs": [],
55
- "source": [
56
- "vdf = vaex.from_pandas(df)\n",
57
- "vdf[\"processed_text\"] = vdf.apply(fn, arguments=[vdf[\"text\"]], vectorize=False)\n",
58
- "df = vdf.to_pandas_df()"
59
- ]
60
- },
61
- {
62
- "cell_type": "code",
63
- "execution_count": 71,
64
- "metadata": {},
65
- "outputs": [
66
- {
67
- "name": "stderr",
68
- "output_type": "stream",
69
- "text": [
70
- "2021-11-28 17:01:36.883 \n",
71
- " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
72
- " command:\n",
73
- "\n",
74
- " streamlit run /Users/pietrolesci/miniconda3/envs/wordify/lib/python3.7/site-packages/ipykernel_launcher.py [ARGUMENTS]\n"
75
- ]
76
- }
77
- ],
78
- "source": [
79
- "import streamlit as st\n",
80
- "pbar = st.progress(0)\n",
81
- "N = 100\n",
82
- "for i, _ in enumerate(range(N)):\n",
83
- " if i % N == 0:\n",
84
- " pbar.progress(1)"
85
- ]
86
- },
87
- {
88
- "cell_type": "code",
89
- "execution_count": null,
90
- "metadata": {},
91
- "outputs": [],
92
- "source": []
93
- },
94
- {
95
- "cell_type": "code",
96
- "execution_count": 24,
97
- "metadata": {},
98
- "outputs": [],
99
- "source": [
100
- "configs = ModelConfigs\n",
101
- "clf = Pipeline(\n",
102
- " [\n",
103
- " (\"tfidf\", TfidfVectorizer()),\n",
104
- " (\n",
105
- " \"classifier\",\n",
106
- " LogisticRegression(\n",
107
- " penalty=\"l1\",\n",
108
- " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
109
- " solver=\"liblinear\",\n",
110
- " multi_class=\"auto\",\n",
111
- " max_iter=500,\n",
112
- " class_weight=\"balanced\",\n",
113
- " ),\n",
114
- " ),\n",
115
- " ]\n",
116
- ")\n"
117
- ]
118
- },
119
- {
120
- "cell_type": "code",
121
- "execution_count": 29,
122
- "metadata": {},
123
- "outputs": [
124
- {
125
- "data": {
126
- "text/plain": [
127
- "Pipeline(steps=[('tfidf', TfidfVectorizer()),\n",
128
- " ('classifier',\n",
129
- " LogisticRegression(C=1, class_weight='balanced', max_iter=500,\n",
130
- " penalty='l1', solver='liblinear'))])"
131
- ]
132
- },
133
- "execution_count": 29,
134
- "metadata": {},
135
- "output_type": "execute_result"
136
- }
137
- ],
138
- "source": [
139
- "clf.fit(df[\"text\"], df[\"label\"])"
140
- ]
141
- },
142
- {
143
- "cell_type": "code",
144
- "execution_count": 39,
145
- "metadata": {},
146
- "outputs": [
147
- {
148
- "data": {
149
- "text/plain": [
150
- "array(['00', '000', '00001', ..., 'ís', 'über', 'überwoman'], dtype=object)"
151
- ]
152
- },
153
- "execution_count": 39,
154
- "metadata": {},
155
- "output_type": "execute_result"
156
- }
157
- ],
158
- "source": []
159
- },
160
- {
161
- "cell_type": "code",
162
- "execution_count": 40,
163
- "metadata": {},
164
- "outputs": [],
165
- "source": [
166
- "def wordifier(df, text_col, label_col, configs=ModelConfigs):\n",
167
- "\n",
168
- " n_instances, n_features = X.shape\n",
169
- " n_classes = np.unique(y)\n",
170
- "\n",
171
- " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
172
- " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
173
- "\n",
174
- " sample_size = min(\n",
175
- " # this is the maximum supported\n",
176
- " configs.MAX_SELECTION.value,\n",
177
- " # at minimum you want MIN_SELECTION but in general you want\n",
178
- " # n_instances * sample_fraction\n",
179
- " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
180
- " # however, if the previous value is bigger than the available instances, take\n",
181
- " # the number of available instances\n",
182
- " n_instances,\n",
183
- " )\n",
184
- "\n",
185
- " # TODO: might want to try out something to subsample features at each iteration\n",
186
- "\n",
187
- " # initialize coefficient matrices\n",
188
- " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
189
- " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
190
- "\n",
191
- " for _ in range(configs.NUM_ITERS.value):\n",
192
- "\n",
193
- " # run randomized regression\n",
194
- " clf = Pipeline([\n",
195
- " ('tfidf', TfidfVectorizer()), \n",
196
- " ('classifier', LogisticRegression(\n",
197
- " penalty=\"l1\",\n",
198
- " C=configs.PENALTIES.value[\n",
199
- " np.random.randint(len(configs.PENALTIES.value))\n",
200
- " ],\n",
201
- " solver=\"liblinear\",\n",
202
- " multi_class=\"auto\",\n",
203
- " max_iter=500,\n",
204
- " class_weight=\"balanced\",\n",
205
- " ))]\n",
206
- " )\n",
207
- "\n",
208
- " # sample indices to subsample matrix\n",
209
- " selection = resample(\n",
210
- " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
211
- " )\n",
212
- "\n",
213
- " # fit\n",
214
- " try:\n",
215
- " clf.fit(X[selection], y[selection])\n",
216
- " except ValueError:\n",
217
- " continue\n",
218
- "\n",
219
- " # record coefficients\n",
220
- " if n_classes == 2:\n",
221
- " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
222
- " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
223
- " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
224
- " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
225
- " else:\n",
226
- " pos_scores += clf.coef_ > 0\n",
227
- " neg_scores += clf.coef_ < 0\n",
228
- "\n",
229
- "\n",
230
- " # normalize\n",
231
- " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
232
- " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
233
- "\n",
234
- " # get only active features\n",
235
- " pos_positions = np.where(\n",
236
- " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
237
- " )\n",
238
- " neg_positions = np.where(\n",
239
- " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
240
- " )\n",
241
- "\n",
242
- " # prepare DataFrame\n",
243
- " X_names = clf.steps[0][1].get_feature_names_out()\n",
244
- " pos = [\n",
245
- " (X_names[i], pos_scores[c, i], y_names[c])\n",
246
- " for c, i in zip(*pos_positions.nonzero())\n",
247
- " ]\n",
248
- " neg = [\n",
249
- " (X_names[i], neg_scores[c, i], y_names[c])\n",
250
- " for c, i in zip(*neg_positions.nonzero())\n",
251
- " ]\n",
252
- "\n",
253
- " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
254
- " [\"label\", \"score\"], ascending=False\n",
255
- " )\n",
256
- " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
257
- " [\"label\", \"score\"], ascending=False\n",
258
- " )\n",
259
- "\n",
260
- " return posdf, negdf"
261
- ]
262
- },
263
- {
264
- "cell_type": "code",
265
- "execution_count": 41,
266
- "metadata": {},
267
- "outputs": [],
268
- "source": [
269
- "res = vdf.apply(wordifier, arguments=[vdf.processed_text, vdf.encoded_label], vectorize=False)"
270
- ]
271
- },
272
- {
273
- "cell_type": "code",
274
- "execution_count": 45,
275
- "metadata": {},
276
- "outputs": [],
277
- "source": [
278
- "from vaex.ml.sklearn import Predictor"
279
- ]
280
- },
281
- {
282
- "cell_type": "code",
283
- "execution_count": 60,
284
- "metadata": {},
285
- "outputs": [],
286
- "source": [
287
- "clf = Pipeline(\n",
288
- " [\n",
289
- " (\n",
290
- " \"tfidf\",\n",
291
- " TfidfVectorizer(\n",
292
- " input=\"content\", # default: file already in memory\n",
293
- " encoding=\"utf-8\", # default\n",
294
- " decode_error=\"strict\", # default\n",
295
- " strip_accents=None, # do nothing\n",
296
- " lowercase=False, # do nothing\n",
297
- " preprocessor=None, # do nothing - default\n",
298
- " tokenizer=None, # default\n",
299
- " stop_words=None, # do nothing\n",
300
- " analyzer=\"word\",\n",
301
- " ngram_range=(1, 3), # maximum 3-ngrams\n",
302
- " min_df=0.001,\n",
303
- " max_df=0.75,\n",
304
- " sublinear_tf=True,\n",
305
- " ),\n",
306
- " ),\n",
307
- " (\n",
308
- " \"classifier\",\n",
309
- " LogisticRegression(\n",
310
- " penalty=\"l1\",\n",
311
- " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
312
- " solver=\"liblinear\",\n",
313
- " multi_class=\"auto\",\n",
314
- " max_iter=500,\n",
315
- " class_weight=\"balanced\",\n",
316
- " ),\n",
317
- " ),\n",
318
- " ]\n",
319
- ")\n",
320
- "\n",
321
- "vaex_model = Predictor(\n",
322
- " features=[\"processed_text\"],\n",
323
- " target=\"encoded_label\",\n",
324
- " model=clf,\n",
325
- " prediction_name=\"prediction\",\n",
326
- ")\n"
327
- ]
328
- },
329
- {
330
- "cell_type": "code",
331
- "execution_count": 61,
332
- "metadata": {},
333
- "outputs": [
334
- {
335
- "ename": "TypeError",
336
- "evalue": "unhashable type: 'list'",
337
- "output_type": "error",
338
- "traceback": [
339
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
340
- "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
341
- "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
342
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
343
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
344
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
345
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
346
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
347
- "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
348
- ]
349
- }
350
- ],
351
- "source": [
352
- "vaex_model.fit(vdf)"
353
- ]
354
- },
355
- {
356
- "cell_type": "code",
357
- "execution_count": null,
358
- "metadata": {},
359
- "outputs": [],
360
- "source": []
361
- },
362
- {
363
- "cell_type": "code",
364
- "execution_count": 52,
365
- "metadata": {},
366
- "outputs": [
367
- {
368
- "data": {
369
- "text/plain": [
370
- "b'\\x80\\x03c__main__\\nwordifier\\nq\\x00.'"
371
- ]
372
- },
373
- "execution_count": 52,
374
- "metadata": {},
375
- "output_type": "execute_result"
376
- }
377
- ],
378
- "source": [
379
- "import pickle\n",
380
- "pickle.dumps(wordifier)"
381
- ]
382
- },
383
- {
384
- "cell_type": "code",
385
- "execution_count": 47,
386
- "metadata": {},
387
- "outputs": [
388
- {
389
- "ename": "TypeError",
390
- "evalue": "unhashable type: 'list'",
391
- "output_type": "error",
392
- "traceback": [
393
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
394
- "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
395
- "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
396
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
397
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
398
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
399
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
400
- "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
401
- "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
402
- ]
403
- }
404
- ],
405
- "source": []
406
- },
407
- {
408
- "cell_type": "code",
409
- "execution_count": null,
410
- "metadata": {},
411
- "outputs": [],
412
- "source": []
413
- },
414
- {
415
- "cell_type": "code",
416
- "execution_count": null,
417
- "metadata": {},
418
- "outputs": [],
419
- "source": [
420
- "res = []\n",
421
- "with tqdm(total=len(df)) as pbar:\n",
422
- " for doc in tqdm(nlp.pipe(df[\"text\"].values, batch_size=500, n_process=n_cpus)):\n",
423
- " res.append([i.lemma_ for i in doc])\n",
424
- " pbar.update(1)"
425
- ]
426
- },
427
- {
428
- "cell_type": "code",
429
- "execution_count": null,
430
- "metadata": {},
431
- "outputs": [],
432
- "source": [
433
- "import pickle"
434
- ]
435
- },
436
- {
437
- "cell_type": "code",
438
- "execution_count": null,
439
- "metadata": {},
440
- "outputs": [],
441
- "source": [
442
- "def fn(t):\n",
443
- " return "
444
- ]
445
- },
446
- {
447
- "cell_type": "code",
448
- "execution_count": null,
449
- "metadata": {},
450
- "outputs": [],
451
- "source": [
452
- "%%timeit\n",
453
- "with mp.Pool(mp.cpu_count()) as pool:\n",
454
- " new_s = pool.map(nlp, df[\"text\"].values)"
455
- ]
456
- },
457
- {
458
- "cell_type": "code",
459
- "execution_count": null,
460
- "metadata": {},
461
- "outputs": [],
462
- "source": []
463
- },
464
- {
465
- "cell_type": "code",
466
- "execution_count": null,
467
- "metadata": {},
468
- "outputs": [],
469
- "source": []
470
- },
471
- {
472
- "cell_type": "code",
473
- "execution_count": null,
474
- "metadata": {},
475
- "outputs": [],
476
- "source": [
477
- "from typing import List\n",
478
- "import numpy as np\n",
479
- "import pandas as pd\n",
480
- "import streamlit as st\n",
481
- "from sklearn.linear_model import LogisticRegression\n",
482
- "from sklearn.utils import resample\n",
483
- "\n",
484
- "from src.configs import ModelConfigs\n",
485
- "\n",
486
- "\n",
487
- "def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):\n",
488
- "\n",
489
- " n_instances, n_features = X.shape\n",
490
- " n_classes = len(y_names)\n",
491
- "\n",
492
- " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
493
- " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
494
- "\n",
495
- " sample_size = min(\n",
496
- " # this is the maximum supported\n",
497
- " configs.MAX_SELECTION.value,\n",
498
- " # at minimum you want MIN_SELECTION but in general you want\n",
499
- " # n_instances * sample_fraction\n",
500
- " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
501
- " # however if previous one is bigger the the available instances take\n",
502
- " # the number of available instances\n",
503
- " n_instances,\n",
504
- " )\n",
505
- "\n",
506
- " # TODO: might want to try out something to subsample features at each iteration\n",
507
- "\n",
508
- " # initialize coefficient matrices\n",
509
- " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
510
- " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
511
- "\n",
512
- " with st.spinner(\"Wordifying!\"):\n",
513
- " pbar = st.progress(0)\n",
514
- "\n",
515
- " for i, _ in enumerate(range(configs.NUM_ITERS.value)):\n",
516
- "\n",
517
- " # run randomized regression\n",
518
- " clf = LogisticRegression(\n",
519
- " penalty=\"l1\",\n",
520
- " C=configs.PENALTIES.value[\n",
521
- " np.random.randint(len(configs.PENALTIES.value))\n",
522
- " ],\n",
523
- " solver=\"liblinear\",\n",
524
- " multi_class=\"auto\",\n",
525
- " max_iter=500,\n",
526
- " class_weight=\"balanced\",\n",
527
- " )\n",
528
- "\n",
529
- " # sample indices to subsample matrix\n",
530
- " selection = resample(\n",
531
- " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
532
- " )\n",
533
- "\n",
534
- " # fit\n",
535
- " try:\n",
536
- " clf.fit(X[selection], y[selection])\n",
537
- " except ValueError:\n",
538
- " continue\n",
539
- "\n",
540
- " # record coefficients\n",
541
- " if n_classes == 2:\n",
542
- " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
543
- " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
544
- " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
545
- " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
546
- " else:\n",
547
- " pos_scores += clf.coef_ > 0\n",
548
- " neg_scores += clf.coef_ < 0\n",
549
- "\n",
550
- " pbar.progress(i + 1)\n",
551
- "\n",
552
- " # normalize\n",
553
- " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
554
- " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
555
- "\n",
556
- " # get only active features\n",
557
- " pos_positions = np.where(\n",
558
- " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
559
- " )\n",
560
- " neg_positions = np.where(\n",
561
- " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
562
- " )\n",
563
- "\n",
564
- " # prepare DataFrame\n",
565
- " pos = [\n",
566
- " (X_names[i], pos_scores[c, i], y_names[c])\n",
567
- " for c, i in zip(*pos_positions.nonzero())\n",
568
- " ]\n",
569
- " neg = [\n",
570
- " (X_names[i], neg_scores[c, i], y_names[c])\n",
571
- " for c, i in zip(*neg_positions.nonzero())\n",
572
- " ]\n",
573
- "\n",
574
- " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
575
- " [\"label\", \"score\"], ascending=False\n",
576
- " )\n",
577
- " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
578
- " [\"label\", \"score\"], ascending=False\n",
579
- " )\n",
580
- "\n",
581
- " return posdf, negdf\n"
582
- ]
583
- },
584
- {
585
- "cell_type": "code",
586
- "execution_count": null,
587
- "metadata": {},
588
- "outputs": [],
589
- "source": [
590
- "path = \"../../../../Downloads/wordify_10000_copy.xlsx\""
591
- ]
592
- },
593
- {
594
- "cell_type": "code",
595
- "execution_count": null,
596
- "metadata": {},
597
- "outputs": [],
598
- "source": [
599
- "df = pd.read_excel(path, dtype=str).dropna()"
600
- ]
601
- },
602
- {
603
- "cell_type": "code",
604
- "execution_count": null,
605
- "metadata": {},
606
- "outputs": [],
607
- "source": [
608
- "# df = pd.read_excel(\"../data/test_de.xlsx\")\n",
609
- "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
610
- "language = \"English\"\n",
611
- "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
612
- ]
613
- },
614
- {
615
- "cell_type": "code",
616
- "execution_count": null,
617
- "metadata": {},
618
- "outputs": [],
619
- "source": [
620
- "prep = TextPreprocessor(\n",
621
- " language=\"English\", \n",
622
- " cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n",
623
- " lemmatizer_when=None,\n",
624
- ")"
625
- ]
626
- },
627
- {
628
- "cell_type": "code",
629
- "execution_count": null,
630
- "metadata": {},
631
- "outputs": [],
632
- "source": [
633
- "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
634
- ]
635
- },
636
- {
637
- "cell_type": "code",
638
- "execution_count": null,
639
- "metadata": {},
640
- "outputs": [],
641
- "source": [
642
- "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()"
643
- ]
644
- },
645
- {
646
- "cell_type": "code",
647
- "execution_count": null,
648
- "metadata": {},
649
- "outputs": [],
650
- "source": [
651
- "clf = LogisticRegression(\n",
652
- " penalty=\"l1\",\n",
653
- " C=0.05,#ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
654
- " solver=\"liblinear\",\n",
655
- " multi_class=\"auto\",\n",
656
- " max_iter=500,\n",
657
- " class_weight=\"balanced\",\n",
658
- ")"
659
- ]
660
- },
661
- {
662
- "cell_type": "code",
663
- "execution_count": null,
664
- "metadata": {},
665
- "outputs": [],
666
- "source": [
667
- "%%time\n",
668
- "clf.fit(X, y)"
669
- ]
670
- },
671
- {
672
- "cell_type": "code",
673
- "execution_count": null,
674
- "metadata": {},
675
- "outputs": [],
676
- "source": []
677
- },
678
- {
679
- "cell_type": "code",
680
- "execution_count": null,
681
- "metadata": {},
682
- "outputs": [],
683
- "source": [
684
- "n_instances, n_features = X.shape\n",
685
- "n_classes = len(y_names)\n",
686
- "\n",
687
- "# NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
688
- "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
689
- "\n",
690
- "sample_size = min(\n",
691
- " # this is the maximum supported\n",
692
- " ModelConfigs.MAX_SELECTION.value,\n",
693
- " # at minimum you want MIN_SELECTION but in general you want\n",
694
- " # n_instances * sample_fraction\n",
695
- " max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
696
- " # however if previous one is bigger the the available instances take\n",
697
- " # the number of available instances\n",
698
- " n_instances,\n",
699
- ")\n",
700
- "\n",
701
- "# TODO: might want to try out something to subsample features at each iteration\n",
702
- "\n",
703
- "# initialize coefficient matrices\n",
704
- "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
705
- "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
706
- "\n",
707
- "for _ in trange(ModelConfigs.NUM_ITERS.value):\n",
708
- "\n",
709
- " # run randomized regression\n",
710
- " clf = LogisticRegression(\n",
711
- " penalty=\"l1\",\n",
712
- " C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
713
- " solver=\"liblinear\",\n",
714
- " multi_class=\"auto\",\n",
715
- " max_iter=500,\n",
716
- " class_weight=\"balanced\",\n",
717
- " )\n",
718
- "\n",
719
- " # sample indices to subsample matrix\n",
720
- " selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n",
721
- "\n",
722
- " # fit\n",
723
- " try:\n",
724
- " clf.fit(X[selection], y[selection])\n",
725
- " except ValueError:\n",
726
- " continue\n",
727
- "\n",
728
- " # record coefficients\n",
729
- " if n_classes == 2:\n",
730
- " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
731
- " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
732
- " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
733
- " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
734
- " else:\n",
735
- " pos_scores += clf.coef_ > 0\n",
736
- " neg_scores += clf.coef_ < 0"
737
- ]
738
- },
739
- {
740
- "cell_type": "code",
741
- "execution_count": null,
742
- "metadata": {},
743
- "outputs": [],
744
- "source": [
745
- "# normalize\n",
746
- "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n",
747
- "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n",
748
- "\n",
749
- "# get only active features\n",
750
- "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n",
751
- "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n",
752
- "\n",
753
- "# prepare DataFrame\n",
754
- "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n",
755
- "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n",
756
- "\n",
757
- "posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)\n",
758
- "negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)"
759
- ]
760
- },
761
- {
762
- "cell_type": "code",
763
- "execution_count": null,
764
- "metadata": {},
765
- "outputs": [],
766
- "source": []
767
- }
768
- ],
769
- "metadata": {
770
- "interpreter": {
771
- "hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f"
772
- },
773
- "kernelspec": {
774
- "display_name": "Python 3.8.3 64-bit ('py38': conda)",
775
- "language": "python",
776
- "name": "python3"
777
- },
778
- "language_info": {
779
- "codemirror_mode": {
780
- "name": "ipython",
781
- "version": 3
782
- },
783
- "file_extension": ".py",
784
- "mimetype": "text/x-python",
785
- "name": "python",
786
- "nbconvert_exporter": "python",
787
- "pygments_lexer": "ipython3",
788
- "version": "3.7.11"
789
- },
790
- "orig_nbformat": 2
791
- },
792
- "nbformat": 4,
793
- "nbformat_minor": 2
794
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pytest.ini DELETED
@@ -1,4 +0,0 @@
1
- [pytest]
2
- markers =
3
- cache_tests: mark a test which is about the recurrence computer cache
4
- seed_tests: mark a test which is about the seed sequence