Michelangiolo commited on
Commit
f2092a2
1 Parent(s): 8b29749
Files changed (5) hide show
  1. _test.ipynb +429 -0
  2. app.py +35 -25
  3. data_manipulation.ipynb +364 -18
  4. df_encoded2.parquet +3 -0
  5. df_encoded3.parquet +3 -0
_test.ipynb ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "# os.system('pip install openpyxl')\n",
11
+ "# os.system('pip install sentence-transformers')\n",
12
+ "import pandas as pd\n",
13
+ "import gradio as gr\n",
14
+ "from sentence_transformers import SentenceTransformer\n",
15
+ "\n",
16
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
17
+ "\n",
18
+ "df = pd.read_parquet('df_encoded3.parquet')\n",
19
+ "df['tags'] = df['tags'].apply(lambda x : str(x))\n",
20
+ "def parse_raised(x):\n",
21
+ " if x == 'Undisclosed':\n",
22
+ " return 0\n",
23
+ " else: \n",
24
+ " quantifier = x[-1]\n",
25
+ " x = float(x[1:-1])\n",
26
+ " if quantifier == 'K':\n",
27
+ " return x/1000\n",
28
+ " elif quantifier == 'M':\n",
29
+ " return x\n",
30
+ "df['raised'] = df['raised'].apply(lambda x : parse_raised(x))\n",
31
+ "df['stage'] = df['stage'].apply(lambda x : x.lower())\n",
32
+ "df = df.reset_index(drop=True)\n",
33
+ "\n",
34
+ "from sklearn.neighbors import NearestNeighbors\n",
35
+ "import pandas as pd\n",
36
+ "from sentence_transformers import SentenceTransformer\n",
37
+ "\n",
38
+ "nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
39
+ "\n",
40
+ "def search(df, query):\n",
41
+ " product = model.encode(query).tolist()\n",
42
+ " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
43
+ "\n",
44
+ " #prepare model\n",
45
+ " # \n",
46
+ " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
47
+ "\n",
48
+ " #print out the description of every recommended product\n",
49
+ " return df.iloc[list(indices)[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']]"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 44,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stderr",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
62
+ " warnings.warn(\n"
63
+ ]
64
+ },
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "Running on local URL: http://127.0.0.1:7884\n",
70
+ "\n",
71
+ "To create a public link, set `share=True` in `launch()`.\n"
72
+ ]
73
+ },
74
+ {
75
+ "data": {
76
+ "text/html": [
77
+ "<div><iframe src=\"http://127.0.0.1:7884/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
78
+ ],
79
+ "text/plain": [
80
+ "<IPython.core.display.HTML object>"
81
+ ]
82
+ },
83
+ "metadata": {},
84
+ "output_type": "display_data"
85
+ },
86
+ {
87
+ "data": {
88
+ "text/plain": []
89
+ },
90
+ "execution_count": 44,
91
+ "metadata": {},
92
+ "output_type": "execute_result"
93
+ },
94
+ {
95
+ "data": {
96
+ "text/html": [
97
+ "<div>\n",
98
+ "<style scoped>\n",
99
+ " .dataframe tbody tr th:only-of-type {\n",
100
+ " vertical-align: middle;\n",
101
+ " }\n",
102
+ "\n",
103
+ " .dataframe tbody tr th {\n",
104
+ " vertical-align: top;\n",
105
+ " }\n",
106
+ "\n",
107
+ " .dataframe thead th {\n",
108
+ " text-align: right;\n",
109
+ " }\n",
110
+ "</style>\n",
111
+ "<table border=\"1\" class=\"dataframe\">\n",
112
+ " <thead>\n",
113
+ " <tr style=\"text-align: right;\">\n",
114
+ " <th></th>\n",
115
+ " <th>name</th>\n",
116
+ " <th>raised</th>\n",
117
+ " <th>target</th>\n",
118
+ " <th>size</th>\n",
119
+ " <th>stage</th>\n",
120
+ " <th>country</th>\n",
121
+ " <th>source</th>\n",
122
+ " <th>description</th>\n",
123
+ " <th>tags</th>\n",
124
+ " </tr>\n",
125
+ " </thead>\n",
126
+ " <tbody>\n",
127
+ " <tr>\n",
128
+ " <th>78931</th>\n",
129
+ " <td>Developeration</td>\n",
130
+ " <td>Undisclosed</td>\n",
131
+ " <td>Undisclosed</td>\n",
132
+ " <td>11-500+</td>\n",
133
+ " <td>c</td>\n",
134
+ " <td>sweden</td>\n",
135
+ " <td>https://www.startupblink.com</td>\n",
136
+ " <td>Developeration AB was founded 2016 and is a st...</td>\n",
137
+ " <td>['healthtech']</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>77566</th>\n",
141
+ " <td>ComplyAdvantage</td>\n",
142
+ " <td>Undisclosed</td>\n",
143
+ " <td>Undisclosed</td>\n",
144
+ " <td>11-500+</td>\n",
145
+ " <td>c</td>\n",
146
+ " <td>united-kingdom</td>\n",
147
+ " <td>https://www.startupblink.com</td>\n",
148
+ " <td>We are a financial crime solutions provider co...</td>\n",
149
+ " <td>['fintech']</td>\n",
150
+ " </tr>\n",
151
+ " <tr>\n",
152
+ " <th>78674</th>\n",
153
+ " <td>Atlas</td>\n",
154
+ " <td>Undisclosed</td>\n",
155
+ " <td>Undisclosed</td>\n",
156
+ " <td>11-500+</td>\n",
157
+ " <td>c</td>\n",
158
+ " <td>russia</td>\n",
159
+ " <td>https://www.startupblink.com</td>\n",
160
+ " <td>Atlas Biomedical Holding is developing a netwo...</td>\n",
161
+ " <td>['healthtech']</td>\n",
162
+ " </tr>\n",
163
+ " <tr>\n",
164
+ " <th>81682</th>\n",
165
+ " <td>48 Factoring Inc</td>\n",
166
+ " <td>Undisclosed</td>\n",
167
+ " <td>Undisclosed</td>\n",
168
+ " <td>11-500+</td>\n",
169
+ " <td>c</td>\n",
170
+ " <td>united-states</td>\n",
171
+ " <td>https://www.startupblink.com</td>\n",
172
+ " <td>48 Factoring Inc. is a financial services comp...</td>\n",
173
+ " <td>['fintech']</td>\n",
174
+ " </tr>\n",
175
+ " <tr>\n",
176
+ " <th>78926</th>\n",
177
+ " <td>Xinca</td>\n",
178
+ " <td>Undisclosed</td>\n",
179
+ " <td>Undisclosed</td>\n",
180
+ " <td>11-500+</td>\n",
181
+ " <td>c</td>\n",
182
+ " <td>argentina</td>\n",
183
+ " <td>https://www.startupblink.com</td>\n",
184
+ " <td>Incorporar residuos en la fabricaci&amp;oacute;n d...</td>\n",
185
+ " <td>['energy' 'environment']</td>\n",
186
+ " </tr>\n",
187
+ " <tr>\n",
188
+ " <th>...</th>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>80432</th>\n",
201
+ " <td>Glow</td>\n",
202
+ " <td>Undisclosed</td>\n",
203
+ " <td>Undisclosed</td>\n",
204
+ " <td>11-500+</td>\n",
205
+ " <td>c</td>\n",
206
+ " <td>china</td>\n",
207
+ " <td>https://www.startupblink.com</td>\n",
208
+ " <td>Glow is an ambitious enterprise that uniquely ...</td>\n",
209
+ " <td>['healthtech']</td>\n",
210
+ " </tr>\n",
211
+ " <tr>\n",
212
+ " <th>77716</th>\n",
213
+ " <td>Owiwi</td>\n",
214
+ " <td>Undisclosed</td>\n",
215
+ " <td>Undisclosed</td>\n",
216
+ " <td>11-500+</td>\n",
217
+ " <td>c</td>\n",
218
+ " <td>greece</td>\n",
219
+ " <td>https://www.startupblink.com</td>\n",
220
+ " <td>Owiwi is a fun and engaging psychometric tool ...</td>\n",
221
+ " <td>['software' 'data']</td>\n",
222
+ " </tr>\n",
223
+ " <tr>\n",
224
+ " <th>78561</th>\n",
225
+ " <td>Quantib</td>\n",
226
+ " <td>Undisclosed</td>\n",
227
+ " <td>Undisclosed</td>\n",
228
+ " <td>11-500+</td>\n",
229
+ " <td>c</td>\n",
230
+ " <td>the-netherlands</td>\n",
231
+ " <td>https://www.startupblink.com</td>\n",
232
+ " <td>MRI scan technology to better diagnose -- and ...</td>\n",
233
+ " <td>['healthtech']</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>77554</th>\n",
237
+ " <td>Earnin</td>\n",
238
+ " <td>Undisclosed</td>\n",
239
+ " <td>Undisclosed</td>\n",
240
+ " <td>11-500+</td>\n",
241
+ " <td>c</td>\n",
242
+ " <td>united-states</td>\n",
243
+ " <td>https://www.startupblink.com</td>\n",
244
+ " <td>We're building a platform of community-support...</td>\n",
245
+ " <td>['fintech']</td>\n",
246
+ " </tr>\n",
247
+ " <tr>\n",
248
+ " <th>80694</th>\n",
249
+ " <td>Vibrent Health</td>\n",
250
+ " <td>Undisclosed</td>\n",
251
+ " <td>Undisclosed</td>\n",
252
+ " <td>11-500+</td>\n",
253
+ " <td>c</td>\n",
254
+ " <td>united-states</td>\n",
255
+ " <td>https://www.startupblink.com</td>\n",
256
+ " <td>The future of developing new cures for patient...</td>\n",
257
+ " <td>['healthtech']</td>\n",
258
+ " </tr>\n",
259
+ " </tbody>\n",
260
+ "</table>\n",
261
+ "<p>94 rows × 9 columns</p>\n",
262
+ "</div>"
263
+ ],
264
+ "text/plain": [
265
+ " name raised target size stage \\\n",
266
+ "78931 Developeration Undisclosed Undisclosed 11-500+ c \n",
267
+ "77566 ComplyAdvantage Undisclosed Undisclosed 11-500+ c \n",
268
+ "78674 Atlas Undisclosed Undisclosed 11-500+ c \n",
269
+ "81682 48 Factoring Inc Undisclosed Undisclosed 11-500+ c \n",
270
+ "78926 Xinca Undisclosed Undisclosed 11-500+ c \n",
271
+ "... ... ... ... ... ... \n",
272
+ "80432 Glow Undisclosed Undisclosed 11-500+ c \n",
273
+ "77716 Owiwi Undisclosed Undisclosed 11-500+ c \n",
274
+ "78561 Quantib Undisclosed Undisclosed 11-500+ c \n",
275
+ "77554 Earnin Undisclosed Undisclosed 11-500+ c \n",
276
+ "80694 Vibrent Health Undisclosed Undisclosed 11-500+ c \n",
277
+ "\n",
278
+ " country source \\\n",
279
+ "78931 sweden https://www.startupblink.com \n",
280
+ "77566 united-kingdom https://www.startupblink.com \n",
281
+ "78674 russia https://www.startupblink.com \n",
282
+ "81682 united-states https://www.startupblink.com \n",
283
+ "78926 argentina https://www.startupblink.com \n",
284
+ "... ... ... \n",
285
+ "80432 china https://www.startupblink.com \n",
286
+ "77716 greece https://www.startupblink.com \n",
287
+ "78561 the-netherlands https://www.startupblink.com \n",
288
+ "77554 united-states https://www.startupblink.com \n",
289
+ "80694 united-states https://www.startupblink.com \n",
290
+ "\n",
291
+ " description \\\n",
292
+ "78931 Developeration AB was founded 2016 and is a st... \n",
293
+ "77566 We are a financial crime solutions provider co... \n",
294
+ "78674 Atlas Biomedical Holding is developing a netwo... \n",
295
+ "81682 48 Factoring Inc. is a financial services comp... \n",
296
+ "78926 Incorporar residuos en la fabricaci&oacute;n d... \n",
297
+ "... ... \n",
298
+ "80432 Glow is an ambitious enterprise that uniquely ... \n",
299
+ "77716 Owiwi is a fun and engaging psychometric tool ... \n",
300
+ "78561 MRI scan technology to better diagnose -- and ... \n",
301
+ "77554 We're building a platform of community-support... \n",
302
+ "80694 The future of developing new cures for patient... \n",
303
+ "\n",
304
+ " tags \n",
305
+ "78931 ['healthtech'] \n",
306
+ "77566 ['fintech'] \n",
307
+ "78674 ['healthtech'] \n",
308
+ "81682 ['fintech'] \n",
309
+ "78926 ['energy' 'environment'] \n",
310
+ "... ... \n",
311
+ "80432 ['healthtech'] \n",
312
+ "77716 ['software' 'data'] \n",
313
+ "78561 ['healthtech'] \n",
314
+ "77554 ['fintech'] \n",
315
+ "80694 ['healthtech'] \n",
316
+ "\n",
317
+ "[94 rows x 9 columns]"
318
+ ]
319
+ },
320
+ "metadata": {},
321
+ "output_type": "display_data"
322
+ }
323
+ ],
324
+ "source": [
325
+ "def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):\n",
326
+ " if filter_type == '==':\n",
327
+ " df_filtered = df[df[column_name]==filter_value]\n",
328
+ " elif filter_type == '>=':\n",
329
+ " df_filtered = df[df[column_name]>=filter_value]\n",
330
+ " elif filter_type == '<=':\n",
331
+ " df_filtered = df[df[column_name]<=filter_value]\n",
332
+ " elif filter_type == 'contains':\n",
333
+ " df_filtered = df[df['target'].str.contains(filter_value)]\n",
334
+ "\n",
335
+ " if df_filtered.size >= minimum_acceptable_size:\n",
336
+ " return df_filtered\n",
337
+ " else:\n",
338
+ " return df\n",
339
+ "\n",
340
+ "#the first module becomes text1, the second module file1\n",
341
+ "def greet(size, target, stage, query): \n",
342
+ " def raised_zero(x):\n",
343
+ " if x == 0:\n",
344
+ " return 'Undisclosed'\n",
345
+ " else:\n",
346
+ " return x\n",
347
+ " df_knn = search(df, query)\n",
348
+ " #we live the sorting for last\n",
349
+ " df_knn = df_knn.sort_values('raised', ascending=False)\n",
350
+ " df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))\n",
351
+ "\n",
352
+ " df_size = filter_df(df_knn, 'size', '==', size, 1000)\n",
353
+ " df_target = filter_df(df_size, 'target', 'contains', target, 20)\n",
354
+ " df_stage = filter_df(df_target, 'stage', '==', stage.lower(), 10)\n",
355
+ " \n",
356
+ " display(df_stage)\n",
357
+ " # df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]\n",
358
+ "\n",
359
+ " return df_stage[0:100]\n",
360
+ "\n",
361
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
362
+ " gr.Markdown(\n",
363
+ " \"\"\"\n",
364
+ " # Startup Search Engine\n",
365
+ " \"\"\"\n",
366
+ " )\n",
367
+ " size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], multiselect=False, value='11-500+', label='size')\n",
368
+ " target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], multiselect=False, value='B2B', label='target')\n",
369
+ " stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'exit'], multiselect=False, value='C', label='stage')\n",
370
+ " # raised = gr.Slider(0, 20, value=5, step_size=1, label=\"Minimum raising (in Millions)\")\n",
371
+ " query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')\n",
372
+ " btn = gr.Button(value=\"Search for a Startup\")\n",
373
+ " output1 = gr.DataFrame(label='value')\n",
374
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
375
+ " btn.click(greet, [size, target, stage, query], [output1])\n",
376
+ "demo.launch(share=False)"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "# Define database of sentences\n",
386
+ "sentences = pd.Series(['The quick brown fox jumps over the lazy dog',\n",
387
+ " 'A quick brown dog jumps over the lazy fox',\n",
388
+ " 'The lazy dog jumps over the quick brown fox',\n",
389
+ " 'The quick brown fox jumps over the lazy cat',\n",
390
+ " 'The quick brown cat jumps over the lazy dog'])\n",
391
+ "\n",
392
+ "# Encode sentences\n",
393
+ "sentence_embeddings = model.encode(sentences)\n",
394
+ "\n",
395
+ "# Define query sentence\n",
396
+ "query = 'A lazy dog jumps over the quick brown fox'\n",
397
+ "\n",
398
+ "# Encode query\n",
399
+ "query_embedding = model.encode(query)\n",
400
+ "\n",
401
+ "# Search for similar sentences\n",
402
+ "cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)\n",
403
+ "most_similar_sentence = sentences[cosine_scores.argmax()]"
404
+ ]
405
+ }
406
+ ],
407
+ "metadata": {
408
+ "kernelspec": {
409
+ "display_name": "Python 3",
410
+ "language": "python",
411
+ "name": "python3"
412
+ },
413
+ "language_info": {
414
+ "codemirror_mode": {
415
+ "name": "ipython",
416
+ "version": 3
417
+ },
418
+ "file_extension": ".py",
419
+ "mimetype": "text/x-python",
420
+ "name": "python",
421
+ "nbconvert_exporter": "python",
422
+ "pygments_lexer": "ipython3",
423
+ "version": "3.9.13"
424
+ },
425
+ "orig_nbformat": 4
426
+ },
427
+ "nbformat": 4,
428
+ "nbformat_minor": 2
429
+ }
app.py CHANGED
@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
7
 
8
  model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
9
 
10
- df = pd.read_parquet('df_encoded.parquet')
11
  df['tags'] = df['tags'].apply(lambda x : str(x))
12
  def parse_raised(x):
13
  if x == 'Undisclosed':
@@ -20,52 +20,61 @@ def parse_raised(x):
20
  elif quantifier == 'M':
21
  return x
22
  df['raised'] = df['raised'].apply(lambda x : parse_raised(x))
 
23
  df = df.reset_index(drop=True)
24
 
25
  from sklearn.neighbors import NearestNeighbors
26
  import pandas as pd
27
  from sentence_transformers import SentenceTransformer
28
 
29
- def filter_df(df, column_name, filter_type, filter_value):
30
- if filter_type == '==':
31
- df_filtered = df[df[column_name]==filter_value]
32
- elif filter_type == '>=':
33
- df_filtered = df[df[column_name]>=filter_value]
34
- elif filter_type == '<=':
35
- df_filtered = df[df[column_name]<=filter_value]
36
- elif filter_type == 'contains':
37
- df_filtered = df[df['target'].str.contains(filter_value)]
38
- return df_filtered
39
 
40
  def search(df, query):
41
  product = model.encode(query).tolist()
42
  # product = df.iloc[0]['text_vector_'] #use one of the products as sample
43
 
44
  #prepare model
45
- nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
46
-
47
  distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object
48
 
49
  #print out the description of every recommended product
50
- return df.iloc[list(indices)[0]][['name', 'description', 'raised', 'year', 'target', 'size', 'stage', 'country', 'source', 'tags']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  #the first module becomes text1, the second module file1
53
- def greet(size, target, raised, query):
54
- df_size = filter_df(df, 'size', '==', size)
55
- df_target = filter_df(df_size, 'target', 'contains', target)
56
  def raised_zero(x):
57
  if x == 0:
58
  return 'Undisclosed'
59
  else:
60
  return x
61
- print('a')
62
- df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]
63
- df_knn = search(df_raised, query)
64
  #we live the sorting for last
65
  df_knn = df_knn.sort_values('raised', ascending=False)
66
  df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))
67
 
68
- return df_knn
 
 
 
 
 
 
 
69
 
70
  with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
71
  gr.Markdown(
@@ -73,12 +82,13 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', n
73
  # Startup Search Engine
74
  """
75
  )
76
- size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+'], multiselect=False, value='11-50', label='size')
77
- target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', multiselect=False, label='target')
78
- raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
 
79
  query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
80
  btn = gr.Button(value="Search for a Startup")
81
  output1 = gr.DataFrame(label='value')
82
  # btn.click(greet, inputs='text', outputs=['dataframe'])
83
- btn.click(greet, [size, target, raised, query], [output1])
84
  demo.launch(share=False)
 
7
 
8
  model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
9
 
10
+ df = pd.read_parquet('df_encoded3.parquet')
11
  df['tags'] = df['tags'].apply(lambda x : str(x))
12
  def parse_raised(x):
13
  if x == 'Undisclosed':
 
20
  elif quantifier == 'M':
21
  return x
22
  df['raised'] = df['raised'].apply(lambda x : parse_raised(x))
23
+ df['stage'] = df['stage'].apply(lambda x : x.lower())
24
  df = df.reset_index(drop=True)
25
 
26
  from sklearn.neighbors import NearestNeighbors
27
  import pandas as pd
28
  from sentence_transformers import SentenceTransformer
29
 
30
+ nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
 
 
 
 
 
 
 
 
 
31
 
32
  def search(df, query):
33
  product = model.encode(query).tolist()
34
  # product = df.iloc[0]['text_vector_'] #use one of the products as sample
35
 
36
  #prepare model
37
+ #
 
38
  distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object
39
 
40
  #print out the description of every recommended product
41
+ return df.iloc[list(indices)[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']]
42
+
43
+ def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
44
+ if filter_type == '==':
45
+ df_filtered = df[df[column_name]==filter_value]
46
+ elif filter_type == '>=':
47
+ df_filtered = df[df[column_name]>=filter_value]
48
+ elif filter_type == '<=':
49
+ df_filtered = df[df[column_name]<=filter_value]
50
+ elif filter_type == 'contains':
51
+ df_filtered = df[df['target'].str.contains(filter_value)]
52
+
53
+ if df_filtered.size >= minimum_acceptable_size:
54
+ return df_filtered
55
+ else:
56
+ return df
57
 
58
  #the first module becomes text1, the second module file1
59
+ def greet(size, target, stage, query):
 
 
60
  def raised_zero(x):
61
  if x == 0:
62
  return 'Undisclosed'
63
  else:
64
  return x
65
+ df_knn = search(df, query)
 
 
66
  #we live the sorting for last
67
  df_knn = df_knn.sort_values('raised', ascending=False)
68
  df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))
69
 
70
+ df_size = filter_df(df_knn, 'size', '==', size, 1000)
71
+ df_target = filter_df(df_size, 'target', 'contains', target, 20)
72
+ df_stage = filter_df(df_target, 'stage', '==', stage.lower(), 10)
73
+
74
+ display(df_stage)
75
+ # df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]
76
+
77
+ return df_stage[0:100]
78
 
79
  with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
80
  gr.Markdown(
 
82
  # Startup Search Engine
83
  """
84
  )
85
+ size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], multiselect=False, value='11-500+', label='size')
86
+ target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], multiselect=False, value='B2B', label='target')
87
+ stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'exit'], multiselect=False, value='C', label='stage')
88
+ # raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
89
  query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
90
  btn = gr.Button(value="Search for a Startup")
91
  output1 = gr.DataFrame(label='value')
92
  # btn.click(greet, inputs='text', outputs=['dataframe'])
93
+ btn.click(greet, [size, target, stage, query], [output1])
94
  demo.launch(share=False)
data_manipulation.ipynb CHANGED
@@ -2,7 +2,49 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 78,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -34,6 +76,8 @@
34
  " <th>stage</th>\n",
35
  " <th>raised</th>\n",
36
  " <th>tags</th>\n",
 
 
37
  " <th>text_vector_</th>\n",
38
  " </tr>\n",
39
  " </thead>\n",
@@ -48,6 +92,8 @@
48
  " <td>Pre-Funding</td>\n",
49
  " <td>Undisclosed</td>\n",
50
  " <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
 
 
51
  " <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
52
  " </tr>\n",
53
  " <tr>\n",
@@ -60,6 +106,8 @@
60
  " <td>Pre-Funding</td>\n",
61
  " <td>Undisclosed</td>\n",
62
  " <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
 
 
63
  " <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
64
  " </tr>\n",
65
  " <tr>\n",
@@ -72,6 +120,8 @@
72
  " <td>Seed</td>\n",
73
  " <td>$120M</td>\n",
74
  " <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
 
 
75
  " <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
76
  " </tr>\n",
77
  " <tr>\n",
@@ -84,6 +134,8 @@
84
  " <td>A</td>\n",
85
  " <td>$25M</td>\n",
86
  " <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
 
 
87
  " <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
88
  " </tr>\n",
89
  " <tr>\n",
@@ -96,6 +148,8 @@
96
  " <td>A</td>\n",
97
  " <td>$16.1M</td>\n",
98
  " <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
 
 
99
  " <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
100
  " </tr>\n",
101
  " <tr>\n",
@@ -109,6 +163,8 @@
109
  " <td>...</td>\n",
110
  " <td>...</td>\n",
111
  " <td>...</td>\n",
 
 
112
  " </tr>\n",
113
  " <tr>\n",
114
  " <th>4981</th>\n",
@@ -120,6 +176,8 @@
120
  " <td>Pre-Funding</td>\n",
121
  " <td>Undisclosed</td>\n",
122
  " <td>[content-creators, e-learning, software-applic...</td>\n",
 
 
123
  " <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
124
  " </tr>\n",
125
  " <tr>\n",
@@ -132,6 +190,8 @@
132
  " <td>Pre-Funding</td>\n",
133
  " <td>Undisclosed</td>\n",
134
  " <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
 
 
135
  " <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
136
  " </tr>\n",
137
  " <tr>\n",
@@ -144,6 +204,8 @@
144
  " <td>Mature</td>\n",
145
  " <td>Undisclosed</td>\n",
146
  " <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
 
 
147
  " <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
148
  " </tr>\n",
149
  " <tr>\n",
@@ -156,6 +218,8 @@
156
  " <td>Pre-Funding</td>\n",
157
  " <td>Undisclosed</td>\n",
158
  " <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
 
 
159
  " <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
160
  " </tr>\n",
161
  " <tr>\n",
@@ -168,11 +232,13 @@
168
  " <td>Seed</td>\n",
169
  " <td>$10M</td>\n",
170
  " <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
 
 
171
  " <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
172
  " </tr>\n",
173
  " </tbody>\n",
174
  "</table>\n",
175
- "<p>4986 rows × 9 columns</p>\n",
176
  "</div>"
177
  ],
178
  "text/plain": [
@@ -202,18 +268,31 @@
202
  "4984 2017.0 B2B, B2C, B2G 11-50 Pre-Funding Undisclosed \n",
203
  "4985 2013.0 B2B 11-50 Seed $10M \n",
204
  "\n",
205
- " tags \\\n",
206
- "0 [connected-vehicles, adas, autonomous-vehicles... \n",
207
- "1 [sdg, schools, pre-k, serious-games, games, mo... \n",
208
- "2 [pharmaceuticals, chronic-disease, immunology,... \n",
209
- "3 [omni-channel, ecommerce, climate-tech, artifi... \n",
210
- "4 [enterprise-solutions, data-protection, cyber-... \n",
211
- "... ... \n",
212
- "4981 [content-creators, e-learning, software-applic... \n",
213
- "4982 [ecommerce, p2p, delivery, online-shopping, ma... \n",
214
- "4983 [crops, agtech, harvesting, machinery, sdg, cl... \n",
215
- "4984 [fitness, digital-wallet, discount, mobile-app... \n",
216
- "4985 [endoscopy, medical-devices, minimally-invasiv... \n",
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  "\n",
218
  " text_vector_ \n",
219
  "0 [-0.031224824488162994, -0.06342269480228424, ... \n",
@@ -228,10 +307,10 @@
228
  "4984 [0.02851911261677742, 0.05474231392145157, -0.... \n",
229
  "4985 [0.012587728910148144, -0.07959864288568497, -... \n",
230
  "\n",
231
- "[4986 rows x 9 columns]"
232
  ]
233
  },
234
- "execution_count": 78,
235
  "metadata": {},
236
  "output_type": "execute_result"
237
  }
@@ -239,8 +318,275 @@
239
  "source": [
240
  "import pandas as pd\n",
241
  "\n",
242
- "df = pd.read_parquet('df_encoded.parquet')\n",
243
- "df"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  ]
245
  },
246
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "array(['Pre-Funding', 'Seed', 'A', 'Mature', 'C', 'Public', 'D',\n",
12
+ " 'Pre-Seed', 'B', 'Debt Financing', 'F', 'Crowdfunding', 'E'],\n",
13
+ " dtype=object)"
14
+ ]
15
+ },
16
+ "execution_count": 4,
17
+ "metadata": {},
18
+ "output_type": "execute_result"
19
+ }
20
+ ],
21
+ "source": [
22
+ "df1.stage.unique()"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 5,
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "data": {
32
+ "text/plain": [
33
+ "array([0., 3., 1., 4., 2., 5.])"
34
+ ]
35
+ },
36
+ "execution_count": 5,
37
+ "metadata": {},
38
+ "output_type": "execute_result"
39
+ }
40
+ ],
41
+ "source": [
42
+ "df2.stage.unique()"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 2,
48
  "metadata": {},
49
  "outputs": [
50
  {
 
76
  " <th>stage</th>\n",
77
  " <th>raised</th>\n",
78
  " <th>tags</th>\n",
79
+ " <th>country</th>\n",
80
+ " <th>source</th>\n",
81
  " <th>text_vector_</th>\n",
82
  " </tr>\n",
83
  " </thead>\n",
 
92
  " <td>Pre-Funding</td>\n",
93
  " <td>Undisclosed</td>\n",
94
  " <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
95
+ " <td>Israel</td>\n",
96
+ " <td>https://finder.startupnationcentral.org/</td>\n",
97
  " <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
98
  " </tr>\n",
99
  " <tr>\n",
 
106
  " <td>Pre-Funding</td>\n",
107
  " <td>Undisclosed</td>\n",
108
  " <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
109
+ " <td>Israel</td>\n",
110
+ " <td>https://finder.startupnationcentral.org/</td>\n",
111
  " <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
112
  " </tr>\n",
113
  " <tr>\n",
 
120
  " <td>Seed</td>\n",
121
  " <td>$120M</td>\n",
122
  " <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
123
+ " <td>Israel</td>\n",
124
+ " <td>https://finder.startupnationcentral.org/</td>\n",
125
  " <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
126
  " </tr>\n",
127
  " <tr>\n",
 
134
  " <td>A</td>\n",
135
  " <td>$25M</td>\n",
136
  " <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
137
+ " <td>Israel</td>\n",
138
+ " <td>https://finder.startupnationcentral.org/</td>\n",
139
  " <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
140
  " </tr>\n",
141
  " <tr>\n",
 
148
  " <td>A</td>\n",
149
  " <td>$16.1M</td>\n",
150
  " <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
151
+ " <td>Israel</td>\n",
152
+ " <td>https://finder.startupnationcentral.org/</td>\n",
153
  " <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
154
  " </tr>\n",
155
  " <tr>\n",
 
163
  " <td>...</td>\n",
164
  " <td>...</td>\n",
165
  " <td>...</td>\n",
166
+ " <td>...</td>\n",
167
+ " <td>...</td>\n",
168
  " </tr>\n",
169
  " <tr>\n",
170
  " <th>4981</th>\n",
 
176
  " <td>Pre-Funding</td>\n",
177
  " <td>Undisclosed</td>\n",
178
  " <td>[content-creators, e-learning, software-applic...</td>\n",
179
+ " <td>Israel</td>\n",
180
+ " <td>https://finder.startupnationcentral.org/</td>\n",
181
  " <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
182
  " </tr>\n",
183
  " <tr>\n",
 
190
  " <td>Pre-Funding</td>\n",
191
  " <td>Undisclosed</td>\n",
192
  " <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
193
+ " <td>Israel</td>\n",
194
+ " <td>https://finder.startupnationcentral.org/</td>\n",
195
  " <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
196
  " </tr>\n",
197
  " <tr>\n",
 
204
  " <td>Mature</td>\n",
205
  " <td>Undisclosed</td>\n",
206
  " <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
207
+ " <td>Israel</td>\n",
208
+ " <td>https://finder.startupnationcentral.org/</td>\n",
209
  " <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
210
  " </tr>\n",
211
  " <tr>\n",
 
218
  " <td>Pre-Funding</td>\n",
219
  " <td>Undisclosed</td>\n",
220
  " <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
221
+ " <td>Israel</td>\n",
222
+ " <td>https://finder.startupnationcentral.org/</td>\n",
223
  " <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
224
  " </tr>\n",
225
  " <tr>\n",
 
232
  " <td>Seed</td>\n",
233
  " <td>$10M</td>\n",
234
  " <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
235
+ " <td>Israel</td>\n",
236
+ " <td>https://finder.startupnationcentral.org/</td>\n",
237
  " <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
238
  " </tr>\n",
239
  " </tbody>\n",
240
  "</table>\n",
241
+ "<p>4986 rows × 11 columns</p>\n",
242
  "</div>"
243
  ],
244
  "text/plain": [
 
268
  "4984 2017.0 B2B, B2C, B2G 11-50 Pre-Funding Undisclosed \n",
269
  "4985 2013.0 B2B 11-50 Seed $10M \n",
270
  "\n",
271
+ " tags country \\\n",
272
+ "0 [connected-vehicles, adas, autonomous-vehicles... Israel \n",
273
+ "1 [sdg, schools, pre-k, serious-games, games, mo... Israel \n",
274
+ "2 [pharmaceuticals, chronic-disease, immunology,... Israel \n",
275
+ "3 [omni-channel, ecommerce, climate-tech, artifi... Israel \n",
276
+ "4 [enterprise-solutions, data-protection, cyber-... Israel \n",
277
+ "... ... ... \n",
278
+ "4981 [content-creators, e-learning, software-applic... Israel \n",
279
+ "4982 [ecommerce, p2p, delivery, online-shopping, ma... Israel \n",
280
+ "4983 [crops, agtech, harvesting, machinery, sdg, cl... Israel \n",
281
+ "4984 [fitness, digital-wallet, discount, mobile-app... Israel \n",
282
+ "4985 [endoscopy, medical-devices, minimally-invasiv... Israel \n",
283
+ "\n",
284
+ " source \\\n",
285
+ "0 https://finder.startupnationcentral.org/ \n",
286
+ "1 https://finder.startupnationcentral.org/ \n",
287
+ "2 https://finder.startupnationcentral.org/ \n",
288
+ "3 https://finder.startupnationcentral.org/ \n",
289
+ "4 https://finder.startupnationcentral.org/ \n",
290
+ "... ... \n",
291
+ "4981 https://finder.startupnationcentral.org/ \n",
292
+ "4982 https://finder.startupnationcentral.org/ \n",
293
+ "4983 https://finder.startupnationcentral.org/ \n",
294
+ "4984 https://finder.startupnationcentral.org/ \n",
295
+ "4985 https://finder.startupnationcentral.org/ \n",
296
  "\n",
297
  " text_vector_ \n",
298
  "0 [-0.031224824488162994, -0.06342269480228424, ... \n",
 
307
  "4984 [0.02851911261677742, 0.05474231392145157, -0.... \n",
308
  "4985 [0.012587728910148144, -0.07959864288568497, -... \n",
309
  "\n",
310
+ "[4986 rows x 11 columns]"
311
  ]
312
  },
313
+ "execution_count": 2,
314
  "metadata": {},
315
  "output_type": "execute_result"
316
  }
 
318
  "source": [
319
  "import pandas as pd\n",
320
  "\n",
321
+ "df1 = pd.read_parquet('df_encoded.parquet')\n",
322
+ "df1"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": 3,
328
+ "metadata": {},
329
+ "outputs": [
330
+ {
331
+ "data": {
332
+ "text/html": [
333
+ "<div>\n",
334
+ "<style scoped>\n",
335
+ " .dataframe tbody tr th:only-of-type {\n",
336
+ " vertical-align: middle;\n",
337
+ " }\n",
338
+ "\n",
339
+ " .dataframe tbody tr th {\n",
340
+ " vertical-align: top;\n",
341
+ " }\n",
342
+ "\n",
343
+ " .dataframe thead th {\n",
344
+ " text-align: right;\n",
345
+ " }\n",
346
+ "</style>\n",
347
+ "<table border=\"1\" class=\"dataframe\">\n",
348
+ " <thead>\n",
349
+ " <tr style=\"text-align: right;\">\n",
350
+ " <th></th>\n",
351
+ " <th>title</th>\n",
352
+ " <th>description</th>\n",
353
+ " <th>stage</th>\n",
354
+ " <th>industry_name</th>\n",
355
+ " <th>url</th>\n",
356
+ " <th>country_slug</th>\n",
357
+ " <th>city_slug</th>\n",
358
+ " <th>location</th>\n",
359
+ " <th>region_name</th>\n",
360
+ " <th>text_vector_</th>\n",
361
+ " </tr>\n",
362
+ " </thead>\n",
363
+ " <tbody>\n",
364
+ " <tr>\n",
365
+ " <th>0</th>\n",
366
+ " <td>Digipal</td>\n",
367
+ " <td>Digipal is a digital consultancy based in Tbil...</td>\n",
368
+ " <td>0.0</td>\n",
369
+ " <td>Software &amp; Data</td>\n",
370
+ " <td>https://www.digipal.agency/</td>\n",
371
+ " <td>georgia</td>\n",
372
+ " <td>tbilisi</td>\n",
373
+ " <td>Tbilisi, Georgia</td>\n",
374
+ " <td>Europe</td>\n",
375
+ " <td>[0.017287444323301315, 0.06208805367350578, -0...</td>\n",
376
+ " </tr>\n",
377
+ " <tr>\n",
378
+ " <th>1</th>\n",
379
+ " <td>BeatBind</td>\n",
380
+ " <td>BeatBind is the industry's long overdue platfo...</td>\n",
381
+ " <td>0.0</td>\n",
382
+ " <td>Social &amp; Leisure</td>\n",
383
+ " <td>https://beatbind.io/</td>\n",
384
+ " <td>georgia</td>\n",
385
+ " <td>tbilisi</td>\n",
386
+ " <td>Tbilisi, Georgia</td>\n",
387
+ " <td>Europe</td>\n",
388
+ " <td>[-0.00438214186578989, -0.051213208585977554, ...</td>\n",
389
+ " </tr>\n",
390
+ " <tr>\n",
391
+ " <th>2</th>\n",
392
+ " <td>Smart Academy</td>\n",
393
+ " <td>Smart Academy is a modern educational institut...</td>\n",
394
+ " <td>0.0</td>\n",
395
+ " <td>Edtech</td>\n",
396
+ " <td>https://smartacademy.ge/</td>\n",
397
+ " <td>georgia</td>\n",
398
+ " <td>tbilisi</td>\n",
399
+ " <td>Tbilisi, Georgia</td>\n",
400
+ " <td>Europe</td>\n",
401
+ " <td>[0.0005468669114634395, -0.05331585183739662, ...</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>3</th>\n",
405
+ " <td>MaxinAI</td>\n",
406
+ " <td>MaxinAI isglobal AI development company that w...</td>\n",
407
+ " <td>0.0</td>\n",
408
+ " <td>Software &amp; Data</td>\n",
409
+ " <td>https://www.maxinai.com/#all-industries</td>\n",
410
+ " <td>georgia</td>\n",
411
+ " <td>tbilisi</td>\n",
412
+ " <td>Tbilisi, Georgia</td>\n",
413
+ " <td>Europe</td>\n",
414
+ " <td>[0.021948501467704773, 0.024166792631149292, -...</td>\n",
415
+ " </tr>\n",
416
+ " <tr>\n",
417
+ " <th>4</th>\n",
418
+ " <td>TLANCER</td>\n",
419
+ " <td>Tlancer aims to create an unlimited educationa...</td>\n",
420
+ " <td>0.0</td>\n",
421
+ " <td>Edtech</td>\n",
422
+ " <td>https://www.tlancer.ge/</td>\n",
423
+ " <td>georgia</td>\n",
424
+ " <td>tbilisi</td>\n",
425
+ " <td>Tbilisi, Georgia</td>\n",
426
+ " <td>Europe</td>\n",
427
+ " <td>[0.02025573141872883, -0.022812215611338615, -...</td>\n",
428
+ " </tr>\n",
429
+ " <tr>\n",
430
+ " <th>...</th>\n",
431
+ " <td>...</td>\n",
432
+ " <td>...</td>\n",
433
+ " <td>...</td>\n",
434
+ " <td>...</td>\n",
435
+ " <td>...</td>\n",
436
+ " <td>...</td>\n",
437
+ " <td>...</td>\n",
438
+ " <td>...</td>\n",
439
+ " <td>...</td>\n",
440
+ " <td>...</td>\n",
441
+ " </tr>\n",
442
+ " <tr>\n",
443
+ " <th>94521</th>\n",
444
+ " <td>OneTwo</td>\n",
445
+ " <td>klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk</td>\n",
446
+ " <td>0.0</td>\n",
447
+ " <td>Software &amp; Data</td>\n",
448
+ " <td>www.nethr</td>\n",
449
+ " <td>croatia</td>\n",
450
+ " <td>zagreb</td>\n",
451
+ " <td>Zagreb, Croatia</td>\n",
452
+ " <td>Europe</td>\n",
453
+ " <td>[0.07235302031040192, -0.05674564838409424, -0...</td>\n",
454
+ " </tr>\n",
455
+ " <tr>\n",
456
+ " <th>94522</th>\n",
457
+ " <td>Trialfire</td>\n",
458
+ " <td>Engaged trialers turn into customers, engaged ...</td>\n",
459
+ " <td>0.0</td>\n",
460
+ " <td>Software &amp; Data</td>\n",
461
+ " <td>http://www.trialfire.com</td>\n",
462
+ " <td>canada</td>\n",
463
+ " <td>toronto</td>\n",
464
+ " <td>Toronto, Canada</td>\n",
465
+ " <td>North America</td>\n",
466
+ " <td>[0.030764097347855568, 0.054082825779914856, -...</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>94523</th>\n",
470
+ " <td>ILLUMAGEAR</td>\n",
471
+ " <td>ILLUMAGEAR’s mission is to illuminate people a...</td>\n",
472
+ " <td>0.0</td>\n",
473
+ " <td>Software &amp; Data</td>\n",
474
+ " <td>http://www.illumagear.com</td>\n",
475
+ " <td>united-states</td>\n",
476
+ " <td>seattle</td>\n",
477
+ " <td>Seattle, United States</td>\n",
478
+ " <td>North America</td>\n",
479
+ " <td>[0.015447210520505905, -0.0984775498509407, 0....</td>\n",
480
+ " </tr>\n",
481
+ " <tr>\n",
482
+ " <th>94524</th>\n",
483
+ " <td>Knowillage</td>\n",
484
+ " <td>Knowillage lets you add personalization to you...</td>\n",
485
+ " <td>0.0</td>\n",
486
+ " <td>Edtech</td>\n",
487
+ " <td>http://www.knowillage.com</td>\n",
488
+ " <td>canada</td>\n",
489
+ " <td>vancouver</td>\n",
490
+ " <td>Vancouver, Canada</td>\n",
491
+ " <td>North America</td>\n",
492
+ " <td>[0.007970919832587242, -0.04347420111298561, -...</td>\n",
493
+ " </tr>\n",
494
+ " <tr>\n",
495
+ " <th>94525</th>\n",
496
+ " <td>Iris Holidays</td>\n",
497
+ " <td>Iris Holidays is a full service Kerala tours o...</td>\n",
498
+ " <td>0.0</td>\n",
499
+ " <td>Software &amp; Data</td>\n",
500
+ " <td>http://www.irisholidays.com</td>\n",
501
+ " <td>india</td>\n",
502
+ " <td>kochi</td>\n",
503
+ " <td>Kochi, India</td>\n",
504
+ " <td>Asia Pacific</td>\n",
505
+ " <td>[0.0032976483926177025, -0.010843133553862572,...</td>\n",
506
+ " </tr>\n",
507
+ " </tbody>\n",
508
+ "</table>\n",
509
+ "<p>94526 rows × 10 columns</p>\n",
510
+ "</div>"
511
+ ],
512
+ "text/plain": [
513
+ " title description \\\n",
514
+ "0 Digipal Digipal is a digital consultancy based in Tbil... \n",
515
+ "1 BeatBind BeatBind is the industry's long overdue platfo... \n",
516
+ "2 Smart Academy Smart Academy is a modern educational institut... \n",
517
+ "3 MaxinAI MaxinAI isglobal AI development company that w... \n",
518
+ "4 TLANCER Tlancer aims to create an unlimited educationa... \n",
519
+ "... ... ... \n",
520
+ "94521 OneTwo klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk \n",
521
+ "94522 Trialfire Engaged trialers turn into customers, engaged ... \n",
522
+ "94523 ILLUMAGEAR ILLUMAGEAR’s mission is to illuminate people a... \n",
523
+ "94524 Knowillage Knowillage lets you add personalization to you... \n",
524
+ "94525 Iris Holidays Iris Holidays is a full service Kerala tours o... \n",
525
+ "\n",
526
+ " stage industry_name url \\\n",
527
+ "0 0.0 Software & Data https://www.digipal.agency/ \n",
528
+ "1 0.0 Social & Leisure https://beatbind.io/ \n",
529
+ "2 0.0 Edtech https://smartacademy.ge/ \n",
530
+ "3 0.0 Software & Data https://www.maxinai.com/#all-industries \n",
531
+ "4 0.0 Edtech https://www.tlancer.ge/ \n",
532
+ "... ... ... ... \n",
533
+ "94521 0.0 Software & Data www.nethr \n",
534
+ "94522 0.0 Software & Data http://www.trialfire.com \n",
535
+ "94523 0.0 Software & Data http://www.illumagear.com \n",
536
+ "94524 0.0 Edtech http://www.knowillage.com \n",
537
+ "94525 0.0 Software & Data http://www.irisholidays.com \n",
538
+ "\n",
539
+ " country_slug city_slug location region_name \\\n",
540
+ "0 georgia tbilisi Tbilisi, Georgia Europe \n",
541
+ "1 georgia tbilisi Tbilisi, Georgia Europe \n",
542
+ "2 georgia tbilisi Tbilisi, Georgia Europe \n",
543
+ "3 georgia tbilisi Tbilisi, Georgia Europe \n",
544
+ "4 georgia tbilisi Tbilisi, Georgia Europe \n",
545
+ "... ... ... ... ... \n",
546
+ "94521 croatia zagreb Zagreb, Croatia Europe \n",
547
+ "94522 canada toronto Toronto, Canada North America \n",
548
+ "94523 united-states seattle Seattle, United States North America \n",
549
+ "94524 canada vancouver Vancouver, Canada North America \n",
550
+ "94525 india kochi Kochi, India Asia Pacific \n",
551
+ "\n",
552
+ " text_vector_ \n",
553
+ "0 [0.017287444323301315, 0.06208805367350578, -0... \n",
554
+ "1 [-0.00438214186578989, -0.051213208585977554, ... \n",
555
+ "2 [0.0005468669114634395, -0.05331585183739662, ... \n",
556
+ "3 [0.021948501467704773, 0.024166792631149292, -... \n",
557
+ "4 [0.02025573141872883, -0.022812215611338615, -... \n",
558
+ "... ... \n",
559
+ "94521 [0.07235302031040192, -0.05674564838409424, -0... \n",
560
+ "94522 [0.030764097347855568, 0.054082825779914856, -... \n",
561
+ "94523 [0.015447210520505905, -0.0984775498509407, 0.... \n",
562
+ "94524 [0.007970919832587242, -0.04347420111298561, -... \n",
563
+ "94525 [0.0032976483926177025, -0.010843133553862572,... \n",
564
+ "\n",
565
+ "[94526 rows x 10 columns]"
566
+ ]
567
+ },
568
+ "execution_count": 3,
569
+ "metadata": {},
570
+ "output_type": "execute_result"
571
+ }
572
+ ],
573
+ "source": [
574
+ "stage_dict = {\n",
575
+ " 0 : \"pre-seed\",\n",
576
+ " 1 : \"seed\",\n",
577
+ " 2 : \"A\",\n",
578
+ " 3 : \"B\",\n",
579
+ " 4 : \"C\",\n",
580
+ " 5 : \"Exit\",\n",
581
+ "}\n",
582
+ "\n",
583
+ "df2 = pd.read_parquet('df_encoded2.parquet')\n",
584
+ "df2.columns = [['name', 'description', 'stage', 'industry_name', 'url', 'country_slug', 'text_vector_']]\n",
585
+ "df2['stage'] = df2['stage'].apply(lambda x : stage_dict[x])\n",
586
+ "df2['raised'] = 'Undisclosed'\n",
587
+ "df2['size'] = '11-500+'\n",
588
+ "df2['source'] = 'https://www.startupblink.com'\n",
589
+ "df2.columns = [['name', 'description', 'stage', 'tags', 'url', 'country_slug', 'text_vector_', 'raised', 'size', 'source']]"
590
  ]
591
  },
592
  {
df_encoded2.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:439b1d44d59383eb4eb7c4626b733b4aca9db3c1a6ecf983ffad1c59eb5fd59b
3
+ size 460066850
df_encoded3.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:724948bf68f31a0c87e397b0d89c95be26dbcd0b769650175a0275d3b22c22e2
3
+ size 483543661