Spaces:

Michelangiolo
/

startup-finder

Runtime error

App Files Files Community

Michelangiolo committed on Apr 5, 2023

Commit

f2092a2

•

1 Parent(s): 8b29749

v3

Browse files

Files changed (5) hide show

_test.ipynb +429 -0
app.py +35 -25
data_manipulation.ipynb +364 -18
df_encoded2.parquet +3 -0
df_encoded3.parquet +3 -0

_test.ipynb ADDED Viewed

	@@ -0,0 +1,429 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "# os.system('pip install openpyxl')\n",
+    "# os.system('pip install sentence-transformers')\n",
+    "import pandas as pd\n",
+    "import gradio as gr\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
+    "\n",
+    "df = pd.read_parquet('df_encoded3.parquet')\n",
+    "df['tags'] = df['tags'].apply(lambda x : str(x))\n",
+    "def parse_raised(x):\n",
+    "    if x == 'Undisclosed':\n",
+    "        return 0\n",
+    "    else: \n",
+    "        quantifier = x[-1]\n",
+    "        x = float(x[1:-1])\n",
+    "        if quantifier == 'K':\n",
+    "            return x/1000\n",
+    "        elif quantifier == 'M':\n",
+    "            return x\n",
+    "df['raised'] = df['raised'].apply(lambda x : parse_raised(x))\n",
+    "df['stage'] = df['stage'].apply(lambda x : x.lower())\n",
+    "df = df.reset_index(drop=True)\n",
+    "\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
+    "\n",
+    "def search(df, query):\n",
+    "    product = model.encode(query).tolist()\n",
+    "    # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
+    "\n",
+    "    #prepare model\n",
+    "    # \n",
+    "    distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
+    "\n",
+    "    #return the selected columns of every recommended (nearest-neighbour) row\n",
+    "    return df.iloc[list(indices)[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7884\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7884/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>raised</th>\n",
+       "      <th>target</th>\n",
+       "      <th>size</th>\n",
+       "      <th>stage</th>\n",
+       "      <th>country</th>\n",
+       "      <th>source</th>\n",
+       "      <th>description</th>\n",
+       "      <th>tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>78931</th>\n",
+       "      <td>Developeration</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>sweden</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>Developeration AB was founded 2016 and is a st...</td>\n",
+       "      <td>['healthtech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>77566</th>\n",
+       "      <td>ComplyAdvantage</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>united-kingdom</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>We are a financial crime solutions provider co...</td>\n",
+       "      <td>['fintech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>78674</th>\n",
+       "      <td>Atlas</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>russia</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>Atlas Biomedical Holding is developing a netwo...</td>\n",
+       "      <td>['healthtech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>81682</th>\n",
+       "      <td>48 Factoring Inc</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>united-states</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>48 Factoring Inc. is a financial services comp...</td>\n",
+       "      <td>['fintech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>78926</th>\n",
+       "      <td>Xinca</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>argentina</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>Incorporar residuos en la fabricaci&amp;oacute;n d...</td>\n",
+       "      <td>['energy' 'environment']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>80432</th>\n",
+       "      <td>Glow</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>china</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>Glow is an ambitious enterprise that uniquely ...</td>\n",
+       "      <td>['healthtech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>77716</th>\n",
+       "      <td>Owiwi</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>greece</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>Owiwi is a fun and engaging psychometric tool ...</td>\n",
+       "      <td>['software' 'data']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>78561</th>\n",
+       "      <td>Quantib</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>the-netherlands</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>MRI scan technology to better diagnose -- and ...</td>\n",
+       "      <td>['healthtech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>77554</th>\n",
+       "      <td>Earnin</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>united-states</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>We're building a platform of community-support...</td>\n",
+       "      <td>['fintech']</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>80694</th>\n",
+       "      <td>Vibrent Health</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>Undisclosed</td>\n",
+       "      <td>11-500+</td>\n",
+       "      <td>c</td>\n",
+       "      <td>united-states</td>\n",
+       "      <td>https://www.startupblink.com</td>\n",
+       "      <td>The future of developing new cures for patient...</td>\n",
+       "      <td>['healthtech']</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>94 rows × 9 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   name       raised       target     size stage  \\\n",
+       "78931    Developeration  Undisclosed  Undisclosed  11-500+     c   \n",
+       "77566   ComplyAdvantage  Undisclosed  Undisclosed  11-500+     c   \n",
+       "78674             Atlas  Undisclosed  Undisclosed  11-500+     c   \n",
+       "81682  48 Factoring Inc  Undisclosed  Undisclosed  11-500+     c   \n",
+       "78926             Xinca  Undisclosed  Undisclosed  11-500+     c   \n",
+       "...                 ...          ...          ...      ...   ...   \n",
+       "80432              Glow  Undisclosed  Undisclosed  11-500+     c   \n",
+       "77716             Owiwi  Undisclosed  Undisclosed  11-500+     c   \n",
+       "78561           Quantib  Undisclosed  Undisclosed  11-500+     c   \n",
+       "77554            Earnin  Undisclosed  Undisclosed  11-500+     c   \n",
+       "80694    Vibrent Health  Undisclosed  Undisclosed  11-500+     c   \n",
+       "\n",
+       "               country                        source  \\\n",
+       "78931           sweden  https://www.startupblink.com   \n",
+       "77566   united-kingdom  https://www.startupblink.com   \n",
+       "78674           russia  https://www.startupblink.com   \n",
+       "81682    united-states  https://www.startupblink.com   \n",
+       "78926        argentina  https://www.startupblink.com   \n",
+       "...                ...                           ...   \n",
+       "80432            china  https://www.startupblink.com   \n",
+       "77716           greece  https://www.startupblink.com   \n",
+       "78561  the-netherlands  https://www.startupblink.com   \n",
+       "77554    united-states  https://www.startupblink.com   \n",
+       "80694    united-states  https://www.startupblink.com   \n",
+       "\n",
+       "                                             description  \\\n",
+       "78931  Developeration AB was founded 2016 and is a st...   \n",
+       "77566  We are a financial crime solutions provider co...   \n",
+       "78674  Atlas Biomedical Holding is developing a netwo...   \n",
+       "81682  48 Factoring Inc. is a financial services comp...   \n",
+       "78926  Incorporar residuos en la fabricaci&oacute;n d...   \n",
+       "...                                                  ...   \n",
+       "80432  Glow is an ambitious enterprise that uniquely ...   \n",
+       "77716  Owiwi is a fun and engaging psychometric tool ...   \n",
+       "78561  MRI scan technology to better diagnose -- and ...   \n",
+       "77554  We're building a platform of community-support...   \n",
+       "80694  The future of developing new cures for patient...   \n",
+       "\n",
+       "                           tags  \n",
+       "78931            ['healthtech']  \n",
+       "77566               ['fintech']  \n",
+       "78674            ['healthtech']  \n",
+       "81682               ['fintech']  \n",
+       "78926  ['energy' 'environment']  \n",
+       "...                         ...  \n",
+       "80432            ['healthtech']  \n",
+       "77716       ['software' 'data']  \n",
+       "78561            ['healthtech']  \n",
+       "77554               ['fintech']  \n",
+       "80694            ['healthtech']  \n",
+       "\n",
+       "[94 rows x 9 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):\n",
+    "    if filter_type == '==':\n",
+    "        df_filtered = df[df[column_name]==filter_value]\n",
+    "    elif filter_type == '>=':\n",
+    "        df_filtered = df[df[column_name]>=filter_value]\n",
+    "    elif filter_type == '<=':\n",
+    "        df_filtered = df[df[column_name]<=filter_value]\n",
+    "    elif filter_type == 'contains':\n",
+    "        df_filtered = df[df['target'].str.contains(filter_value)]\n",
+    "\n",
+    "    if df_filtered.size >= minimum_acceptable_size:\n",
+    "        return df_filtered\n",
+    "    else:\n",
+    "        return df\n",
+    "\n",
+    "#the first module becomes text1, the second module file1\n",
+    "def greet(size, target, stage, query): \n",
+    "    def raised_zero(x):\n",
+    "        if x == 0:\n",
+    "            return 'Undisclosed'\n",
+    "        else:\n",
+    "            return x\n",
+    "    df_knn = search(df, query)\n",
+    "    #we leave the sorting for last\n",
+    "    df_knn = df_knn.sort_values('raised', ascending=False)\n",
+    "    df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))\n",
+    "\n",
+    "    df_size = filter_df(df_knn, 'size', '==', size, 1000)\n",
+    "    df_target = filter_df(df_size, 'target', 'contains', target, 20)\n",
+    "    df_stage = filter_df(df_target, 'stage', '==', stage.lower(), 10)\n",
+    "    \n",
+    "    display(df_stage)\n",
+    "    # df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]\n",
+    "\n",
+    "    return df_stage[0:100]\n",
+    "\n",
+    "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
+    "    gr.Markdown(\n",
+    "    \"\"\"\n",
+    "    # Startup Search Engine\n",
+    "    \"\"\"\n",
+    "    )\n",
+    "    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], multiselect=False, value='11-500+', label='size')\n",
+    "    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], multiselect=False, value='B2B', label='target')\n",
+    "    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'exit'], multiselect=False, value='C', label='stage')\n",
+    "    # raised = gr.Slider(0, 20, value=5, step_size=1, label=\"Minimum raising (in Millions)\")\n",
+    "    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')\n",
+    "    btn = gr.Button(value=\"Search for a Startup\")\n",
+    "    output1 = gr.DataFrame(label='value')\n",
+    "    # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
+    "    btn.click(greet, [size, target, stage, query], [output1])\n",
+    "demo.launch(share=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define database of sentences\n",
+    "sentences = pd.Series(['The quick brown fox jumps over the lazy dog',\n",
+    "                       'A quick brown dog jumps over the lazy fox',\n",
+    "                       'The lazy dog jumps over the quick brown fox',\n",
+    "                       'The quick brown fox jumps over the lazy cat',\n",
+    "                       'The quick brown cat jumps over the lazy dog'])\n",
+    "\n",
+    "# Encode sentences\n",
+    "sentence_embeddings = model.encode(sentences)\n",
+    "\n",
+    "# Define query sentence\n",
+    "query = 'A lazy dog jumps over the quick brown fox'\n",
+    "\n",
+    "# Encode query\n",
+    "query_embedding = model.encode(query)\n",
+    "\n",
+    "# Search for similar sentences\n",
+    "cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)\n",
+    "most_similar_sentence = sentences[cosine_scores.argmax()]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
 model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
-df = pd.read_parquet('df_encoded.parquet')
 df['tags'] = df['tags'].apply(lambda x : str(x))
 def parse_raised(x):
     if x == 'Undisclosed':
@@ -20,52 +20,61 @@ def parse_raised(x):
         elif quantifier == 'M':
             return x
 df['raised'] = df['raised'].apply(lambda x : parse_raised(x))
 df = df.reset_index(drop=True)
 from sklearn.neighbors import NearestNeighbors
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-def filter_df(df, column_name, filter_type, filter_value):
-    if filter_type == '==':
-        df_filtered = df[df[column_name]==filter_value]
-    elif filter_type == '>=':
-        df_filtered = df[df[column_name]>=filter_value]
-    elif filter_type == '<=':
-        df_filtered = df[df[column_name]<=filter_value]
-    elif filter_type == 'contains':
-        df_filtered = df[df['target'].str.contains(filter_value)]
-    return df_filtered
 def search(df, query):
     product = model.encode(query).tolist()
     # product = df.iloc[0]['text_vector_'] #use one of the products as sample
     #prepare model
-    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
     distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object
     #print out the description of every recommended product
-    return df.iloc[list(indices)[0]][['name', 'description', 'raised', 'year', 'target', 'size', 'stage', 'country', 'source', 'tags']]
 #the first module becomes text1, the second module file1
-def greet(size, target, raised, query):
-    df_size = filter_df(df, 'size', '==', size)
-    df_target = filter_df(df_size, 'target', 'contains', target)
     def raised_zero(x):
         if x == 0:
             return 'Undisclosed'
         else:
             return x
-    print('a')
-    df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]
-    df_knn = search(df_raised, query)
     #we live the sorting for last
     df_knn = df_knn.sort_values('raised', ascending=False)
     df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))
-    return df_knn
 with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
     gr.Markdown(
@@ -73,12 +82,13 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', n
     # Startup Search Engine
     """
     )
-    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+'], multiselect=False, value='11-50', label='size')
-    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', multiselect=False, label='target')
-    raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
     query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
     btn = gr.Button(value="Search for a Startup")
     output1 = gr.DataFrame(label='value')
     # btn.click(greet, inputs='text', outputs=['dataframe'])
-    btn.click(greet, [size, target, raised, query], [output1])
 demo.launch(share=False)

 model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
+df = pd.read_parquet('df_encoded3.parquet')
 df['tags'] = df['tags'].apply(lambda x : str(x))
 def parse_raised(x):
     if x == 'Undisclosed':
         elif quantifier == 'M':
             return x
 df['raised'] = df['raised'].apply(lambda x : parse_raised(x))
+df['stage'] = df['stage'].apply(lambda x : x.lower())
 df = df.reset_index(drop=True)
 from sklearn.neighbors import NearestNeighbors
 import pandas as pd
 from sentence_transformers import SentenceTransformer
+nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
 def search(df, query):
     product = model.encode(query).tolist()
     # product = df.iloc[0]['text_vector_'] #use one of the products as sample
     #prepare model
+    #
     distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object
     #return the selected columns of every recommended (nearest-neighbour) row
+    return df.iloc[list(indices)[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']]
+def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
+    if filter_type == '==':
+        df_filtered = df[df[column_name]==filter_value]
+    elif filter_type == '>=':
+        df_filtered = df[df[column_name]>=filter_value]
+    elif filter_type == '<=':
+        df_filtered = df[df[column_name]<=filter_value]
+    elif filter_type == 'contains':
+        df_filtered = df[df['target'].str.contains(filter_value)]
+    if df_filtered.size >= minimum_acceptable_size:
+        return df_filtered
+    else:
+        return df
 #the first module becomes text1, the second module file1
+def greet(size, target, stage, query):
     def raised_zero(x):
         if x == 0:
             return 'Undisclosed'
         else:
             return x
+    df_knn = search(df, query)
     #we leave the sorting for last
     df_knn = df_knn.sort_values('raised', ascending=False)
     df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))
+    df_size = filter_df(df_knn, 'size', '==', size, 1000)
+    df_target = filter_df(df_size, 'target', 'contains', target, 20)
+    df_stage = filter_df(df_target, 'stage', '==', stage.lower(), 10)
+    display(df_stage)
+    # df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]
+    return df_stage[0:100]
 with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
     gr.Markdown(
     # Startup Search Engine
     """
     )
+    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], multiselect=False, value='11-500+', label='size')
+    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], multiselect=False, value='B2B', label='target')
+    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'exit'], multiselect=False, value='C', label='stage')
+    # raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
     query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
     btn = gr.Button(value="Search for a Startup")
     output1 = gr.DataFrame(label='value')
     # btn.click(greet, inputs='text', outputs=['dataframe'])
+    btn.click(greet, [size, target, stage, query], [output1])
 demo.launch(share=False)

data_manipulation.ipynb CHANGED Viewed

@@ -2,7 +2,49 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 78,
    "metadata": {},
    "outputs": [
     {
@@ -34,6 +76,8 @@
        "      <th>stage</th>\n",
        "      <th>raised</th>\n",
        "      <th>tags</th>\n",
        "      <th>text_vector_</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -48,6 +92,8 @@
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
        "      <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -60,6 +106,8 @@
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
        "      <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -72,6 +120,8 @@
        "      <td>Seed</td>\n",
        "      <td>$120M</td>\n",
        "      <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
        "      <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -84,6 +134,8 @@
        "      <td>A</td>\n",
        "      <td>$25M</td>\n",
        "      <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
        "      <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -96,6 +148,8 @@
        "      <td>A</td>\n",
        "      <td>$16.1M</td>\n",
        "      <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
        "      <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -109,6 +163,8 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4981</th>\n",
@@ -120,6 +176,8 @@
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[content-creators, e-learning, software-applic...</td>\n",
        "      <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -132,6 +190,8 @@
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
        "      <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -144,6 +204,8 @@
        "      <td>Mature</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
        "      <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -156,6 +218,8 @@
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
        "      <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -168,11 +232,13 @@
        "      <td>Seed</td>\n",
        "      <td>$10M</td>\n",
        "      <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
        "      <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>4986 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
@@ -202,18 +268,31 @@
        "4984  2017.0  B2B, B2C, B2G   11-50  Pre-Funding  Undisclosed   \n",
        "4985  2013.0            B2B   11-50         Seed         $10M   \n",
        "\n",
-       "                                                   tags  \\\n",
-       "0     [connected-vehicles, adas, autonomous-vehicles...   \n",
-       "1     [sdg, schools, pre-k, serious-games, games, mo...   \n",
-       "2     [pharmaceuticals, chronic-disease, immunology,...   \n",
-       "3     [omni-channel, ecommerce, climate-tech, artifi...   \n",
-       "4     [enterprise-solutions, data-protection, cyber-...   \n",
-       "...                                                 ...   \n",
-       "4981  [content-creators, e-learning, software-applic...   \n",
-       "4982  [ecommerce, p2p, delivery, online-shopping, ma...   \n",
-       "4983  [crops, agtech, harvesting, machinery, sdg, cl...   \n",
-       "4984  [fitness, digital-wallet, discount, mobile-app...   \n",
-       "4985  [endoscopy, medical-devices, minimally-invasiv...   \n",
        "\n",
        "                                           text_vector_  \n",
        "0     [-0.031224824488162994, -0.06342269480228424, ...  \n",
@@ -228,10 +307,10 @@
        "4984  [0.02851911261677742, 0.05474231392145157, -0....  \n",
        "4985  [0.012587728910148144, -0.07959864288568497, -...  \n",
        "\n",
-       "[4986 rows x 9 columns]"
       ]
      },
-     "execution_count": 78,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -239,8 +318,275 @@
    "source": [
     "import pandas as pd\n",
     "\n",
-    "df = pd.read_parquet('df_encoded.parquet')\n",
-    "df"
    ]
   },
   {

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Pre-Funding', 'Seed', 'A', 'Mature', 'C', 'Public', 'D',\n",
+       "       'Pre-Seed', 'B', 'Debt Financing', 'F', 'Crowdfunding', 'E'],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Distinct funding-stage labels present in df1\n",
+    "df1['stage'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0., 3., 1., 4., 2., 5.])"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Distinct values of the stage column in df2\n",
+    "df2['stage'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
        "      <th>stage</th>\n",
        "      <th>raised</th>\n",
        "      <th>tags</th>\n",
+       "      <th>country</th>\n",
+       "      <th>source</th>\n",
        "      <th>text_vector_</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Seed</td>\n",
        "      <td>$120M</td>\n",
        "      <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>A</td>\n",
        "      <td>$25M</td>\n",
        "      <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>A</td>\n",
        "      <td>$16.1M</td>\n",
        "      <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4981</th>\n",
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[content-creators, e-learning, software-applic...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Mature</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Pre-Funding</td>\n",
        "      <td>Undisclosed</td>\n",
        "      <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td>Seed</td>\n",
        "      <td>$10M</td>\n",
        "      <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
+       "      <td>Israel</td>\n",
+       "      <td>https://finder.startupnationcentral.org/</td>\n",
        "      <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>4986 rows × 11 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "4984  2017.0  B2B, B2C, B2G   11-50  Pre-Funding  Undisclosed   \n",
        "4985  2013.0            B2B   11-50         Seed         $10M   \n",
        "\n",
+       "                                                   tags country  \\\n",
+       "0     [connected-vehicles, adas, autonomous-vehicles...  Israel   \n",
+       "1     [sdg, schools, pre-k, serious-games, games, mo...  Israel   \n",
+       "2     [pharmaceuticals, chronic-disease, immunology,...  Israel   \n",
+       "3     [omni-channel, ecommerce, climate-tech, artifi...  Israel   \n",
+       "4     [enterprise-solutions, data-protection, cyber-...  Israel   \n",
+       "...                                                 ...     ...   \n",
+       "4981  [content-creators, e-learning, software-applic...  Israel   \n",
+       "4982  [ecommerce, p2p, delivery, online-shopping, ma...  Israel   \n",
+       "4983  [crops, agtech, harvesting, machinery, sdg, cl...  Israel   \n",
+       "4984  [fitness, digital-wallet, discount, mobile-app...  Israel   \n",
+       "4985  [endoscopy, medical-devices, minimally-invasiv...  Israel   \n",
+       "\n",
+       "                                        source  \\\n",
+       "0     https://finder.startupnationcentral.org/   \n",
+       "1     https://finder.startupnationcentral.org/   \n",
+       "2     https://finder.startupnationcentral.org/   \n",
+       "3     https://finder.startupnationcentral.org/   \n",
+       "4     https://finder.startupnationcentral.org/   \n",
+       "...                                        ...   \n",
+       "4981  https://finder.startupnationcentral.org/   \n",
+       "4982  https://finder.startupnationcentral.org/   \n",
+       "4983  https://finder.startupnationcentral.org/   \n",
+       "4984  https://finder.startupnationcentral.org/   \n",
+       "4985  https://finder.startupnationcentral.org/   \n",
        "\n",
        "                                           text_vector_  \n",
        "0     [-0.031224824488162994, -0.06342269480228424, ...  \n",
        "4984  [0.02851911261677742, 0.05474231392145157, -0....  \n",
        "4985  [0.012587728910148144, -0.07959864288568497, -...  \n",
        "\n",
+       "[4986 rows x 11 columns]"
       ]
      },
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    "source": [
     "import pandas as pd\n",
     "\n",
+    "# Load the first encoded dataset and display it\n",
+    "df1 = pd.read_parquet(path='df_encoded.parquet')\n",
+    "df1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>description</th>\n",
+       "      <th>stage</th>\n",
+       "      <th>industry_name</th>\n",
+       "      <th>url</th>\n",
+       "      <th>country_slug</th>\n",
+       "      <th>city_slug</th>\n",
+       "      <th>location</th>\n",
+       "      <th>region_name</th>\n",
+       "      <th>text_vector_</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Digipal</td>\n",
+       "      <td>Digipal is a digital consultancy based in Tbil...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>https://www.digipal.agency/</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>tbilisi</td>\n",
+       "      <td>Tbilisi, Georgia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[0.017287444323301315, 0.06208805367350578, -0...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>BeatBind</td>\n",
+       "      <td>BeatBind is the industry's long overdue platfo...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Social &amp; Leisure</td>\n",
+       "      <td>https://beatbind.io/</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>tbilisi</td>\n",
+       "      <td>Tbilisi, Georgia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[-0.00438214186578989, -0.051213208585977554, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Smart Academy</td>\n",
+       "      <td>Smart Academy is a modern educational institut...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Edtech</td>\n",
+       "      <td>https://smartacademy.ge/</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>tbilisi</td>\n",
+       "      <td>Tbilisi, Georgia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[0.0005468669114634395, -0.05331585183739662, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>MaxinAI</td>\n",
+       "      <td>MaxinAI isglobal AI development company that w...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>https://www.maxinai.com/#all-industries</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>tbilisi</td>\n",
+       "      <td>Tbilisi, Georgia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[0.021948501467704773, 0.024166792631149292, -...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>TLANCER</td>\n",
+       "      <td>Tlancer aims to create an unlimited educationa...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Edtech</td>\n",
+       "      <td>https://www.tlancer.ge/</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>tbilisi</td>\n",
+       "      <td>Tbilisi, Georgia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[0.02025573141872883, -0.022812215611338615, -...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94521</th>\n",
+       "      <td>OneTwo</td>\n",
+       "      <td>klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>www.nethr</td>\n",
+       "      <td>croatia</td>\n",
+       "      <td>zagreb</td>\n",
+       "      <td>Zagreb, Croatia</td>\n",
+       "      <td>Europe</td>\n",
+       "      <td>[0.07235302031040192, -0.05674564838409424, -0...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94522</th>\n",
+       "      <td>Trialfire</td>\n",
+       "      <td>Engaged trialers turn into customers, engaged ...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>http://www.trialfire.com</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>toronto</td>\n",
+       "      <td>Toronto, Canada</td>\n",
+       "      <td>North America</td>\n",
+       "      <td>[0.030764097347855568, 0.054082825779914856, -...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94523</th>\n",
+       "      <td>ILLUMAGEAR</td>\n",
+       "      <td>ILLUMAGEAR’s mission is to illuminate people a...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>http://www.illumagear.com</td>\n",
+       "      <td>united-states</td>\n",
+       "      <td>seattle</td>\n",
+       "      <td>Seattle, United States</td>\n",
+       "      <td>North America</td>\n",
+       "      <td>[0.015447210520505905, -0.0984775498509407, 0....</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94524</th>\n",
+       "      <td>Knowillage</td>\n",
+       "      <td>Knowillage lets you add personalization to you...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Edtech</td>\n",
+       "      <td>http://www.knowillage.com</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>vancouver</td>\n",
+       "      <td>Vancouver, Canada</td>\n",
+       "      <td>North America</td>\n",
+       "      <td>[0.007970919832587242, -0.04347420111298561, -...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94525</th>\n",
+       "      <td>Iris Holidays</td>\n",
+       "      <td>Iris Holidays is a full service Kerala tours o...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>Software &amp; Data</td>\n",
+       "      <td>http://www.irisholidays.com</td>\n",
+       "      <td>india</td>\n",
+       "      <td>kochi</td>\n",
+       "      <td>Kochi, India</td>\n",
+       "      <td>Asia Pacific</td>\n",
+       "      <td>[0.0032976483926177025, -0.010843133553862572,...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>94526 rows × 10 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                title                                        description  \\\n",
+       "0            Digipal   Digipal is a digital consultancy based in Tbil...   \n",
+       "1            BeatBind  BeatBind is the industry's long overdue platfo...   \n",
+       "2       Smart Academy  Smart Academy is a modern educational institut...   \n",
+       "3             MaxinAI  MaxinAI isglobal AI development company that w...   \n",
+       "4             TLANCER  Tlancer aims to create an unlimited educationa...   \n",
+       "...               ...                                                ...   \n",
+       "94521          OneTwo       klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk   \n",
+       "94522       Trialfire  Engaged trialers turn into customers, engaged ...   \n",
+       "94523      ILLUMAGEAR  ILLUMAGEAR’s mission is to illuminate people a...   \n",
+       "94524      Knowillage  Knowillage lets you add personalization to you...   \n",
+       "94525  Iris Holidays   Iris Holidays is a full service Kerala tours o...   \n",
+       "\n",
+       "       stage     industry_name                                      url  \\\n",
+       "0        0.0   Software & Data              https://www.digipal.agency/   \n",
+       "1        0.0  Social & Leisure                     https://beatbind.io/   \n",
+       "2        0.0            Edtech                 https://smartacademy.ge/   \n",
+       "3        0.0   Software & Data  https://www.maxinai.com/#all-industries   \n",
+       "4        0.0            Edtech                  https://www.tlancer.ge/   \n",
+       "...      ...               ...                                      ...   \n",
+       "94521    0.0   Software & Data                                www.nethr   \n",
+       "94522    0.0   Software & Data                 http://www.trialfire.com   \n",
+       "94523    0.0   Software & Data                http://www.illumagear.com   \n",
+       "94524    0.0            Edtech                http://www.knowillage.com   \n",
+       "94525    0.0   Software & Data              http://www.irisholidays.com   \n",
+       "\n",
+       "        country_slug  city_slug                location    region_name  \\\n",
+       "0            georgia    tbilisi        Tbilisi, Georgia         Europe   \n",
+       "1            georgia    tbilisi        Tbilisi, Georgia         Europe   \n",
+       "2            georgia    tbilisi        Tbilisi, Georgia         Europe   \n",
+       "3            georgia    tbilisi        Tbilisi, Georgia         Europe   \n",
+       "4            georgia    tbilisi        Tbilisi, Georgia         Europe   \n",
+       "...              ...        ...                     ...            ...   \n",
+       "94521        croatia     zagreb         Zagreb, Croatia         Europe   \n",
+       "94522         canada    toronto         Toronto, Canada  North America   \n",
+       "94523  united-states    seattle  Seattle, United States  North America   \n",
+       "94524         canada  vancouver       Vancouver, Canada  North America   \n",
+       "94525          india      kochi            Kochi, India   Asia Pacific   \n",
+       "\n",
+       "                                            text_vector_  \n",
+       "0      [0.017287444323301315, 0.06208805367350578, -0...  \n",
+       "1      [-0.00438214186578989, -0.051213208585977554, ...  \n",
+       "2      [0.0005468669114634395, -0.05331585183739662, ...  \n",
+       "3      [0.021948501467704773, 0.024166792631149292, -...  \n",
+       "4      [0.02025573141872883, -0.022812215611338615, -...  \n",
+       "...                                                  ...  \n",
+       "94521  [0.07235302031040192, -0.05674564838409424, -0...  \n",
+       "94522  [0.030764097347855568, 0.054082825779914856, -...  \n",
+       "94523  [0.015447210520505905, -0.0984775498509407, 0....  \n",
+       "94524  [0.007970919832587242, -0.04347420111298561, -...  \n",
+       "94525  [0.0032976483926177025, -0.010843133553862572,...  \n",
+       "\n",
+       "[94526 rows x 10 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Readable labels for the numeric stage codes stored in df_encoded2.parquet.\n",
+    "# The codes are floats (0.0 ... 5.0); they hash equal to these int keys, so lookups work.\n",
+    "stage_dict = {\n",
+    "    0 : \"pre-seed\",\n",
+    "    1 : \"seed\",\n",
+    "    2 : \"A\",\n",
+    "    3 : \"B\",\n",
+    "    4 : \"C\",\n",
+    "    5 : \"Exit\",\n",
+    "}\n",
+    "\n",
+    "df2 = pd.read_parquet('df_encoded2.parquet')\n",
+    "# The file holds 10 columns (title, description, stage, industry_name, url, country_slug,\n",
+    "# city_slug, location, region_name, text_vector_). Assigning a 7-name list to .columns\n",
+    "# raises a length-mismatch error, and a nested list ([[...]]) builds a MultiIndex, so the\n",
+    "# needed columns are selected and renamed explicitly instead.\n",
+    "df2 = df2[['title', 'description', 'stage', 'industry_name', 'url', 'country_slug', 'text_vector_']]\n",
+    "df2 = df2.rename(columns={'title': 'name', 'industry_name': 'tags'})\n",
+    "# Strict lookup: an unknown stage code raises KeyError rather than passing through silently.\n",
+    "df2['stage'] = df2['stage'].apply(lambda x : stage_dict[x])\n",
+    "# Constant values for columns that are absent from this file.\n",
+    "df2['raised'] = 'Undisclosed'\n",
+    "df2['size'] = '11-500+'\n",
+    "df2['source'] = 'https://www.startupblink.com'"
    ]
   },
   {

df_encoded2.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:439b1d44d59383eb4eb7c4626b733b4aca9db3c1a6ecf983ffad1c59eb5fd59b
+size 460066850

df_encoded3.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:724948bf68f31a0c87e397b0d89c95be26dbcd0b769650175a0275d3b22c22e2
+size 483543661