{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "\n",
 "import gradio as gr\n",
 "import pandas as pd\n",
 "from sentence_transformers import SentenceTransformer\n",
 "from sklearn.neighbors import NearestNeighbors\n",
 "\n",
 "# One-time environment setup, if the packages are missing:\n",
 "#   %pip install openpyxl sentence-transformers\n",
 "\n",
 "MODEL_NAME = 'all-mpnet-base-v2'  # alternative checkpoint: 'all-MiniLM-L6-v2'\n",
 "DATA_PATH = 'df_encoded3.parquet'\n",
 "MAX_NEIGHBORS = 5000  # upper bound on the number of rows returned per query\n",
 "RESULT_COLUMNS = ['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']\n",
 "\n",
 "model = SentenceTransformer(MODEL_NAME)\n",
 "\n",
 "\n",
 "def parse_raised(x):\n",
 "    \"\"\"Convert a funding string such as '$500K' or '$3.2M' into millions.\n",
 "\n",
 "    'Undisclosed' maps to 0. Otherwise the first character is dropped as a\n",
 "    currency symbol and a trailing K, M or B (any case) scales the number as\n",
 "    thousands, millions or billions. An amount without one of those suffixes\n",
 "    is read as plain currency units. Thousands separators are ignored.\n",
 "\n",
 "    Raises ValueError (from float) when the numeric part cannot be parsed.\n",
 "    \"\"\"\n",
 "    x = x.strip()\n",
 "    if x == 'Undisclosed':\n",
 "        return 0\n",
 "    x = x.replace(',', '')\n",
 "    quantifier = x[-1].upper()\n",
 "    if quantifier == 'K':\n",
 "        return float(x[1:-1]) / 1000\n",
 "    if quantifier == 'M':\n",
 "        return float(x[1:-1])\n",
 "    if quantifier == 'B':\n",
 "        return float(x[1:-1]) * 1000\n",
 "    # No recognised suffix. This path used to drop the last character and fall\n",
 "    # through without a return, so the value silently became None/NaN.\n",
 "    return float(x[1:]) / 1_000_000\n",
 "\n",
 "\n",
 "df = pd.read_parquet(DATA_PATH)\n",
 "df['tags'] = df['tags'].apply(str)\n",
 "df['raised'] = df['raised'].apply(parse_raised)\n",
 "df['stage'] = df['stage'].str.lower()\n",
 "df = df.reset_index(drop=True)\n",
 "\n",
 "# kneighbors() raises when asked for more neighbours than fitted rows, so the\n",
 "# default is capped at the size of the dataset.\n",
 "nbrs = NearestNeighbors(\n",
 "    n_neighbors=min(MAX_NEIGHBORS, len(df)),\n",
 "    algorithm='ball_tree',\n",
 ").fit(df['text_vector_'].values.tolist())\n",
 "\n",
 "\n",
 "def search(df, query):\n",
 "    \"\"\"Return the RESULT_COLUMNS of the rows of ``df`` closest to ``query``.\n",
 "\n",
 "    ``query`` is embedded with the module-level ``model`` and looked up in the\n",
 "    module-level ``nbrs`` index. The neighbour positions are applied with\n",
 "    ``iloc``, so ``df`` must have the same row order as the frame ``nbrs``\n",
 "    was fitted on.\n",
 "    \"\"\"\n",
 "    query_vector = model.encode(query).tolist()\n",
 "    # Distances are not used, so only the row positions are requested.\n",
 "    indices = nbrs.kneighbors([query_vector], return_distance=False)\n",
 "    return df.iloc[indices[0]][RESULT_COLUMNS]"
 ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': 
False}\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7884\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "\n", " | name | \n", "raised | \n", "target | \n", "size | \n", "stage | \n", "country | \n", "source | \n", "description | \n", "tags | \n", "
---|---|---|---|---|---|---|---|---|---|
78931 | \n", "Developeration | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "sweden | \n", "https://www.startupblink.com | \n", "Developeration AB was founded 2016 and is a st... | \n", "['healthtech'] | \n", "
77566 | \n", "ComplyAdvantage | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "united-kingdom | \n", "https://www.startupblink.com | \n", "We are a financial crime solutions provider co... | \n", "['fintech'] | \n", "
78674 | \n", "Atlas | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "russia | \n", "https://www.startupblink.com | \n", "Atlas Biomedical Holding is developing a netwo... | \n", "['healthtech'] | \n", "
81682 | \n", "48 Factoring Inc | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "united-states | \n", "https://www.startupblink.com | \n", "48 Factoring Inc. is a financial services comp... | \n", "['fintech'] | \n", "
78926 | \n", "Xinca | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "argentina | \n", "https://www.startupblink.com | \n", "Incorporar residuos en la fabricación d... | \n", "['energy' 'environment'] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
80432 | \n", "Glow | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "china | \n", "https://www.startupblink.com | \n", "Glow is an ambitious enterprise that uniquely ... | \n", "['healthtech'] | \n", "
77716 | \n", "Owiwi | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "greece | \n", "https://www.startupblink.com | \n", "Owiwi is a fun and engaging psychometric tool ... | \n", "['software' 'data'] | \n", "
78561 | \n", "Quantib | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "the-netherlands | \n", "https://www.startupblink.com | \n", "MRI scan technology to better diagnose -- and ... | \n", "['healthtech'] | \n", "
77554 | \n", "Earnin | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "united-states | \n", "https://www.startupblink.com | \n", "We're building a platform of community-support... | \n", "['fintech'] | \n", "
80694 | \n", "Vibrent Health | \n", "Undisclosed | \n", "Undisclosed | \n", "11-500+ | \n", "c | \n", "united-states | \n", "https://www.startupblink.com | \n", "The future of developing new cures for patient... | \n", "['healthtech'] | \n", "
94 rows × 9 columns
\n", "