{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "# os.system('pip install openpyxl')\n", "# os.system('pip install sentence-transformers')\n", "import pandas as pd\n", "import gradio as gr\n", "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n", "\n", "df = pd.read_parquet('df_encoded3.parquet')\n", "df['tags'] = df['tags'].apply(lambda x : str(x))\n", "def parse_raised(x):\n", " if x == 'Undisclosed':\n", " return 0\n", " else: \n", " quantifier = x[-1]\n", " x = float(x[1:-1])\n", " if quantifier == 'K':\n", " return x/1000\n", " elif quantifier == 'M':\n", " return x\n", "df['raised'] = df['raised'].apply(lambda x : parse_raised(x))\n", "df['stage'] = df['stage'].apply(lambda x : x.lower())\n", "df = df.reset_index(drop=True)\n", "\n", "from sklearn.neighbors import NearestNeighbors\n", "import pandas as pd\n", "from sentence_transformers import SentenceTransformer\n", "\n", "nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n", "\n", "def search(df, query):\n", " product = model.encode(query).tolist()\n", " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n", "\n", " #prepare model\n", " # \n", " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n", "\n", " #print out the description of every recommended product\n", " return df.iloc[list(indices)[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags']]" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7884\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameraisedtargetsizestagecountrysourcedescriptiontags
78931DeveloperationUndisclosedUndisclosed11-500+cswedenhttps://www.startupblink.comDeveloperation AB was founded 2016 and is a st...['healthtech']
77566ComplyAdvantageUndisclosedUndisclosed11-500+cunited-kingdomhttps://www.startupblink.comWe are a financial crime solutions provider co...['fintech']
78674AtlasUndisclosedUndisclosed11-500+crussiahttps://www.startupblink.comAtlas Biomedical Holding is developing a netwo...['healthtech']
8168248 Factoring IncUndisclosedUndisclosed11-500+cunited-stateshttps://www.startupblink.com48 Factoring Inc. is a financial services comp...['fintech']
78926XincaUndisclosedUndisclosed11-500+cargentinahttps://www.startupblink.comIncorporar residuos en la fabricación d...['energy' 'environment']
..............................
80432GlowUndisclosedUndisclosed11-500+cchinahttps://www.startupblink.comGlow is an ambitious enterprise that uniquely ...['healthtech']
77716OwiwiUndisclosedUndisclosed11-500+cgreecehttps://www.startupblink.comOwiwi is a fun and engaging psychometric tool ...['software' 'data']
78561QuantibUndisclosedUndisclosed11-500+cthe-netherlandshttps://www.startupblink.comMRI scan technology to better diagnose -- and ...['healthtech']
77554EarninUndisclosedUndisclosed11-500+cunited-stateshttps://www.startupblink.comWe're building a platform of community-support...['fintech']
80694Vibrent HealthUndisclosedUndisclosed11-500+cunited-stateshttps://www.startupblink.comThe future of developing new cures for patient...['healthtech']
\n", "

94 rows × 9 columns

\n", "
" ], "text/plain": [ " name raised target size stage \\\n", "78931 Developeration Undisclosed Undisclosed 11-500+ c \n", "77566 ComplyAdvantage Undisclosed Undisclosed 11-500+ c \n", "78674 Atlas Undisclosed Undisclosed 11-500+ c \n", "81682 48 Factoring Inc Undisclosed Undisclosed 11-500+ c \n", "78926 Xinca Undisclosed Undisclosed 11-500+ c \n", "... ... ... ... ... ... \n", "80432 Glow Undisclosed Undisclosed 11-500+ c \n", "77716 Owiwi Undisclosed Undisclosed 11-500+ c \n", "78561 Quantib Undisclosed Undisclosed 11-500+ c \n", "77554 Earnin Undisclosed Undisclosed 11-500+ c \n", "80694 Vibrent Health Undisclosed Undisclosed 11-500+ c \n", "\n", " country source \\\n", "78931 sweden https://www.startupblink.com \n", "77566 united-kingdom https://www.startupblink.com \n", "78674 russia https://www.startupblink.com \n", "81682 united-states https://www.startupblink.com \n", "78926 argentina https://www.startupblink.com \n", "... ... ... \n", "80432 china https://www.startupblink.com \n", "77716 greece https://www.startupblink.com \n", "78561 the-netherlands https://www.startupblink.com \n", "77554 united-states https://www.startupblink.com \n", "80694 united-states https://www.startupblink.com \n", "\n", " description \\\n", "78931 Developeration AB was founded 2016 and is a st... \n", "77566 We are a financial crime solutions provider co... \n", "78674 Atlas Biomedical Holding is developing a netwo... \n", "81682 48 Factoring Inc. is a financial services comp... \n", "78926 Incorporar residuos en la fabricación d... \n", "... ... \n", "80432 Glow is an ambitious enterprise that uniquely ... \n", "77716 Owiwi is a fun and engaging psychometric tool ... \n", "78561 MRI scan technology to better diagnose -- and ... \n", "77554 We're building a platform of community-support... \n", "80694 The future of developing new cures for patient... \n", "\n", " tags \n", "78931 ['healthtech'] \n", "77566 ['fintech'] \n", "78674 ['healthtech'] \n", "81682 ['fintech'] \n", "78926 ['energy' 'environment'] \n", "... ... \n", "80432 ['healthtech'] \n", "77716 ['software' 'data'] \n", "78561 ['healthtech'] \n", "77554 ['fintech'] \n", "80694 ['healthtech'] \n", "\n", "[94 rows x 9 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):\n", " if filter_type == '==':\n", " df_filtered = df[df[column_name]==filter_value]\n", " elif filter_type == '>=':\n", " df_filtered = df[df[column_name]>=filter_value]\n", " elif filter_type == '<=':\n", " df_filtered = df[df[column_name]<=filter_value]\n", " elif filter_type == 'contains':\n", " df_filtered = df[df['target'].str.contains(filter_value)]\n", "\n", " if df_filtered.size >= minimum_acceptable_size:\n", " return df_filtered\n", " else:\n", " return df\n", "\n", "#the first module becomes text1, the second module file1\n", "def greet(size, target, stage, query): \n", " def raised_zero(x):\n", " if x == 0:\n", " return 'Undisclosed'\n", " else:\n", " return x\n", " df_knn = search(df, query)\n", " #we live the sorting for last\n", " df_knn = df_knn.sort_values('raised', ascending=False)\n", " df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))\n", "\n", " df_size = filter_df(df_knn, 'size', '==', size, 1000)\n", " df_target = filter_df(df_size, 'target', 'contains', target, 20)\n", " df_stage = filter_df(df_target, 'stage', '==', stage.lower(), 10)\n", " \n", " display(df_stage)\n", " # df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]\n", "\n", " return df_stage[0:100]\n", "\n", "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n", " gr.Markdown(\n", " \"\"\"\n", " # Startup Search Engine\n", " \"\"\"\n", " )\n", " size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], multiselect=False, value='11-500+', label='size')\n", " target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], multiselect=False, value='B2B', label='target')\n", " stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'exit'], multiselect=False, value='C', label='stage')\n", " # raised = gr.Slider(0, 20, value=5, step_size=1, label=\"Minimum raising (in Millions)\")\n", " query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')\n", " btn = gr.Button(value=\"Search for a Startup\")\n", " output1 = gr.DataFrame(label='value')\n", " # btn.click(greet, inputs='text', outputs=['dataframe'])\n", " btn.click(greet, [size, target, stage, query], [output1])\n", "demo.launch(share=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define database of sentences\n", "sentences = pd.Series(['The quick brown fox jumps over the lazy dog',\n", " 'A quick brown dog jumps over the lazy fox',\n", " 'The lazy dog jumps over the quick brown fox',\n", " 'The quick brown fox jumps over the lazy cat',\n", " 'The quick brown cat jumps over the lazy dog'])\n", "\n", "# Encode sentences\n", "sentence_embeddings = model.encode(sentences)\n", "\n", "# Define query sentence\n", "query = 'A lazy dog jumps over the quick brown fox'\n", "\n", "# Encode query\n", "query_embedding = model.encode(query)\n", "\n", "# Search for similar sentences\n", "cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)\n", "most_similar_sentence = sentences[cosine_scores.argmax()]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }