File size: 3,781 Bytes
4e74356
de3cc49
 
0820f13
4e74356
0820f13
4e74356
0820f13
4e74356
f2092a2
9fa587a
 
 
 
 
 
 
 
 
 
 
 
f2092a2
0820f13
4e74356
0820f13
 
 
4e74356
f2092a2
4e74356
9fa587a
0820f13
 
4e74356
9fa587a
f2092a2
0820f13
4e74356
0820f13
f2092a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e74356
0820f13
f2092a2
9fa587a
 
 
 
 
f2092a2
9fa587a
 
3d4e8d2
 
38b18d6
3d4e8d2
38b18d6
 
3d4e8d2
 
 
39a7cf9
3d4e8d2
f2092a2
f7306fb
f2092a2
 
3a9ba56
3d4e8d2
9fa587a
 
 
 
 
 
 
f2092a2
 
3d4e8d2
f2092a2
9fa587a
 
 
 
f2092a2
9fa587a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import subprocess
import sys

# Install the optional runtime dependencies at start-up. pip is invoked through
# the interpreter that is running this script so the packages land in the same
# environment that will import them; a bare `pip` on PATH may belong to a
# different Python. Failures are not fatal here (check=False), matching the
# previous best-effort behaviour: a missing package surfaces at import time.
for _package in ('openpyxl', 'sentence-transformers'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', _package], check=False)
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode search queries.
# Alternative checkpoint that was tried: all-MiniLM-L6-v2
model = SentenceTransformer('all-mpnet-base-v2')

# Load the pre-encoded dataset and store every tags entry as its str() form.
df = pd.read_parquet('df_encoded3.parquet')
df['tags'] = df['tags'].map(str)
def parse_raised(x):
    """Convert a raised-amount label into a number expressed in millions.

    Args:
        x: Either the literal ``'Undisclosed'`` or a string whose first
            character is dropped (presumably a currency symbol - confirm
            against the dataset), followed by a number and a one-letter
            magnitude suffix, e.g. ``'$500K'``, ``'$2.5M'``, ``'$1.2B'``.

    Returns:
        ``0`` for ``'Undisclosed'``; otherwise the amount in millions as a
        float (``K`` is divided by 1000, ``M`` is taken as-is, ``B`` is
        multiplied by 1000). ``None`` when the suffix is not recognised.

    Raises:
        ValueError: If the text between the first and last character is not
            a valid float.
    """
    if x == 'Undisclosed':
        return 0

    quantifier = x[-1]
    amount = float(x[1:-1])
    if quantifier == 'K':
        return amount / 1000
    if quantifier == 'M':
        return amount
    if quantifier == 'B':
        # Billions were previously dropped (returned None); express them in
        # millions like the other magnitudes.
        return amount * 1000
    # Unknown magnitude: keep signalling "no value" explicitly.
    return None
# Normalise the columns used for filtering: 'raised' becomes the numeric value
# produced by parse_raised, 'stage' is lower-cased, and the index is rebuilt
# as 0..n-1.
df = df.assign(
    raised=df['raised'].apply(parse_raised),
    stage=df['stage'].apply(lambda label: label.lower()),
).reset_index(drop=True)

from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sentence_transformers import SentenceTransformer

# Nearest-neighbour index over the stored embedding vectors.
# A k-neighbours query cannot ask for more neighbours than there are fitted
# samples, so the candidate count is capped at the dataset size; with 5000 or
# more rows this is identical to the previous fixed value.
nbrs = NearestNeighbors(
    n_neighbors=min(5000, len(df)),
    algorithm='ball_tree',
).fit(df['text_vector_'].values.tolist())

def search(df, query):
    """Look up the rows of ``df`` that are nearest neighbours of ``query``.

    The query is encoded with the module-level ``model`` and passed to the
    module-level ``nbrs`` index. The returned positions are used to select
    rows of ``df`` by position, in the order the index reports them.

    Args:
        df: DataFrame whose row positions match the vectors ``nbrs`` was
            fitted on.
        query: Free-text description to search for.

    Returns:
        A DataFrame restricted to the display columns (name, raised, target,
        size, stage, country, source, description, tags).
    """
    display_columns = [
        'name', 'raised', 'target', 'size', 'stage',
        'country', 'source', 'description', 'tags',
    ]

    query_vector = model.encode(query).tolist()

    # kneighbors takes a batch of vectors; there is a single query here, so
    # only the first row of the result is relevant. Distances are not used.
    _, neighbour_positions = nbrs.kneighbors([query_vector])

    matches = df.iloc[neighbour_positions[0]]
    return matches[display_columns]

def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
    """Filter ``df`` on one column, falling back to ``df`` if too little is left.

    Args:
        df: DataFrame to filter.
        column_name: Column the comparison is applied to.
        filter_type: One of ``'=='``, ``'>='``, ``'<='`` or ``'contains'``.
        filter_value: Value compared against the column. For ``'contains'``
            it is handed to ``Series.str.contains`` (interpreted as a regular
            expression by default).
        minimum_acceptable_size: Smallest ``DataFrame.size`` (rows times
            columns) the filtered frame must have to be returned.

    Returns:
        The filtered DataFrame when its ``size`` is at least
        ``minimum_acceptable_size``; otherwise the original ``df`` unchanged.

    Raises:
        ValueError: If ``filter_type`` is not one of the supported operators.
    """
    column = df[column_name]

    if filter_type == '==':
        mask = column == filter_value
    elif filter_type == '>=':
        mask = column >= filter_value
    elif filter_type == '<=':
        mask = column <= filter_value
    elif filter_type == 'contains':
        # Use the requested column (this branch used to be hard-wired to
        # 'target'); na=False treats missing values as non-matching so the
        # mask stays purely boolean.
        mask = column.str.contains(filter_value, na=False)
    else:
        raise ValueError(f"Unsupported filter_type: {filter_type!r}")

    df_filtered = df[mask]
    if df_filtered.size >= minimum_acceptable_size:
        return df_filtered
    return df

#the first module becomes text1, the second module file1
def greet(size, target, stage, query):
    """Run a search and narrow the hits by size, stage and target.

    Each filter is applied through ``filter_df`` with a minimum acceptable
    size of 1, so a filter that would remove every row is skipped and the
    previous result is kept instead.

    Args:
        size: Company-size label to match exactly.
        target: Text that the ``target`` column must contain.
        stage: Funding stage to match (lower-cased before comparison), or
            ``'ALL'`` to skip the stage filter.
        query: Free-text description passed to ``search``.

    Returns:
        At most the first 100 rows of the filtered search result.
    """
    results = search(df, query)

    # A raised amount of 0 is shown as 'Undisclosed'.
    results['raised'] = results['raised'].apply(
        lambda amount: 'Undisclosed' if amount == 0 else amount
    )

    by_size = filter_df(results, 'size', '==', size, 1)

    # 'ALL' bypasses the stage filter entirely.
    if stage == 'ALL':
        by_stage = by_size
    else:
        by_stage = filter_df(by_size, 'stage', '==', stage.lower(), 1)

    print(by_stage.size)

    by_target = filter_df(by_stage, 'target', 'contains', target, 1)

    # No sorting is applied here; rows stay in search order.
    return by_target[:100]

# Build the search UI: three radio filters, a free-text query box, a button
# and a results table. Components are created in display order.
with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown(
    """
    # Startup Search Engine
    """
    )
    # Radio groups are single-choice by nature; the former `multiselect=False`
    # argument is not a Radio parameter and has been removed.
    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], value='11-500+', label='size')
    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', label='target')
    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'ALL'], value='ALL', label='stage')
    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
    btn = gr.Button(value="Search for a Startup")
    output1 = gr.DataFrame(label='value')
    # Inputs are passed to greet positionally in this order.
    btn.click(greet, [size, target, stage, query], [output1])
demo.launch(share=False)