File size: 3,458 Bytes
7b86ace
 
 
 
 
 
 
 
427917d
50ed058
7b86ace
50ed058
7b86ace
 
427917d
7b86ace
3326af6
50ed058
7b86ace
427917d
7b86ace
 
030dff0
7b86ace
 
 
 
 
 
 
50ed058
7b86ace
 
 
 
 
 
 
 
 
 
 
 
 
 
d5175dd
 
 
7b86ace
d5175dd
 
 
 
 
 
 
 
 
 
 
 
 
7b86ace
d5175dd
 
 
 
 
7b86ace
d5175dd
 
7b86ace
28192e4
7b86ace
 
 
5d88c86
 
 
 
 
7b86ace
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#%%
import pandas as pd
import numpy as np
import torch
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
import gradio as gr
#%%
# etalon = pd.read_csv("etalon_prod.csv")

# Complaint records; get_recommend() ranks and aggregates these rows.
df = pd.read_csv("preprocessed_complaints.csv")

# Sentence encoder used to embed the free-text query at request time.
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

# Distinct complaint texts found in the data set.
unique_complaints = df['Жалобы'].unique()

# Precomputed corpus embeddings that the query embedding is compared against.
# NOTE(review): presumably one row per row of `df`, in the same order - confirm.
embeddings = np.load("embeddings.npy")

#%%
def _as_count(value):
    """Coerce a top-k setting to ``int``; ``None`` is passed through unchanged."""
    return None if value is None else int(value)


def _top_values(frame, column, k):
    """Return the ``k`` most frequent values of ``column``, most frequent first."""
    # ``iloc`` keeps the cut positional regardless of the dtype of the values
    # that ``value_counts`` places in the index.
    return frame[column].value_counts().iloc[:k].index.tolist()


def _services_by_category(frame, categories, k):
    """Return one ``{category: top-k examinations}`` dict per category."""
    return [
        {
            category: _top_values(
                frame[frame['service_name_category'] == category],
                'Рекомендации по обследованию',
                k,
            )
        }
        for category in categories
    ]


def get_recommend(user_input, 
                  top_k_spec = 3,
                  top_k_services = 5,
                  treshold = 0.8):
    """Recommend specialists, diagnoses and examinations for a text query.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the module-level ``embeddings``. Rows of the
    module-level ``df`` whose similarity is strictly greater than
    ``treshold`` are kept, and the most frequent values of several columns
    among those rows are reported.

    Args:
        user_input: Text to encode (passed straight to ``model.encode``).
        top_k_spec: How many values to report for each specialist column.
        top_k_services: How many complaints, diagnoses and examinations to
            report.
        treshold: Minimum cosine similarity (exclusive) a row must reach.
            The misspelled name is kept for backward compatibility.

    Returns:
        A dict with the keys, in this order: ``'Специальность врача'``,
        ``'Рекомендуемые специалисты'``, ``'Жалобы'``, ``'Диагноз МКБ'``
        (each a list of values), ``'Рекомендации по обследованию'`` (a list
        of ``{category: [examinations]}`` dicts) and
        ``'Рекомендации по обследованию по МКБ'`` (a list of
        ``{diagnosis: [{category: [examinations]}, ...]}`` dicts). All lists
        are empty when no row passes the threshold.
    """
    # UI sliders can hand over whole numbers as floats, which are not valid
    # slice bounds; normalise them once up front.
    top_k_spec = _as_count(top_k_spec)
    top_k_services = _as_count(top_k_services)

    specialist_columns = ["Специальность врача",
                          "Рекомендуемые специалисты"]
    categories = ['Инструментальная диагностика', 'Лабораторная диагностика']

    query_embedding = model.encode(user_input)

    # Row 0 of the similarity matrix holds the scores that are ranked.
    scores = cos_sim(query_embedding, embeddings).detach().numpy()[0]
    order = scores.argsort()[::-1]  # positions, best match first

    # ``order`` contains positions, not index labels, hence ``iloc``.
    matches = df.iloc[order].copy()
    matches['cos_sim'] = scores[order]
    matches = matches[matches['cos_sim'] > treshold]

    result = {
        column: _top_values(matches, column, top_k_spec)
        for column in specialist_columns
    }
    result['Жалобы'] = _top_values(matches, 'Жалобы', top_k_services)

    top_k_mkb = _top_values(matches, 'Диагноз МКБ', top_k_services)
    result['Диагноз МКБ'] = top_k_mkb

    # Examinations over all matching rows, then broken down per diagnosis.
    result['Рекомендации по обследованию'] = _services_by_category(
        matches, categories, top_k_services
    )
    result['Рекомендации по обследованию по МКБ'] = [
        {
            mkb: _services_by_category(
                matches[matches['Диагноз МКБ'] == mkb],
                categories,
                top_k_services,
            )
        }
        for mkb in top_k_mkb
    ]

    return result
#%%
# Input widgets, listed in the positional order of get_recommend's parameters:
# query text, top-N specialists, top-N services, relevance threshold.
_input_components = [
    'text',
    gr.Slider(minimum=1, maximum=10, step=1, label="Топ N специалистов", value=3),
    gr.Slider(minimum=1, maximum=10, step=1, label="Топ N услуг", value=5),
    gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8),
]

# Web UI: the dict returned by get_recommend is rendered as JSON.
gradio_app = gr.Interface(
    fn=get_recommend,
    inputs=_input_components,
    outputs=[gr.JSON(label='Рекомендации: ')],
    # title="Предсказание топ-10 наиболее схожих услуг",
    description="Введите услугу:",
)

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    gradio_app.launch()
# %%