warleagle committed on
Commit
7b86ace
1 Parent(s): 44ec690

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ preprocessed_train_classify_rec_spec_filtered_by_etalon.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ import pandas as pd
3
+ import numpy as np
4
+ import torch
5
+ from sentence_transformers.util import cos_sim
6
+ from sentence_transformers import SentenceTransformer
7
+ import gradio as gr
8
+ #%%
9
# --- Data ---------------------------------------------------------------
# Reference ("etalon") table. It is loaded here but not referenced again in
# the visible part of this script.
etalon = pd.read_csv("data/symptomps_specialist_mkb_issledovania/ranker/raw_data/etalon_prod.csv")
df = pd.read_csv("data/symptomps_specialist_mkb_issledovania/ranker/raw_data/preprocessed_train_classify_rec_spec_filtered_by_etalon.csv")
# Keep only the rows flagged as matches. The original index labels are kept,
# so after this filter labels no longer coincide with row positions.
df = df[df['is_match'] == 1]

# --- Model --------------------------------------------------------------
# Resolve the device first and move the model to it, so the app also starts
# on hosts without a GPU instead of failing on a hard-coded "cuda".
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2').to(device)

# --- Complaint texts and their precomputed embeddings ---------------------
unique_complaints = df['Жалобы'].values.tolist()

# NOTE(review): presumably row i of this array is the embedding of
# unique_complaints[i] -- confirm against the script that produced the file.
with open("experiments/symptom_recommender/embeddings_scripts/app/embeddings.npy", 'rb') as f:
    unique_complaints_embeddings_st = np.load(f)
21
+
22
def get_recommend(user_input,
                  top_k_spec=3,
                  top_k_services=10,
                  treshold=0.8):
    """Recommend specialists and examinations for a free-text complaint.

    The input is embedded with the module-level ``model`` and compared by
    cosine similarity with ``unique_complaints_embeddings_st``. Rows of the
    module-level ``df`` whose similarity is strictly greater than
    ``treshold`` are kept, and the most frequent values among them are
    reported.

    Args:
        user_input: Text passed to ``model.encode``.
        top_k_spec: Maximum number of values returned for each of the two
            specialist columns.
        top_k_services: Maximum number of complaints returned, and maximum
            number of examinations returned per diagnostic category.
        treshold: Similarity cut-off (the parameter name is kept as-is for
            backward compatibility).

    Returns:
        A dict with the keys "Специальность врача",
        "Рекомендуемые специалисты" and "Жалобы" (lists ordered by
        frequency) and "Рекомендации по обследованию" (a list holding one
        ``{category: [...]}`` dict per diagnostic category). All lists are
        empty when no row passes the threshold.
    """
    cols_for_top_k = ["Специальность врача",
                      "Рекомендуемые специалисты"]
    categories = ['Инструментальная диагностика', 'Лабораторная диагностика']

    usr_embeddings = model.encode(user_input)

    # cos_sim returns a (1, n_rows) matrix for a single query; take row 0.
    scores = cos_sim(usr_embeddings, unique_complaints_embeddings_st).detach().numpy()[0]
    # Row positions ordered from most to least similar.
    order = scores.argsort()[::-1]

    # The ordering holds *positions*, while ``df`` keeps its original index
    # labels after the ``is_match`` filter, so positional ``iloc`` is
    # required here; label-based ``loc`` would raise KeyError or pick
    # unrelated rows.
    # NOTE(review): assumes the embeddings are row-aligned with ``df`` --
    # confirm against the script that produced embeddings.npy.
    sorted_df = df.iloc[order].copy()
    sorted_df['cos_sim'] = scores[order]
    sorted_df = sorted_df[sorted_df['cos_sim'] > treshold]

    result = {
        col: sorted_df[col].value_counts().head(top_k_spec).index.tolist()
        for col in cols_for_top_k
    }
    result['Жалобы'] = sorted_df['Жалобы'].value_counts().head(top_k_services).index.tolist()

    # One single-key dict per category, in the order listed above.
    recommendations = []
    for category in categories:
        in_category = sorted_df[sorted_df['preds'] == category]
        top_services = (
            in_category['Рекомендации по обследованию']
            .value_counts()
            .head(top_k_services)
            .index
            .tolist()
        )
        recommendations.append({category: top_services})

    result['Рекомендации по обследованию'] = recommendations

    return result
56
+ #%%
57
# Gradio front end for get_recommend. Only a single text input is declared,
# so the remaining parameters of get_recommend keep their default values.
# No title is set (a "top-10 most similar services" title was drafted in the
# original but left disabled).
gradio_app = gr.Interface(
    fn=get_recommend,
    inputs='text',
    outputs=gr.JSON(label='s'),
    description="Введите услугу:",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    gradio_app.launch()
67
+ # %%
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a8829bdaaefb664cc0f6eabe9189155bffc2ba10eb65925e8f495fdf87eea2
3
+ size 106245248
etalon_prod.csv ADDED
The diff for this file is too large to render. See raw diff
 
preprocessed_train_classify_rec_spec_filtered_by_etalon.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e9f6f60f027af0575ab1cd10cb0ec3af316e90a689431ce0a51d80b9af11dd9
3
+ size 28425854
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ numpy
4
+ evaluate
5
+ scikit-learn
6
+ datasets
7
+ tqdm
8
+ minio
9
+ python-dotenv
10
+ accelerate
11
+ sentence-transformers