#%%
import json
import logging
import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords
#%%

# Sentence-embedding model used to encode the n-grams extracted from user input.
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

# NLTK's Russian stop-word list plus one extra, upper-case, token.
russian_stopwords = stopwords.words('russian') + ['ВАШ']

# JSON is UTF-8 by specification; name the encoding explicitly so non-ASCII
# text is decoded the same way regardless of the platform's locale default.
with open("top_150_symps_by_spec.json", 'r', encoding="utf-8") as f:
    symps = json.load(f)

# Precomputed embeddings that input n-grams are compared against.
# NOTE(review): presumably row i is the embedding of symps[i] -- confirm.
embs = np.load("embeddings.npy")

def remove_numbers(text):
    """Return *text* with digits and punctuation deleted and ends trimmed.

    Despite the name, two kinds of characters are removed: runs of decimal
    digits, and every character that is neither a word character nor
    whitespace.  Removed characters are dropped, not replaced by a space,
    so ``"a-b"`` becomes ``"ab"``.  Underscores are word characters and are
    kept.  Inner whitespace is left untouched; only leading and trailing
    whitespace is stripped.
    """
    cleaned = text
    # Order matters only for readability: digits first, then punctuation.
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()


# N-gram extractor: splits text into word 1-, 2- and 3-grams after running
# `remove_numbers` on it and dropping the configured stop words.
# NOTE: per the scikit-learn documentation, a custom `preprocessor` replaces
# the built-in preprocessing stage (accent stripping and lower-casing), so the
# text is not lower-cased here and stop-word matching is case-sensitive.
vectorizer = CountVectorizer(
    preprocessor=remove_numbers,
    stop_words=russian_stopwords,
    ngram_range=(1, 3),
)
    
def get_symptomps_v2(text, treshold = 0.7):
    """Match n-grams of *text* against the known symptoms by cosine similarity.

    The input is split into n-grams with the module-level ``vectorizer``
    configuration, each n-gram is embedded with ``model`` and compared to
    every row of ``embs``.  For each n-gram only its single best-scoring row
    is considered, and it counts as a match when that score is strictly
    greater than both *treshold* and zero.

    Args:
        text: A string, or an iterable of strings, to analyse.
        treshold: Similarity a match must strictly exceed.

    Returns:
        The unique matched ``symps`` entries as a list (ordered by
        ``np.unique``), or ``['Симптомы не определены']`` when nothing
        matches, the input yields no n-grams, or processing fails.

    NOTE(review): assumes ``symps`` is indexable by row position of ``embs``
    (i.e. a list aligned with the embedding matrix) -- confirm.
    """
    not_found = ['Симптомы не определены']

    if text is None:
        return not_found
    if isinstance(text, str):
        text = [text]

    try:
        # Use the stateless analyzer rather than re-fitting the shared
        # vectorizer: fitting mutates global state, so concurrent requests
        # could read each other's vocabulary.
        analyze = vectorizer.build_analyzer()
        phrases = sorted({phrase for doc in text for phrase in analyze(doc)})
        if not phrases:
            # Empty input, or only stop words / digits / punctuation.
            return not_found

        text_emb = model.encode(phrases, batch_size=64)
        cos_sim_m = cos_sim(text_emb, embs).numpy()

        # Scores at or below the threshold are replaced by -1 so that they
        # can never pass the "> 0" test below.
        masked = np.where(cos_sim_m > treshold, cos_sim_m, -1)
        best_idx = masked.argmax(axis=1)
        best_score = masked.max(axis=1)

        outputs = [symps[idx] for idx in best_idx[best_score > 0]]
        if not outputs:
            return not_found
        return np.unique(outputs).tolist()
    except Exception:
        # Best-effort endpoint: keep returning the fallback answer, but leave
        # a traceback in the log instead of hiding the failure.
        logging.getLogger(__name__).exception("Symptom matching failed")
        return not_found
#%%
# Web UI: a free-text box and a threshold slider are passed, in that order,
# to get_symptomps_v2; the returned list is rendered as JSON.
# The slider starts at 0.8 and moves in steps of 0.05 between 0 and 1.
gradio_app = gr.Interface(
    fn=get_symptomps_v2,
    inputs=[
        'text',
        gr.Slider(
            label="Порог релевантности",
            value=0.8,
            minimum=0,
            maximum=1,
            step=0.05,
        ),
    ],
    outputs=[gr.JSON(label='Симптомы: ')],
    description="Введите услугу:",
)

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    gradio_app.launch()
# %%