"""Rank known licenses by Doc2Vec similarity to a piece of license text."""
import json
import os

import gensim
import pandas as pd
import streamlit as st
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

try:
    from src.clean import preprocess_text, script_cleaner
except ImportError:
    # Fallback for when the ``src`` package prefix is not importable
    # (e.g. the module is executed from inside that directory).
    from clean import preprocess_text, script_cleaner

MODEL_PATH = 'models/d2v.model'
LICENSE_INDEX_PATH = 'data/index_license_map.json'


def _resolve_path(relative_path):
    """Return the first existing location of *relative_path*, or None.

    The path is tried relative to the current working directory first and
    then relative to its parent directory.
    """
    for candidate in (relative_path, "../" + relative_path):
        if os.path.exists(candidate):
            return candidate
    return None


def _load_license_index(relative_path):
    """Load the tag -> license-name mapping from JSON.

    Returns an empty dict (after printing a message) when the file cannot be
    found, so the module-level name is always bound.
    """
    resolved = _resolve_path(relative_path)
    if resolved is None:
        print("index_license_map Not Found!")
        return {}
    # Context manager guarantees the file handle is closed.
    with open(resolved, encoding="utf-8") as index_file:
        return json.load(index_file)


license_index_name_map = _load_license_index(LICENSE_INDEX_PATH)


def _license_name(tag):
    """Look up the license name for a document tag.

    JSON object keys are always strings, so a non-string tag (for example an
    integer document index) is retried as ``str(tag)``.

    Raises:
        KeyError: if the tag is not present in the mapping.
    """
    try:
        return license_index_name_map[tag]
    except KeyError:
        return license_index_name_map[str(tag)]


def load_model():
    """Load trained model parameters from file.

    Looks for ``MODEL_PATH`` relative to the current working directory and
    then relative to its parent directory.

    Returns:
        Doc2Vec model object, or None (after printing a message) when the
        model file is not found in either location.
    """
    resolved = _resolve_path(MODEL_PATH)
    if resolved is None:
        print("d2v.model Not Found!")
        return None
    return Doc2Vec.load(resolved)


def preprocess(input):
    """Preprocess the input from the textbox.

    The text is passed through ``script_cleaner`` and ``preprocess_text`` and
    then tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        input: str
            Input string containing contents of license text.

    Returns:
        TaggedDocument holding the tokens, tagged with ``[1]``.
    """
    clean_input = preprocess_text(script_cleaner(input))
    tokens = gensim.utils.simple_preprocess(clean_input)
    return TaggedDocument(words=tokens, tags=[1])


def inference_vector(model, tagged_doc):
    """Return the inference vector for a tagged document.

    Args:
        model: Doc2Vec
            Doc2Vec model object.
        tagged_doc: TaggedDocument
            Input processed by ``preprocess``; only its ``words`` are used.

    Returns:
        The result of ``model.infer_vector`` for the document's words.
    """
    return model.infer_vector(tagged_doc.words)


def similarity_ranking(model, infer_vector):
    """Return license predictions paired with their similarity scores.

    Every document vector stored in the model is ranked against
    ``infer_vector`` (``topn`` is the number of stored vectors).

    Args:
        model: Doc2Vec
        infer_vector: vector produced by ``inference_vector``

    Returns:
        list of ``(license_name, similarity)`` tuples in the order returned
        by ``model.dv.most_similar``.

    Raises:
        KeyError: if a document tag has no entry in the license index map.
    """
    similar_docs = model.dv.most_similar([infer_vector], topn=len(model.dv))
    return [(_license_name(tag), score) for tag, score in similar_docs]


def scores_to_df(scores):
    """Convert a list of (license, score) tuples to a DataFrame.

    Args:
        scores: list
            list of tuples containing predictions and confidence values.

    Returns:
        DataFrame with the columns ``License`` and ``Similarity Scores``.
    """
    data = {
        'License': [score[0] for score in scores],
        'Similarity Scores': [score[1] for score in scores],
    }
    return pd.DataFrame.from_dict(data)


def inference(input):
    """Rank all known licenses by similarity to the given text.

    Args:
        input: str
            The input from the textbox.

    Returns:
        DataFrame containing license names and similarity scores.

    Raises:
        FileNotFoundError: if the Doc2Vec model file cannot be located.
    """
    model = load_model()
    if model is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise FileNotFoundError(
            f"Doc2Vec model not found at '{MODEL_PATH}' or '../{MODEL_PATH}'"
        )
    processed_text = preprocess(input)
    infer_vec = inference_vector(model, processed_text)
    results = similarity_ranking(model, infer_vec)
    return scores_to_df(results)