Nihal D'Souza
Removed st.cache from inference
0a1b9a0
import os
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import json
import streamlit as st
# The cleaning helpers are importable either as part of the `src` package or
# as a sibling module, depending on how this file is loaded.
try:
    from src.clean import preprocess_text, script_cleaner
except ImportError:
    # Only fall back when the package-style import is unavailable; any other
    # error raised while importing should surface instead of being hidden.
    from clean import preprocess_text, script_cleaner
# Locations of the trained Doc2Vec model and of the JSON index used to
# translate model predictions into license names. Both paths are also tried
# with a "../" prefix.
MODEL_PATH = 'models/d2v.model'
LICENSE_INDEX_PATH = 'data/index_license_map.json'


def _load_license_index():
    '''
    Load the index -> license name mapping from LICENSE_INDEX_PATH
    Args:
    Returns: dict
        Parsed JSON content, or an empty dict when the file cannot be found
        at either LICENSE_INDEX_PATH or "../" + LICENSE_INDEX_PATH
    '''
    for candidate in (LICENSE_INDEX_PATH, "../" + LICENSE_INDEX_PATH):
        if os.path.exists(candidate):
            # Context manager guarantees the file handle is closed.
            with open(candidate, encoding="utf-8") as index_file:
                return json.load(index_file)
    print("index_license_map Not Found!")
    # Keep the module-level name defined so importing this module (and code
    # that merely references the map) does not fail with NameError.
    return {}


# Looked up by similarity_ranking() to turn a predicted tag into a license name.
license_index_name_map = _load_license_index()
def load_model():
    '''
    Load the trained Doc2Vec model from disk
    Args:
    Returns: Doc2Vec or None
        Model loaded from MODEL_PATH, or from "../" + MODEL_PATH when the
        first location does not exist. None (after printing a message)
        when neither location exists.
    '''
    # Try the path as given first, then the same path one directory up.
    for candidate in (MODEL_PATH, "../" + MODEL_PATH):
        if os.path.exists(candidate):
            return Doc2Vec.load(candidate)
    print("d2v.model Not Found!")
    return None
def preprocess(input):
    '''
    Turn raw license text into a TaggedDocument
    Args:
        input: str
            Input string containing contents of license text
    Return: TaggedDocument
        Document whose words are the tokens of the cleaned text and whose
        tags are the fixed list [1]
    '''
    # Clean in two stages: script_cleaner first, then preprocess_text.
    scrubbed = script_cleaner(input)
    normalized = preprocess_text(scrubbed)
    words = gensim.utils.simple_preprocess(normalized)
    return TaggedDocument(words=words, tags=[1])
def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a tagged document
    Args:
        model: Doc2Vec
            Doc2Vec Model object
        tagged_doc: TaggedDocument
            Input processed by 'preprocess' and converted to TaggedDocument
    Return:
        The result of model.infer_vector applied to the document's words
    '''
    # Only the token list is passed to the model; the tags are not used here.
    words = tagged_doc.words
    vector = model.infer_vector(words)
    return vector
def similarity_ranking(model, infer_vector):
    '''
    Rank the model's stored documents by similarity to an inferred vector
    Args:
        model: Doc2Vec
            Model whose `dv` vectors are searched
        infer_vector: Doc2Vec.infer_vector
            Vector to compare against
    Returns: list
        list of (license name, score) tuples, in the order returned by
        model.dv.most_similar; each name is looked up in
        license_index_name_map using the first element of the result
    '''
    doc_vectors = model.dv
    # topn is set to len(model.dv) so the query is not cut off at a default size.
    neighbours = doc_vectors.most_similar([infer_vector], topn=len(doc_vectors))
    return [(license_index_name_map[hit[0]], hit[1]) for hit in neighbours]
def scores_to_df(scores):
    '''
    Convert a list of (license name, score) tuples into a DataFrame
    Args:
        scores: list
            list of tuples containing predictions and confidence
    Return: DataFrame
        Dataframe with a 'License' column (first tuple element) and a
        'Similarity Scores' column (second tuple element), one row per tuple
    '''
    # Single pass over the input, keeping only the first two fields of each entry.
    pairs = [(entry[0], entry[1]) for entry in scores]
    columns = {
        'License': [pair[0] for pair in pairs],
        'Similarity Scores': [pair[1] for pair in pairs],
    }
    return pd.DataFrame.from_dict(columns)
def inference(input):
    '''
    Given text input, returns a DataFrame of predictions and similarity scores
    Args:
        input: str
            the input from the textbox
    Returns: DataFrame
        Dataframe built by scores_to_df with 'License' and
        'Similarity Scores' columns, one row per ranked prediction
    Raises:
        FileNotFoundError: if load_model() could not find the trained model
    '''
    model = load_model()
    if model is None:
        # load_model() returns None when the model file is missing; fail here
        # with a clear message instead of an AttributeError further down.
        raise FileNotFoundError(
            "Doc2Vec model not found at '{0}' or '../{0}'".format(MODEL_PATH)
        )
    tagged_doc = preprocess(input)
    vector = inference_vector(model, tagged_doc)
    ranking = similarity_ranking(model, vector)
    return scores_to_df(ranking)