Spaces:
Runtime error
Runtime error
| import os | |
| import gensim | |
| from gensim.models.doc2vec import Doc2Vec, TaggedDocument | |
| import pandas as pd | |
| import json | |
| import streamlit as st | |
# Prefer the package-qualified module; fall back to importing `clean` as a
# top-level module when `src.clean` cannot be imported.
try:
    from src.clean import preprocess_text, script_cleaner
except ImportError:
    # Only import failures trigger the fallback; anything else (for example
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    from clean import preprocess_text, script_cleaner
# Both paths are relative to the current working directory; each one is also
# tried one directory up ("../") before giving up.
MODEL_PATH = 'models/d2v.model'
LICENSE_INDEX_PATH = 'data/index_license_map.json'

# Lookup table used by similarity_ranking to turn a document tag returned by
# the model into a license name. The file is opened in a `with` block so the
# handle is closed as soon as the JSON has been parsed.
if os.path.exists(LICENSE_INDEX_PATH):
    with open(LICENSE_INDEX_PATH, encoding="utf-8") as _index_file:
        license_index_name_map = json.load(_index_file)
elif os.path.exists("../" + LICENSE_INDEX_PATH):
    with open("../" + LICENSE_INDEX_PATH, encoding="utf-8") as _index_file:
        license_index_name_map = json.load(_index_file)
else:
    print("index_license_map Not Found!")
    # Keep the name defined so later lookups fail with a KeyError on the
    # missing tag rather than a NameError on the module global.
    license_index_name_map = {}
def load_model():
    '''
    Load the trained Doc2Vec model from disk.

    MODEL_PATH is tried first, then the same path one directory up.

    Args:
    Returns: Doc2Vec or None
        The loaded model, or None (after printing a message) when the
        model file exists at neither location
    '''
    for candidate in (MODEL_PATH, "../" + MODEL_PATH):
        if os.path.exists(candidate):
            return Doc2Vec.load(candidate)

    print("d2v.model Not Found!")
    return None
def preprocess(input):
    '''
    Clean raw license text and wrap its tokens in a TaggedDocument.

    Args:
        input: str
            Input string containing contents of license text
    Return: TaggedDocument
        Tokens of the cleaned text, carrying the single fixed tag 1
    '''
    # script_cleaner runs first, then preprocess_text on its result.
    stripped = script_cleaner(input)
    cleaned = preprocess_text(stripped)
    word_list = gensim.utils.simple_preprocess(cleaned)
    return TaggedDocument(words=word_list, tags=[1])
def inference_vector(model, tagged_doc):
    '''
    Infer a vector for a preprocessed document.

    Args:
        model: Doc2Vec
            Doc2Vec Model object
        tagged_doc: TaggedDocument
            Document produced by 'preprocess'; only its words are used
    Return:
        Whatever model.infer_vector returns for the document's words
    '''
    tokens = tagged_doc.words
    vector = model.infer_vector(tokens)
    return vector
def similarity_ranking(model, infer_vector):
    '''
    Rank every document vector in the model against an inferred vector.

    Args:
        model: Doc2Vec
        infer_vector: Doc2Vec.infer_vector
    Returns: list
        (license name, similarity score) tuples in the order returned by
        model.dv.most_similar; names are looked up in
        license_index_name_map
    '''
    doc_vectors = model.dv
    # topn is the full size of the vector store, so nothing is cut off.
    neighbours = doc_vectors.most_similar([infer_vector], topn=len(doc_vectors))
    return [(license_index_name_map[tag], score) for tag, score in neighbours]
def scores_to_df(scores):
    '''
    Convert (license name, similarity score) pairs into a DataFrame.

    Args:
        scores: list
            list of tuples containing predictions and confidence
    Return: DataFrame
        Two columns, 'License' and 'Similarity Scores', with one row per
        input pair in the original order
    '''
    # Materialize once so the input is traversed a single time.
    pairs = list(scores)
    columns = {
        'License': [pair[0] for pair in pairs],
        'Similarity Scores': [pair[1] for pair in pairs],
    }
    return pd.DataFrame.from_dict(columns)
def inference(input):
    '''
    Run the whole pipeline on raw license text: load the model, preprocess
    the text, infer its vector, rank the model's documents against it and
    tabulate the result.

    Args:
        input: str
            the input from the textbox
    Returns: DataFrame
        Columns 'License' and 'Similarity Scores' as built by scores_to_df,
        one row per ranked license
    Raises:
        FileNotFoundError: if load_model could not find the model file
    '''
    model = load_model()
    if model is None:
        # load_model reports a missing file by returning None; stop here
        # with a clear message instead of an AttributeError on None later.
        raise FileNotFoundError(
            f"Doc2Vec model not found at '{MODEL_PATH}' or '../{MODEL_PATH}'"
        )

    processed_text = preprocess(input)
    infer_vec = inference_vector(model, processed_text)
    results = similarity_ranking(model, infer_vec)
    return scores_to_df(results)