# Paste artifacts from the original snippet ("Spaces:" / "Runtime error"
# status lines) — kept as a comment so the file parses as Python.
import my_new_openai
import my_1_writer
import json
import numpy as np

# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
# database = .json file
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Similarity-search *term* against the vector table stored in *database*.

    Parameters:
        database: path to a .json file mapping word -> embedding vector (list of floats).
        term: a string (vectorized via my_new_openai.vectorize_data) or an
            already-computed embedding given as a list of floats.
        add_to_db: when True, the vectorized term is added to the table and the
            file is rewritten, unless the term is already present.
        debug: print intermediate vectors and scores.

    Returns:
        dict of {word: dot_product * euclidean_distance}, largest score first,
        or None if *term* has an unsupported type.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        # NOTE(review): the score multiplies dot product BY distance, so a
        # larger distance inflates the score — confirm this ranking is intended.
        sim_search_dict[key] = dp * distance
    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUG FIX: a dict cannot be sliced; take the first five (key, value)
        # pairs from its items instead of `sorted_table[:5]` (TypeError).
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        # BUG FIX: `term in table.keys()` raised TypeError when term was a
        # list (lists are unhashable). Entries are stored under str(term)
        # below, so test membership in that same form.
        if str(term) in table:
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            # table = load_df(database)  # ??
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    # first_key, first_value = list(sortedTable.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Compute the dot product of every stored vector against one reference
    vector and persist the result via my_1_writer.

    vector1 == 0 selects a fixed 1536-dim constant reference vector;
    vector1 == 1 selects the first vector found in the table;
    any other value is used directly as the reference vector.
    """
    with open(database, "r") as db_file:
        vec_table = json.load(db_file)
    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        first_key = list(vec_table.keys())[0]
        vector1 = vec_table[str(first_key)]
    dot_product_to1 = {name: np.dot(vector1, vec) for name, vec in vec_table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
def lin_dist(database, vector1=0, analysis_filename=0):
    """Compute the Euclidean (L2) distance from every stored vector to one
    reference vector and persist the result via my_1_writer.

    vector1 == 0 selects a fixed 1536-dim constant reference vector;
    vector1 == 1 selects the first vector found in the table;
    any other value is used directly as the reference vector.
    """
    with open(database, "r") as db_file:
        vec_table = json.load(db_file)
    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        vector1 = vec_table[str(list(vec_table.keys())[0])]
    reference = np.array(vector1)
    lin_dist_to_1 = {}
    for name, vec in vec_table.items():
        lin_dist_to_1[name] = np.linalg.norm(reference - np.array(vec))
    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Compute the Manhattan (L1) distance from every stored vector to one
    reference vector and persist the result via my_1_writer.

    vector1 == 0 selects a fixed 1536-dim constant reference vector;
    vector1 == 1 selects the first vector found in the table;
    any other value is used directly as the reference vector.
    """
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}
    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]
    reference = np.array(vector1)
    for key in table.keys():
        # BUG FIX: Manhattan distance is the sum of ABSOLUTE component
        # differences; the original summed the signed differences, which
        # cancel each other out and can even go negative.
        manhattan_dist_to_1[key] = float(np.sum(np.abs(reference - np.array(table[key]))))
    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
| #vec_table | |
def sim_search_fly(vec_table, term, debug=False):
    """Similarity-search *term* against an in-memory vector table.

    Parameters:
        vec_table: dict mapping word -> embedding vector (list of floats).
        term: a string (vectorized via my_new_openai.vectorize_data) or an
            already-computed embedding given as a list of floats.
        debug: print types, sample vector entries, and per-key scores.
            NOTE(review): the debug preview indexes the 6th table entry, so
            debug=True assumes at least six keys — confirm callers guarantee it.

    Returns:
        dict of {word: dot_product}, largest score first, or None if *term*
        has an unsupported type.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # Vectors whose components repeat pairwise at indices (0,1), (3,4),
        # (5,6) are treated as placeholders and pinned to the top with a
        # fixed score of 200. NOTE(review): this assumes every vector has at
        # least 7 components — confirm against the table producer.
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp
    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUG FIX: a dict cannot be sliced; take the first five (key, value)
        # pairs from its items instead of `sorted_table[:5]` (TypeError).
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table