seoul_backend / util /preprocessing.py
ldhldh's picture
Update util/preprocessing.py
7e589bc verified
import difflib
import pandas as pd
from util.search_data import *
def word_to_market_name(word):
    """Suggest market names for a partial or misspelled query string.

    The market list is re-read from ``data/market_name_utf8.csv`` on every
    call.

    Args:
        word: Query string to match against the known market names.

    Returns:
        A list of market names. Names sharing a common prefix with ``word``
        (the shorter of the two is a prefix of the longer) come first, in
        file order. They are followed by up to three of the remaining names,
        ranked by ``difflib.SequenceMatcher`` similarity to ``word`` (highest
        first; ties keep file order).
    """
    markets_df = pd.read_csv('data/market_name_utf8.csv')
    # NOTE(review): the column label is kept byte-for-byte; it looks like a
    # mis-decoded Korean header -- confirm it matches the CSV's real header.
    # Blank cells are parsed by pandas as NaN (a float), which has no length
    # or prefix, so only genuine strings are kept.
    names = [m for m in markets_df['μ‹œμž₯λͺ…'] if isinstance(m, str)]

    output = []
    scores = {}
    for name in names:
        # Mutual-prefix test: equal over the first min(len(word), len(name))
        # characters, so a query longer than the name still counts.
        if name.startswith(word) or word.startswith(name):
            output.append(name)
        else:
            scores[name] = difflib.SequenceMatcher(None, word, name).ratio()

    # sorted() is stable, so equally-scored names keep their file order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    output.extend(ranked[:3])
    return output
def word_to_product_name(word):
    """Suggest product names for a partial or misspelled query string.

    Product names are read from ``data/products.txt`` (one per line). When
    that file does not exist, ``get_all_product_names()`` is called instead;
    that helper is not defined in this file (presumably it comes from the
    ``util.search_data`` star import -- confirm).

    Args:
        word: Query string to match against the known product names.

    Returns:
        A list of product names. Names sharing a common prefix with ``word``
        (the shorter of the two is a prefix of the longer) come first, in
        source order. They are followed by up to three of the remaining
        names, ranked by ``difflib.SequenceMatcher`` similarity to ``word``
        (highest first; ties keep source order).
    """
    # Try the open directly instead of checking os.path.exists first: this
    # avoids the check-then-open race and the dependency on an ``os`` name
    # that this module never imports explicitly.
    try:
        with open("data/products.txt", "r", encoding="utf-8") as f:
            # NOTE(review): the last TWO split entries are dropped. A single
            # trailing newline yields only one empty entry, so this also
            # discards the final product unless the file ends with an extra
            # blank line -- confirm against whatever writes this file.
            products = f.read().split("\n")[:-2]
    except FileNotFoundError:
        products = get_all_product_names()

    output = []
    scores = {}
    for name in products:
        # Mutual-prefix test: equal over the first min(len(word), len(name))
        # characters, so a query longer than the name still counts.
        if name.startswith(word) or word.startswith(name):
            output.append(name)
        else:
            scores[name] = difflib.SequenceMatcher(None, word, name).ratio()

    # sorted() is stable, so equally-scored names keep their source order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    output.extend(ranked[:3])
    return output
def check_word(word):
    """Report whether ``word`` exactly equals one of the known market names.

    The names are re-read from ``data/market_name_utf8.csv`` on each call.
    On a match, a ``check_word, <word>`` line is printed to stdout.

    Args:
        word: Candidate market name.

    Returns:
        True if some entry of the market-name column equals ``word``,
        otherwise False.
    """
    frame = pd.read_csv('data/market_name_utf8.csv')
    known_names = frame['μ‹œμž₯λͺ…']
    # any() stops at the first equal entry, like the early return in a loop.
    found = any(word == candidate for candidate in known_names)
    if found:
        print(f"check_word, {word}")
    return found
def check_product(word):
    """Report whether ``word`` exactly equals one of the known product names.

    Product names are read from ``data/products.txt`` (one per line). When
    that file does not exist, ``get_all_product_names()`` is called instead;
    that helper is not defined in this file (presumably it comes from the
    ``util.search_data`` star import -- confirm).

    On a match, a ``check_product, <word>`` line is printed to stdout.

    Args:
        word: Candidate product name.

    Returns:
        True if ``word`` equals one of the product names, otherwise False.
    """
    # Try the open directly instead of checking os.path.exists first: this
    # avoids the check-then-open race and the dependency on an ``os`` name
    # that this module never imports explicitly.
    try:
        with open("data/products.txt", "r", encoding="utf-8") as f:
            # NOTE(review): the last TWO split entries are dropped. A single
            # trailing newline yields only one empty entry, so this also
            # discards the final product unless the file ends with an extra
            # blank line -- confirm against whatever writes this file.
            products = f.read().split("\n")[:-2]
    except FileNotFoundError:
        products = get_all_product_names()

    for p in products:
        if word == p:
            # The label previously read "check_word" (copied from the sibling
            # function), which made the two log lines indistinguishable.
            print(f"check_product, {word}")
            return True
    return False