import os
import re
import sys
import warnings

import boto3
import botocore
import pandas as pd
from nltk.corpus import stopwords

warnings.filterwarnings("ignore")

# Make the project root importable before loading app modules.
sys.path.insert(0, os.path.abspath("."))

from app.logger import Logger

def read_files(
    file_name, sort_by=None, drop_duplicates=None, drop_na=None, encoding=None
):
    """Read a CSV from app/constants, with optional sorting, deduplication,
    and null-row removal."""
    df = pd.read_csv(
        os.path.join("app/constants", file_name), low_memory=False, encoding=encoding
    )
    if sort_by:
        df = df.sort_values(by=[sort_by])
    if drop_duplicates:
        print(f"Removing duplicates in {drop_duplicates}..")
        print(f"df rows before removing duplicates = {df.shape[0]}")
        df.drop_duplicates(subset=drop_duplicates, keep="first", inplace=True)
        print(f"df rows after removing duplicates = {df.shape[0]}")
    if drop_na:
        print("Removing rows with null values..")
        print(f"df rows before removing nan values = {df.shape[0]}")
        df = df.dropna(subset=drop_na)
        print(f"df rows after removing nan values = {df.shape[0]}")
    df = df.reset_index(drop=True)
    return df

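# Usage sketch (hedged: "products.csv" and the "ProdName" column are
# hypothetical examples, not files shipped with this module):
# df = read_files("products.csv", sort_by="ProdName", drop_duplicates=["ProdName"])
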
def check_file_already_downloaded(file_name):
    """Return True if file_name already exists in app/constants."""
    return file_name in os.listdir("app/constants")

def download_file_from_s3(
    file_name, bucket_name="sku-matching-ai-ml", skip_check=False, file_path=None
):
    """Download a file from S3 into app/constants unless it is already cached."""
    if check_file_already_downloaded(file_name) and not skip_check:
        return file_name

    print("STARTING DOWNLOADING: ", file_name)
    if not file_path:
        file_path = file_name
    s3 = boto3.client("s3")
    try:
        s3.download_file(
            Bucket=bucket_name, Key=file_path, Filename=f"app/constants/{file_name}"
        )
        print("DOWNLOADING FINISHED")
        return file_name
    except botocore.exceptions.ClientError as e:
        Logger().exception(
            message=f"Unable to download file: {file_name}",
        )
        # The exception object is returned so the caller can inspect the failure.
        return e

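# Usage sketch (hedged: "products.csv" is a hypothetical key in the default
# bucket; requires valid AWS credentials in the environment):
# download_file_from_s3("products.csv")
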
def upload_files_to_s3(file_path, upload_path, bucket_name="sku-matching-ai-ml"):
    """Upload a local file to the given S3 key."""
    print("STARTING UPLOADING")
    s3 = boto3.client("s3")
    try:
        s3.upload_file(file_path, bucket_name, upload_path)
    except botocore.exceptions.ClientError as e:
        Logger().exception(
            message=f"Unable to upload file: {file_path}",
        )
        return e

def clean(string):
    """Lowercase, strip non-letters, and drop stopwords and words shorter
    than three characters."""
    raw_text = re.sub("[^a-zA-Z]+", " ", string)
    words = raw_text.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [
        word for word in words if word not in stops and len(word) >= 3
    ]
    return " ".join(meaningful_words)

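# Example (assumes the NLTK "stopwords" corpus has been fetched, e.g. via
# nltk.download("stopwords")):
# clean("The NEW 500ml Bottle!!")  ->  "new bottle"
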
def close_open_brackets(input_str):
    """Balance brackets: drop unmatched closing brackets and append closers
    for any brackets still open at the end."""
    opening_brackets = ["(", "[", "{"]
    closing_brackets = [")", "]", "}"]
    stack = []

    for char in input_str:
        if char in opening_brackets:
            stack.append(char)
        elif char in closing_brackets:
            if len(stack) > 0:
                opening_bracket = stack.pop()
                if opening_brackets.index(opening_bracket) != closing_brackets.index(
                    char
                ):
                    # Mismatched pair: keep the opener on the stack and drop the
                    # stray closer. (Pushing the closer onto the stack, as the
                    # old code did, crashed in the closing loop below.)
                    stack.append(opening_bracket)
                    input_str = input_str.replace(char, "", 1)
            else:
                # Closer with no matching opener: remove one occurrence of it.
                input_str = input_str.replace(char, "", 1)

    # Close any brackets still left open, innermost first.
    while len(stack) > 0:
        opening_bracket = stack.pop()
        closing_bracket = closing_brackets[opening_brackets.index(opening_bracket)]
        input_str += closing_bracket

    return input_str

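# Example: unmatched openers are closed at the end, innermost first.
# close_open_brackets("Milk (1L [pack")  ->  "Milk (1L [pack])"
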
def iterative_filtering(
    df,
    product,
    column_name,
    skip_clean=False,
    consider_starts_with=True,
    regex=False,
    close_brackets=False,
):
    """Narrow df word by word; keep the last non-empty result as the output."""
    if not skip_clean:
        product = clean(product)
    else:
        product = product.lower()
    words = product.split()
    new_df = df
    index = 0
    out_df = new_df

    while new_df.shape[0] > 0 and index < len(words):
        out_df = new_df
        new_df = df_filtering_by_word(
            new_df,
            words[index],
            column_name,
            consider_starts_with,
            regex,
            close_brackets,
        )
        if new_df.shape[0] > 0:
            out_df = new_df
            # Drop the matched word so later words match the remaining text.
            # Copy first to avoid mutating a view of the caller's frame.
            new_df = new_df.copy()
            new_df[column_name] = new_df[column_name].str.replace(
                words[index] + " ", "", regex=False
            )
        index = index + 1
    out_df = out_df.reset_index(drop=True)
    return out_df

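# Usage sketch (hedged: df and the "ProdName" column are hypothetical):
# candidates = iterative_filtering(df, "Nestle Milk Chocolate", "ProdName")
# Each word narrows the candidate set; the last non-empty match is returned.
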
def df_filtering_by_word(
    df, word, column_name, consider_starts_with=True, regex=False, close_brackets=False
):
    """Filter df rows whose column matches word; fall back to df on no match."""
    try:
        if close_brackets:
            word = close_open_brackets(word)

        if consider_starts_with:
            filtered_df = df[df[column_name].str.startswith(word)]
            if filtered_df.shape[0] == 0:
                filtered_df = df[df[column_name].str.contains(word)]
        else:
            if regex:
                filtered_df = df[
                    df[column_name].str.contains(rf"\b({word})\b", case=False)
                ]
            else:
                filtered_df = df[df[column_name].str.contains(word)]
        if filtered_df.shape[0] == 0:
            filtered_df = df

        return filtered_df
    except Exception:
        # Retry with a cleaned word (e.g. when the raw word breaks the regex).
        # The old call dropped column_name, shifting every argument by one.
        return df_filtering_by_word(df, clean(word), column_name, consider_starts_with, regex)

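# Usage sketch (hedged: df and the "ProdName" column are hypothetical):
# matches = df_filtering_by_word(
#     df, "chocolate", "ProdName", consider_starts_with=False, regex=True
# )
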
def remove_files(file_name):
    if os.path.exists(f"app/constants/{file_name}"):
        os.remove(f"app/constants/{file_name}")

def get_top_mrf_product(
    mrf_product_attributes_list,
    dp_product_attributes,
    sequence_scores,
    default_attr_key_list,
):
    """Pick the MRF product whose attributes best match the DP product."""
    scores = []
    for idx, each_mrf_prod_attr in enumerate(mrf_product_attributes_list):
        # Start from the sequence score, then reward attribute matches.
        score = sequence_scores[idx]
        for key in default_attr_key_list:
            if key in dp_product_attributes and key in each_mrf_prod_attr:
                if pd.notna(dp_product_attributes[key]) and pd.notna(
                    each_mrf_prod_attr[key]
                ):
                    if (
                        str(dp_product_attributes[key]).lower()
                        == str(each_mrf_prod_attr[key]).lower()
                    ):
                        score += 5
        scores.append(score)

    max_score = max(scores)
    return scores.index(max_score), max_score

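# Example (hedged: attribute keys and scores are illustrative):
# mrf_attrs = [{"brand": "Acme", "size": "1L"}, {"brand": "Other", "size": "1L"}]
# dp_attrs = {"brand": "acme", "size": "500ml"}
# get_top_mrf_product(mrf_attrs, dp_attrs, [0.2, 0.9], ["brand", "size"])
# -> (0, 5.2): candidate 0 scores 0.2 + 5 for the brand match; candidate 1 scores 0.9.
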
def preprocess(text):
    """Preprocess an input product name: expand '&', strip punctuation,
    and collapse whitespace."""
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

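# Example:
# preprocess("Tom & Jerry's Snacks!")  ->  "tom and jerry s snacks"
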
def label_processing(label):
    """Strip the '__label__' prefix and normalise the label text."""
    label = re.sub('__label__', '', label)
    label = re.sub('_', ' ', label)
    label = re.sub(' +', ' ', label)
    return label.strip().lower()

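# Example:
# label_processing("__label__Home_Care")  ->  "home care"
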
def get_return_labels(label, accuracy, threshold):
    """Return the label and score only when accuracy clears the threshold."""
    if accuracy >= threshold:
        return_label = label
        return_score = accuracy
        label_status = f"Classified - Above threshold {threshold}"
    else:
        return_label = None
        return_score = None
        label_status = f"Unclassified - Below threshold {threshold}"
    return return_label, return_score, label_status

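# Example:
# get_return_labels("home care", 0.92, 0.8)
# -> ("home care", 0.92, "Classified - Above threshold 0.8")
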
def get_label_and_accuracy(model, product_name):
    """Predict a label for product_name and return it with a rounded score."""
    prediction = model.predict(product_name)
    label = label_processing(prediction[0][0])
    accuracy = round(prediction[1][0], 3)
    return label, accuracy

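# Usage sketch (assumption: a fastText-style model whose predict() returns
# (labels, probabilities); the model path here is hypothetical):
# model = fasttext.load_model("model.bin")
# label, accuracy = get_label_and_accuracy(model, "nestle milk chocolate")
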
def remove_new_lines(text):
    """Replace newline characters with spaces.

    Preprocessing can leave newline characters in the middle of product
    names; this normalises them away.
    """
    text = re.sub('\n', ' ', text)
    return text.strip().lower()