MilanBandara committed on
Commit
f1017a3
1 Parent(s): 93479fa

test files added

Browse files
Files changed (2) hide show
  1. Huggin_face_test/fsa.py +304 -0
  2. Huggin_face_test/helpers.py +246 -0
Huggin_face_test/fsa.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing libraries
2
+ from threading import Thread
3
+ from flask import Blueprint, jsonify, request
4
+ from flask_cors import CORS
5
+ import sys
6
+ import os
7
+
8
+
9
+ # Importing process pool executor
10
+ from concurrent.futures import ProcessPoolExecutor
11
+
12
+ # Fasttext for model handling
13
+ import fasttext
14
+
15
+
16
+ # Setting absolute path
17
+ sys.path.insert(0, os.path.abspath("."))
18
+
19
+ from app.config import Config
20
+ from app.helpers import *
21
+ from app.db.models import Tasks
22
+ from app.database import db
23
+ from app.threads.process_fsa_v2 import process_fsa_categories_v2
24
+ # from app.threads.process_fsa_v2 import test_function
25
+
26
+ # Create a Blueprint of classification
27
+ fsa = Blueprint("fsa_v2", __name__, url_prefix="/api/v2/fsa")
28
+
29
+ # Enabling CORS for the blueprint
30
+ CORS(
31
+ fsa,
32
+ supports_credentials=True
33
+ )
34
+
35
+
36
+ # Thread class to run the bacth processing in the thread
37
+ class FSAThread_V2(Thread):
38
+ def __init__(self, data={}) -> None:
39
+ Thread.__init__(self)
40
+ self.data = data
41
+ # Run function of the thread
42
+ def run(self) -> None:
43
+ process_fsa_categories_v2(self.data)
44
+
45
+ # Creating a process pool executor
46
+ # Set maximum processes
47
+ max_processes = 4
48
+ process_executor = ProcessPoolExecutor(max_workers=max_processes)
49
+
50
+ # Update the database
51
+ def update_db(table_idx, remarks=None):
52
+ from app.api import app
53
+
54
+ with app.app_context():
55
+ Tasks.update_by_id(table_idx, remarks)
56
+ db.session.close()
57
+
58
+
59
+ # Prediction for single product
60
+ @fsa.route("/single-product", methods=["POST"])
61
+ def predict_categories():
62
+
63
+ # Get the request
64
+ body = request.json
65
+
66
+ # If there is no body in the request send error message
67
+ if not body:
68
+ return jsonify({"message": "Cannot decode JSON from the body"}), 422
69
+
70
+ # Get the product name from the JSON
71
+ product_name = body.get("product_name")
72
+
73
+ # Check whether product name is missing
74
+ if not product_name:
75
+ return jsonify({"message": "Product name is missing"}), 422
76
+
77
+ # Preprocessing product names for input
78
+ product_name = preprocess(product_name)
79
+
80
+ # Prediction
81
+ # Logging processing
82
+ Logger.info(message="Processing FSA categorical data for " + product_name)
83
+
84
+ # Loading L0 model to model
85
+ try:
86
+ model = fasttext.load_model('app/models/L0/L0_model.bin')
87
+ except:
88
+ return jsonify({"message": "Can't load the L0 model"}), 500
89
+
90
+ #Getting L0 prediction and accuracy
91
+ L0_label,L0_accuracy = get_label_and_accuracy(model,product_name)
92
+ L0_return_label,L0_return_score,L0_label_status = get_return_labels(L0_label,L0_accuracy,0.95)
93
+ print("L0",L0_label,L0_accuracy)
94
+
95
+ if not L0_label:
96
+ return jsonify({"message": "Error predicting L0 Category"}), 500
97
+
98
+ #Loading L1 model to model
99
+ try:
100
+ model = fasttext.load_model('app/models/L1/L1_model.bin')
101
+ except:
102
+ return jsonify({"message": "Can't load the L1 model"}), 500
103
+
104
+ #Getting L1 prediction and accuracy
105
+ L1_label,L1_accuracy = get_label_and_accuracy(model,L0_label +" " + product_name)
106
+ L1_return_label,L1_return_score,L1_label_status = get_return_labels(L1_label,L1_accuracy,0.95)
107
+ print("L1",L1_label,L1_accuracy)
108
+
109
+ if not L1_label:
110
+ return jsonify({"message": "Error predicting L1 Category"}), 500
111
+
112
+ #Loading L2 model to model
113
+ try:
114
+ model = fasttext.load_model('app/models/L2/L2_model.bin')
115
+ except:
116
+ return jsonify({"message": "Can't load the L2 model"}), 500
117
+
118
+ #Getting L2 prediction and accuracy
119
+ L2_label,L2_accuracy = get_label_and_accuracy(model,L1_label+" "+product_name)
120
+ L2_return_label,L2_return_score,L2_label_status = get_return_labels(L2_label,L2_accuracy,0.95)
121
+ print("L2",L2_label,L2_accuracy)
122
+
123
+
124
+ if not L2_label:
125
+ return jsonify({"message": "Error predicting L2 Category"}), 500
126
+
127
+ #Loading L3 model to model
128
+ try:
129
+ model = fasttext.load_model('app/models/L3/L3_model.bin')
130
+ except:
131
+ return jsonify({"message": "Can't load the L3 model"}), 500
132
+ #Getting L3 prediction and accuracy
133
+ L3_label,L3_accuracy = get_label_and_accuracy(model,L2_label+" "+product_name)
134
+ L3_return_label,L3_return_score,L3_label_status = get_return_labels(L3_label,L3_accuracy,0.95)
135
+ print("L3",L3_label,L3_accuracy)
136
+
137
+ if not L3_label:
138
+ return jsonify({"message": "Error predicting L3 Category"}), 500
139
+
140
+ if L0_label == "administrative":
141
+ try:
142
+ model = fasttext.load_model('app/models/L4/administrative/L4_Admin_model.bin')
143
+ except:
144
+ return jsonify({"message": "Can't load the L4 (Administrative) model"}), 500
145
+ #Getting L4 prediction and accuracy
146
+ L4_label,L4_accuracy = get_label_and_accuracy(model,(L3_label+ " " +product_name))
147
+ L4_return_label,L4_return_score,L4_label_status = get_return_labels(L4_label,L4_accuracy,0.75)
148
+ print("L4",L4_label,L4_accuracy)
149
+
150
+ # L0 = Beverage
151
+ elif L0_label == "beverage":
152
+ try:
153
+ model = fasttext.load_model('app/models/L4/beverage/L4_beverage_model.bin')
154
+ except:
155
+ return jsonify({"message": "Can't load the L4 (Beverage) model"}), 500
156
+ #Getting L4 prediction and accuracy
157
+ L4_label,L4_accuracy = get_label_and_accuracy(model,(L3_label+" "+product_name))
158
+ L4_return_score = None
159
+ L4_return_label,L4_return_score,L4_label_status = get_return_labels(L4_label,L4_accuracy,0.66)
160
+ print("L4",L4_label,L4_accuracy)
161
+
162
+ # L0 = Food
163
+ elif L0_label == "food":
164
+ try:
165
+ model = fasttext.load_model('app/models/L4/food/L4_food_model.bin')
166
+ except:
167
+ return jsonify({"message": "Can't load the L4 (Food) model"}), 500
168
+ #Getting L4 prediction and accuracy
169
+ L4_label,L4_accuracy = get_label_and_accuracy(model,(L3_label+" "+product_name))
170
+ L4_return_label,L4_return_score,L4_label_status = get_return_labels(L4_label,L4_accuracy,0.85)
171
+ print("L4",L4_label,L4_accuracy)
172
+
173
+ # L0 = Operationals
174
+ elif L0_label == "operationals":
175
+ try:
176
+ model = fasttext.load_model('app/models/L4/operationals/L4_operationals_model.bin')
177
+ except:
178
+ return jsonify({"message": "Can't load the L4 (Operationals) model"}), 500
179
+ #Getting L4 prediction and accuracy
180
+ L4_label,L4_accuracy = get_label_and_accuracy(model,(L3_label+" "+product_name))
181
+ L4_return_label,L4_return_score,L4_label_status = get_return_labels(L4_label,L4_accuracy,0.8)
182
+ print("L4",L4_label,L4_accuracy)
183
+
184
+ # Error prediction on L4 Category (Can't happen)
185
+ else:
186
+ return jsonify({"message": "Error prediction of L4 Category"}), 422
187
+
188
+ if not L4_label:
189
+ return jsonify({"message": "Error predicting L4 Category"}), 422
190
+
191
+ # Logging the task
192
+ Logger.info(message="Done processing FSA categorical data for" + product_name)
193
+
194
+ # Rreturning the result as JSON
195
+
196
+ return jsonify({
197
+ "classification_results": {
198
+ "l0": L0_return_label,
199
+ "l1": L1_return_label,
200
+ "l2": L2_return_label,
201
+ "l3": L3_return_label,
202
+ "l4": L4_return_label
203
+ },
204
+ "scores": {
205
+ "l0": L0_return_score,
206
+ "l1": L1_return_score,
207
+ "l2": L2_return_score,
208
+ "l3": L3_return_score,
209
+ "l4": L4_return_score
210
+ },
211
+ "remarks":{
212
+ "l0": L0_label_status,
213
+ "l1": L1_label_status,
214
+ "l2": L2_label_status,
215
+ "l3": L3_label_status,
216
+ "l4": L4_label_status
217
+ },
218
+ "all_classification_results": {
219
+ "L0": L0_label,
220
+ "L1": L1_label,
221
+ "L2": L2_label,
222
+ "L3": L3_label,
223
+ "L4": L4_label
224
+ },
225
+ "all_scores": {
226
+ "L0": L0_accuracy,
227
+ "L1": L1_accuracy,
228
+ "L2": L2_accuracy,
229
+ "L3": L3_accuracy,
230
+ "L4": L4_accuracy
231
+ }
232
+
233
+
234
+ }), 200
235
+
236
+
237
+
238
+
239
+
240
+ # Batch processing
241
+ @fsa.route("/process-csv", methods=["POST"])
242
+ def process_csv():
243
+
244
+ # Get the body of the json
245
+ body = request.json
246
+
247
+ # Error passing for missing body
248
+ if not body:
249
+ return jsonify({"message": "Cannot decode JSON from the body"}), 422
250
+
251
+ # It is assumed that uploaded file name in the file_name JSON field
252
+ file_name = body.get("uploaded_file_name")
253
+
254
+ # Original file name
255
+ original_file_name = body.get("original_file_name") or file_name
256
+
257
+ # Missing file name
258
+ if not file_name:
259
+ return jsonify({"message": "File name is missing"}), 422
260
+
261
+ files = [{"name": f"fsa_input_{file_name}", "path": f"FSA Categorization/input/{file_name}"}]
262
+
263
+ # Download files from S3 bucket of AWS
264
+ # File is downloaded to th 'app/constants/{file}'
265
+ for file in files:
266
+ download_status = download_file_from_s3(
267
+ file_name=file["name"], file_path=file["path"]
268
+ )
269
+ if isinstance(download_status, botocore.exceptions.ClientError):
270
+ return (
271
+ jsonify({"message": f"Error downloading {file} from s3"}),
272
+ 422,
273
+ )
274
+
275
+
276
+ # Get the dataframe of the csv to check whether "ProdName" column is available
277
+ df = read_files(file_name=file_name)
278
+
279
+ # Check for product_names in columns
280
+ if "product_name" not in df.columns:
281
+ remove_files(f"fsa_input_{file_name}")
282
+ return jsonify({"message": "Product name column is missing from the CSV"}), 422
283
+
284
+
285
+ # Create a task
286
+ created_task = Tasks.create(file_name=file_name, original_file_name=original_file_name)
287
+
288
+ # Create a json object of data to pass the process
289
+ data = {
290
+ "file_name": file_name,
291
+ "table_idx": created_task.id,
292
+ "update_db": update_db
293
+ }
294
+
295
+ db.session.close()
296
+ # Add the process to process pool executor
297
+ result_future = process_executor.submit(process_fsa_categories_v2, (data))
298
+
299
+ # Creating a thread with data
300
+ # thread = FSAThread_V2(data=data)
301
+ # thread.start()
302
+
303
+ # Testing route
304
+ return jsonify({"message": f"{file_name} - File processing starting"}), 200
Huggin_face_test/helpers.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import boto3
4
+ import botocore
5
+ import re
6
+ import pandas as pd
7
+ from nltk.corpus import stopwords
8
+ import warnings
9
+
10
+ warnings.filterwarnings("ignore")
11
+
12
+ from app.logger import Logger
13
+
14
+ sys.path.insert(0, os.path.abspath("."))
15
+
16
+
17
+ def read_files(
18
+ file_name, sort_by=None, drop_duplicates=None, drop_na=None, encoding=None
19
+ ):
20
+ df = pd.read_csv(
21
+ os.path.join("app/constants", file_name), low_memory=False, encoding=encoding
22
+ )
23
+ if sort_by:
24
+ df = df.sort_values(by=[sort_by])
25
+ if drop_duplicates:
26
+ print("Removing duplicates in ProdName..")
27
+ print("df rows before removing duplicates = " + str(df.shape[0]))
28
+ df.drop_duplicates(subset=drop_duplicates, keep="first", inplace=True)
29
+ print("df rows after removing duplicates = " + str(df.shape[0]))
30
+ if drop_na:
31
+ print("Removing rows with null values..")
32
+ print("df rows before removing nan values = " + str(df.shape[0]))
33
+ df = df.dropna(subset=drop_na)
34
+ print("df rows after removing nan values = " + str(df.shape[0]))
35
+ df = df.reset_index(drop=True)
36
+ return df
37
+
38
+
39
+ def check_file_already_downloaded(file_name):
40
+ files = os.listdir("app/constants")
41
+ if file_name in files:
42
+ return True
43
+ return False
44
+
45
+
46
+ def download_file_from_s3(
47
+ file_name, bucket_name="sku-matching-ai-ml", skip_check=False, file_path=None
48
+ ):
49
+ if check_file_already_downloaded(file_name) and not skip_check:
50
+ return file_name
51
+ else:
52
+ print("STARTING DOWNLOADING: ", file_name)
53
+ if not file_path:
54
+ file_path = file_name
55
+ s3 = boto3.client("s3")
56
+ try:
57
+ s3.download_file(
58
+ Bucket=bucket_name, Key=file_path, Filename=f"app/constants/{file_name}"
59
+ )
60
+ print("DOWNLOADING FINISHED")
61
+ return file_name
62
+ # pylint: disable=invalid-name
63
+ except botocore.exceptions.ClientError as e:
64
+ Logger().exception(
65
+ message=f"Unable to download file: {file_name}",
66
+ )
67
+ return e
68
+
69
+
70
+ def upload_files_to_s3(file_path, upload_path, bucket_name="sku-matching-ai-ml"):
71
+ print("STARTING UPLOADING")
72
+ s3 = boto3.client("s3")
73
+ try:
74
+ s3.upload_file(file_path, bucket_name, upload_path)
75
+ except botocore.exceptions.ClientError as e:
76
+ Logger().exception(
77
+ message=f"Unable to uplaod file",
78
+ )
79
+ return e
80
+
81
+
82
+ def clean(string):
83
+ raw_text = re.sub("[^a-zA-Z]+", " ", string)
84
+ words = raw_text.lower().split()
85
+ stops = set(stopwords.words("english"))
86
+ meaningful_words = [
87
+ word for word in words if ((not word in stops) and (len(word) >= 3))
88
+ ]
89
+ string = " ".join(meaningful_words)
90
+ return string
91
+
92
+
93
+ def close_open_brackets(input_str):
94
+ opening_brackets = ["(", "[", "{"]
95
+ closing_brackets = [")", "]", "}"]
96
+ stack = []
97
+
98
+ for char in input_str:
99
+ if char in opening_brackets:
100
+ stack.append(char)
101
+ elif char in closing_brackets:
102
+ if len(stack) > 0:
103
+ opening_bracket = stack.pop()
104
+ if opening_brackets.index(opening_bracket) != closing_brackets.index(
105
+ char
106
+ ):
107
+ stack.append(opening_bracket)
108
+ stack.append(char)
109
+ else:
110
+ input_str = input_str.replace(char, "")
111
+
112
+ while len(stack) > 0:
113
+ opening_bracket = stack.pop()
114
+ closing_bracket = closing_brackets[opening_brackets.index(opening_bracket)]
115
+ input_str += closing_bracket
116
+
117
+ return input_str
118
+
119
+
120
+ def iterative_filtering(
121
+ df,
122
+ product,
123
+ column_name,
124
+ skip_clean=False,
125
+ consider_starts_with=True,
126
+ regex=False,
127
+ close_brackets=False,
128
+ ):
129
+ if not skip_clean:
130
+ product = clean(product)
131
+ else:
132
+ product = product.lower()
133
+ words = product.split()
134
+ new_df = df
135
+ index = 0
136
+ out_df = new_df
137
+
138
+ while new_df.shape[0] > 0 and index < len(words):
139
+ out_df = new_df
140
+ new_df = df_filtering_by_word(
141
+ new_df,
142
+ words[index],
143
+ column_name,
144
+ consider_starts_with,
145
+ regex,
146
+ close_brackets,
147
+ )
148
+ if new_df.shape[0] > 0:
149
+ out_df = new_df
150
+ new_df[column_name] = new_df[column_name].str.replace(words[index] + " ", "")
151
+ index = index + 1
152
+ out_df = out_df.reset_index(drop=True)
153
+ return out_df
154
+
155
+
156
+ def df_filtering_by_word(
157
+ df, word, column_name, consider_starts_with=True, regex=False, close_brackets=False
158
+ ):
159
+ try:
160
+ if close_brackets:
161
+ word = close_open_brackets(word)
162
+
163
+ if consider_starts_with:
164
+ filtered_df = df[df[column_name].str.startswith(word)]
165
+ if filtered_df.shape[0] == 0:
166
+ filtered_df = df[df[column_name].str.contains(word)]
167
+ else:
168
+ if regex:
169
+ filtered_df = df[
170
+ df[column_name].str.contains(rf"\b({word})\b", case=False)
171
+ ]
172
+ else:
173
+ filtered_df = df[df[column_name].str.contains(word)]
174
+ if filtered_df.shape[0] == 0:
175
+ filtered_df = df
176
+
177
+ return filtered_df
178
+ except Exception as e:
179
+ return df_filtering_by_word(df, clean(word), consider_starts_with, regex)
180
+
181
+
182
+ def remove_files(file_name):
183
+ if os.path.exists(f"app/constants/{file_name}"):
184
+ os.remove(f"app/constants/{file_name}")
185
+
186
+ def get_top_mrf_product(mrf_product_attributes_list, dp_product_attributes, sequence_scores, default_attr_key_list):
187
+ scores = []
188
+ for id, each_mrf_prod_attr in enumerate(mrf_product_attributes_list):
189
+ score = sequence_scores[id]
190
+ for key in default_attr_key_list:
191
+ if key in dp_product_attributes and key in each_mrf_prod_attr:
192
+ if pd.notna(dp_product_attributes[key]) and pd.notna(each_mrf_prod_attr[key]):
193
+ if str(dp_product_attributes[key]).lower() == str(each_mrf_prod_attr[key]).lower():
194
+ score += 5
195
+ scores.append(score)
196
+
197
+ max_index = scores.index(max(scores))
198
+ return max_index, max(scores)
199
+
200
+ # Helper files required for FSA V2
201
+ # Preprocessing Function
202
+ '''
203
+ This Function is using for preprocessing the input product names
204
+ '''
205
+ def preprocess(text):
206
+ text = re.sub(r'&', 'and', text)
207
+ text = re.sub(r'[^\w\s]',' ', text)
208
+ text = re.sub(' +', ' ', text)
209
+ return text.strip().lower()
210
+
211
+ # Function to preprocess labels from the previous prediction
212
+ def label_processing(label):
213
+ label = re.sub('__label__', '', label)
214
+ label = re.sub('_', ' ', label)
215
+ label = re.sub(' +', ' ', label)
216
+ return label.strip().lower()
217
+
218
+ def get_return_labels(label,accuracy,threshold):
219
+ if accuracy >= threshold:
220
+ return_label = label
221
+ return_score = accuracy
222
+ label_status = f"Classified - Above threshold {threshold}"
223
+ else:
224
+ return_label = None
225
+ return_score = None
226
+ label_status = f"Unclassfied - Below threshold {threshold}"
227
+ return return_label,return_score,label_status
228
+
229
+ #Function to get the product label and accuracy
230
+ def get_label_and_accuracy(model,product_name):
231
+ prediction = model.predict(product_name)
232
+ label = prediction[0][0]
233
+ label = label_processing(label)
234
+ accuracy = round(prediction[1][0],3)
235
+
236
+ return label,accuracy
237
+
238
+ # Function for remove new line in product name
239
+ '''
240
+ Some products may contain new line characters in middle of product names.
241
+ This may occur because of preprocessing. It can lead to result \n in middle of the
242
+ product names.
243
+ '''
244
+ def remove_new_lines(text):
245
+ text = re.sub('\n', ' ', text)
246
+ return text.strip().lower()