ardifarizky committed on
Commit
bba5e41
1 Parent(s): c98ff59

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile +7 -0
  2. app.py +157 -0
  3. image1.PNG +0 -0
  4. logo.png +0 -0
  5. random_forest_model.pkl +3 -0
  6. requirements.txt +8 -0
  7. tfidf_vectorizer.pkl +3 -0
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Container image for the Streamlit SKU predictor.
FROM python:3.9

# Port the Streamlit server listens on (see ENTRYPOINT below).
EXPOSE 8080

# Install dependencies first so this layer is reused when only app code changes.
# COPY is preferred over ADD for plain local files (no archive/URL handling).
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

WORKDIR /app
COPY . ./

# The application script shipped in this commit is app.py (there is no st.py).
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import streamlit as st
3
+ import pandas as pd
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ import numpy as np
6
+ import joblib
7
+ import base64
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import re
10
+ from PIL import Image
11
+
12
# Load the trained Random Forest model and TF-IDF vectorizer.
# Streamlit re-executes this script on every widget interaction, so the
# (large) pickled artifacts are cached and deserialized only once per server
# process instead of on every rerun.
@st.cache_resource
def _load_model_artifacts():
    """Return ``(classifier, vectorizer)`` loaded from the bundled pickle files."""
    # NOTE: joblib.load unpickles arbitrary objects; only ship trusted files.
    return (
        joblib.load('random_forest_model.pkl'),
        joblib.load('tfidf_vectorizer.pkl'),
    )


rf_classifier, vectorizer = _load_model_artifacts()

# Images shown in the sidebar.
image1 = Image.open('image1.PNG')
logo = Image.open('logo.png')

# CSS that hides Streamlit's default hamburger menu and footer.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
25
+
26
def main():
    """Render the page: title, sidebar, upload widget and, once a file is given, the results."""
    st.title('Batch Product SKU Predictor')

    # Sidebar
    display_sidebar()

    # Main UI sections
    st.subheader('1. File Upload')
    upload = st.file_uploader(
        "Choose a CSV or Excel file. Make sure the number of rows is less than 20,000.",
        type=['csv', 'xlsx'],
    )

    # Nothing uploaded yet: prompt the user and stop here.
    if not upload:
        st.info("Please upload a CSV or Excel file to get started.")
        return

    st.success("File uploaded successfully!")
    st.subheader('2. Processing Data...')
    process_data(upload)
41
+
42
def display_sidebar():
    """Fill the sidebar with the logo, an 'About' blurb and the usage instructions."""
    sidebar = st.sidebar

    sidebar.image(logo, width=250)
    sidebar.header('About')
    sidebar.text('This app predicts product SKUs based\non uploaded data.')

    sidebar.subheader('Instructions:')
    sidebar.text('1. Upload your data file.')
    sidebar.text('2. Make sure your column name is\n"Product Name".')
    # Example screenshot, captioned 'example', shown right after step 2.
    sidebar.image(image1, 'example')
    sidebar.text('3. Wait for processing.')
    sidebar.text('4. View and download the results.')
53
def transform_to_sku(product_name):
    """Turn a free-text product name into an upper-case, hyphen-separated SKU.

    Non-string input (e.g. ``None`` or a number) yields ``"UNKNOWN-SKU"``.
    """
    if not isinstance(product_name, str):
        return "UNKNOWN-SKU"

    # Drop unwanted punctuation characters.
    text = product_name
    for unwanted in ('.', '@', '+'):
        text = text.replace(unwanted, '')

    # Strip parentheses but keep what they enclose.
    text = re.sub(r'\((.*?)\)', r'\1', text)

    # Put a hyphen wherever digits touch letters (both directions).
    for boundary in (r'(\d+)([a-zA-Z])', r'([a-zA-Z])(\d+)'):
        text = re.sub(boundary, r'\1-\2', text)

    # Whitespace runs become single hyphens; result is upper-cased.
    hyphenated = '-'.join(text.split()).upper()

    # Collapse any run of hyphens into one.
    return re.sub(r'-{2,}', '-', hyphenated)
71
+
72
def process_file_upload():
    """Show an upload widget and hand any uploaded file to ``process_data``."""
    candidate = st.file_uploader("Choose a CSV or Excel file", type=['csv', 'xlsx'])

    # No file yet: report that we are waiting and do nothing else.
    if not candidate:
        st.write("Awaiting file upload...")
        return

    st.write("File uploaded successfully. Processing...")
    process_data(candidate)
80
+
81
def process_data(uploaded_file):
    """Run the load -> vectorize -> predict -> display pipeline for one upload.

    A progress bar is advanced after each stage. Any exception raised by a
    stage is caught here and reported in the UI instead of propagating.
    """
    progress_bar = st.progress(0)
    try:
        data = load_data(uploaded_file)
        progress_bar.progress(25)

        product_vectors = preprocess_data(data)
        progress_bar.progress(50)

        data = predict_and_score(data, product_vectors)
        progress_bar.progress(75)

        display_results(data)
        progress_bar.progress(100)
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user.
        # st.error renders a red error box; st.write has no `color` argument.
        st.error(f"⚠️ An error occurred: {e}")
98
+
99
def load_data(uploaded_file):
    """Load the uploaded CSV or Excel file into a DataFrame.

    The format is chosen from the file name: names ending in ``.csv``
    (case-insensitive, so ``DATA.CSV`` also counts) are read as CSV and
    everything else is read as Excel.

    Args:
        uploaded_file: A file-like object exposing a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    if uploaded_file.name.lower().endswith('.csv'):
        return pd.read_csv(uploaded_file)
    return pd.read_excel(uploaded_file)
105
+
106
def preprocess_data(data):
    """Clean the 'Product Name' column and return its TF-IDF vectors.

    Missing product names are replaced with empty strings in ``data`` itself
    (later steps read the column again). Values are converted to ``str`` only
    for vectorization, so numeric cells do not break the text vectorizer.

    Args:
        data: DataFrame expected to contain a 'Product Name' column.

    Returns:
        The result of ``vectorizer.transform`` on the cleaned names.

    Raises:
        KeyError: If the 'Product Name' column is missing.
    """
    if 'Product Name' not in data.columns:
        raise KeyError('Uploaded file must contain a "Product Name" column')

    # Assign back explicitly: chained `fillna(..., inplace=True)` on a column
    # selection is not guaranteed to modify the parent frame in recent pandas.
    names = data['Product Name'].fillna("")
    data['Product Name'] = names
    return vectorizer.transform(names.astype(str))
110
+
111
def predict_and_score(data, product_vectors):
    """Predict a SKU per row and derive a suggestion from its similarity score.

    For every row the cosine similarity between the product-name vector and
    the vector of its predicted SKU is computed. Then:

    * similarity == 0  -> 'Predicted SKU' is replaced with "-";
    * similarity < 0.5 -> 'SKU Suggestion' is a SKU built from the product
      name via ``transform_to_sku``; otherwise it is "-".

    Args:
        data: DataFrame with a 'Product Name' column; modified and returned.
        product_vectors: Vectorized product names, one row per row of ``data``.
            Assumed to be a SciPy sparse matrix as produced by the TF-IDF
            vectorizer -- confirm if the vectorizer is ever swapped.

    Returns:
        ``data`` with 'Predicted SKU' and 'SKU Suggestion' columns set.
    """
    # Local import keeps this block self-contained (scikit-learn is already a
    # dependency of this file).
    from sklearn.preprocessing import normalize

    data['Predicted SKU'] = rf_classifier.predict(product_vectors)
    predicted_sku_vectors = vectorizer.transform(data['Predicted SKU'].astype(str))

    # Only the similarity of row i with its own prediction is needed, so
    # compute it row-wise instead of building the full n x n matrix (which
    # grows quadratically with the number of uploaded rows).
    similarity = np.asarray(
        normalize(product_vectors)
        .multiply(normalize(predicted_sku_vectors))
        .sum(axis=1)
    ).ravel()

    no_overlap = similarity == 0
    needs_new_sku = similarity < 0.5

    # Boolean masks are positional, so this does not depend on the index labels.
    data['Predicted SKU'] = data['Predicted SKU'].where(~no_overlap, "-")

    data['SKU Suggestion'] = [
        transform_to_sku(name) if flagged else '-'
        for name, flagged in zip(data['Product Name'], needs_new_sku)
    ]
    return data
134
+
135
+
136
+
137
def display_results(data):
    """Show a preview of the processed data followed by a CSV download link.

    Args:
        data: The processed DataFrame to preview and export.
    """
    st.subheader('3. Predicted Results')

    # Show a preview of the data with an option to view all.
    # The slider needs min < max and a default inside that range, so small
    # uploads (5 rows or fewer) are simply shown in full without a slider.
    total_rows = len(data)
    min_rows = 5
    if total_rows > min_rows:
        num_rows = st.slider(
            "Select number of rows to view",
            min_rows,
            total_rows,
            min(10, total_rows),
        )
    else:
        num_rows = total_rows
    st.write(data.head(num_rows))

    st.subheader('4. Download Results')
    st.markdown(get_table_download_link(data), unsafe_allow_html=True)
147
+
148
+
149
def get_table_download_link(df):
    """Return an HTML anchor that downloads ``df`` as ``predicted_data.csv``.

    The frame is serialized to CSV without the index, base64-encoded and
    embedded in the link as a data URI.
    """
    csv_bytes = df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    return f'<a href="data:file/csv;base64,{b64}" download="predicted_data.csv">Download CSV File</a>'
155
+
156
+ if __name__ == "__main__":
157
+ main()
image1.PNG ADDED
logo.png ADDED
random_forest_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cb31a2892079651c48d307da5dfed3928f7e458e4ba9cd58c3fb310d9e7209d
3
+ size 115786409
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ scikit-learn==1.3
4
+ numpy
5
+ joblib
6
+ regex
7
+ openpyxl
8
+ Pillow
tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:729ec12ad98cde5efdf61750411ecd8477347dab6329700e44dfe02c21ea5c70
3
+ size 434688