Meera2602 commited on
Commit
0928d05
1 Parent(s): 605c0c5
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dvcignore +3 -0
  2. .gitattributes +1 -0
  3. .github/workflows/sync2_HF_hub.yml +23 -0
  4. Dockerfile +6 -0
  5. README.md +136 -0
  6. app.py +181 -0
  7. artifacts/eda/eda.json +0 -0
  8. artifacts/eda/question_five_violin_plots.html +0 -0
  9. artifacts/eda/question_four_correlation_matrix.html +0 -0
  10. artifacts/eda/question_four_ttest.json +1 -0
  11. artifacts/eda/question_one.html +0 -0
  12. artifacts/eda/question_three_boxplots.html +0 -0
  13. artifacts/eda/question_three_histograms.html +0 -0
  14. artifacts/eda/question_two.html +0 -0
  15. artifacts/model1/model_1.pkl +3 -0
  16. artifacts/model2/model_2.pkl +3 -0
  17. artifacts/preprocessed_csv.csv +0 -0
  18. artifacts/raw.csv +0 -0
  19. artifacts/scaler.pkl +3 -0
  20. frontend/main/main_page.md +124 -0
  21. frontend/reports/classification_performance_report.html +0 -0
  22. frontend/reports/classification_performance_report2.html +0 -0
  23. frontend/reports/data_drift.html +0 -0
  24. init_setup.sh +0 -0
  25. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/MLmodel +20 -0
  26. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/conda.yaml +15 -0
  27. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/MLmodel +20 -0
  28. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/conda.yaml +15 -0
  29. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/python_env.yaml +7 -0
  30. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/requirements.txt +8 -0
  31. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/model.pkl +3 -0
  32. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/python_env.yaml +7 -0
  33. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/requirements.txt +8 -0
  34. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/meta.yaml +15 -0
  35. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Accuracy +1 -0
  36. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/F1 +1 -0
  37. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Precision +1 -0
  38. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Recall +1 -0
  39. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.log-model.history +1 -0
  40. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.runName +1 -0
  41. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.source.name +1 -0
  42. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.source.type +1 -0
  43. mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.user +1 -0
  44. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/MLmodel +20 -0
  45. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/conda.yaml +15 -0
  46. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/MLmodel +20 -0
  47. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/conda.yaml +15 -0
  48. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/python_env.yaml +7 -0
  49. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/requirements.txt +8 -0
  50. mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/model.pkl +3 -0
.dvcignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Add patterns of files dvc should ignore, which could improve
2
+ # the performance. Learn more at
3
+ # https://dvc.org/doc/user-guide/dvcignore
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pkl filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync2_HF_hub.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Mirrors this repository's main branch to the Hugging Face Space
# Meera2602/pred-maintenance-app. Requires a repository secret named HF_TOKEN.
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      # checkout@v4 replaces v3, which runs on the deprecated Node 16 runtime.
      - uses: actions/checkout@v4
        with:
          # Full history is needed so the push to the Space is not rejected as a shallow update.
          fetch-depth: 0
          # The *.pkl artifacts are tracked with Git LFS (see .gitattributes).
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # git remote add huggingface https://Meera2602:$HF_TOKEN@huggingface.co/spaces/Meera2602/pred-maintenance-app
          # git push huggingface main --force
          git push --force https://Meera2602:$HF_TOKEN@huggingface.co/spaces/Meera2602/pred-maintenance-app main
Dockerfile ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
# Container image for the Streamlit predictive-maintenance app.
# NOTE(review): the MLflow metadata committed alongside records python_version 3.11.9
# for training - confirm the pickled models load correctly under Python 3.8.
FROM python:3.8-slim-buster
WORKDIR /service

# Install dependencies before copying the sources so this layer is reused
# whenever only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . ./

# Port the Streamlit server is told to listen on below.
EXPOSE 8501

# A Streamlit script must be launched through the Streamlit CLI;
# running it with plain `python3 app.py` does not start the web server.
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pred Maintenance App
3
+ emoji: 👷‍♀️🛠️
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.34.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Equipment Predictive Maintenance
14
+
15
+ ## Overview
16
+
17
+ ---
18
+
19
+ Predictive maintenance can help companies minimize downtime, reduce repair costs, and improve operational efficiency. Developing a web application for predictive maintenance can provide users with real-time insights into equipment performance, enabling proactive maintenance, and reducing unplanned downtime.
20
+
21
+ The important business questions that are solved by employing data driven approach using machine learning models are:
22
+
23
+ - Predict whether a piece of equipment will fail, based on its process parameters.
24
+
25
+ - Identify the type of equipment failure.
26
+
27
+ This web application is a demonstration of predictive maintenance using a synthetic dataset. The dataset comprises process parameters, ambient air and process temperatures, rotational speed, torque, and tool wear.
28
+
29
+ ---
30
+
31
+ ## Performance metrics
32
+
33
+ ---
34
+
35
+ To evaluate the performance of the ML models used in the project, the following metrics are used:
36
+
37
+ - Precision, recall, and F1 score of the machine learning models.
38
+ - Responsiveness and ease of use of the web application.
39
+
40
+ ---
41
+
42
+ The web application provides the following functionality:
43
+
44
+ - Users can provide the process parameters to the model and receive a prediction of whether the equipment will fail or not, and the type of failure.
45
+ - Users can view and infer the performance metrics of different machine learning models.
46
+ - Users can visualize the data and gain insights into the behavior of the equipment.
47
+ - The application should be built using Streamlit and deployed using Docker and Huggingface spaces.
48
+ - The cost of deployment should be minimal.
49
+
50
+ ---
51
+
52
+ ## Problem Statement
53
+
54
+ ---
55
+
56
+ The problem is to develop a machine learning model that predicts equipment failures based on process parameters.
57
+
58
+ ---
59
+ ## Dataset
60
+ ---
61
+
62
+ The dataset consists of 10,000 data points stored as rows with 14 features in columns. The features include process parameters such as air and process temperatures, rotational speed, torque, and tool wear. The target variable is a binary label indicating whether the equipment failed or not.
63
+
64
+ The dataset consists of 10,000 data points stored as rows with 14 features in columns:
65
+
66
+ UID: unique identifier ranging from 1 to 10000
67
+
68
+ product ID: consisting of a letter L, M, or H for low (50% of all products), medium (30%) and high (20%) as
69
+
70
+ product quality variants and a variant-specific serial number
71
+
72
+ air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
73
+
74
+ process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
75
+
76
+ rotational speed [rpm]: calculated from a power of 2860 W, overlaid with a normally distributed noise
77
+
78
+ torque [Nm]: torque values are normally distributed around 40 Nm
79
+
80
+ tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. The dataset also contains a 'machine failure' label that indicates whether the machine has failed at this particular data point because any of the following failure modes is true.
81
+
82
+
83
+ ---
84
+ ## ML models
85
+ ---
86
+ We will utilize both a binary classification model, and a multi-class classification model to predict equipment failures, and type of equipment failure respectively.
87
+
88
+ ### Machine failure consists of five unique modes
89
+ tool wear failure (TWF): the tool will be replaced or fail at a randomly selected tool wear time between 200 and 240 mins. At this point in time, the tool is replaced 69 times, and fails 51 times (randomly assigned).
90
+
91
+ heat dissipation failure (HDF): heat dissipation causes a process failure, if the difference between air- and process temperature is below 8.6 K and the tool's rotational speed is below 1380 rpm. This is the case for 115 data points.
92
+
93
+ power failure (PWF): the product of torque and rotational speed (in rad/s) equals the power required for the process. If this power is below 3500 W or above 9000 W, the process fails, which is the case 95 times in our dataset.
94
+
95
+ overstrain failure (OSF): if the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 M, 13,000 H), the process fails due to overstrain. This is true for 98 datapoints.
96
+
97
+ random failures (RNF): each process has a chance of 0.1% to fail regardless of its process parameters. This is the case for only 5 data points, fewer than could be expected for 10,000 data points in our dataset.
98
+
99
+ If at least one of the above failure modes is true, the process fails and the 'machine failure' label is set to 1. It is therefore not transparent to the machine learning method, which of the failure modes has caused the process to fail
100
+
101
+ The following machine learning models will be used:
102
+
103
+ - Random Forest
104
+ - Decision Tree
105
+ - Logistic Regression
106
+ - Support Vector Machine (SVM)
107
+ ---
108
+ ## Generic steps
109
+ ---
110
+ - Data preprocessing and cleaning
111
+ - Feature engineering and selection
112
+ - Model selection and training
113
+ - Hyperparameter tuning
114
+ - Model evaluation and testing
115
+ ---
116
+ ## Architecture
117
+ ---
118
+ The web application architecture will consist of the following components:
119
+
120
+ - A frontend web application built using Streamlit
121
+ - A machine learning model for equipment failure prediction
122
+ - Docker containers to run the frontend, backend, and model
123
+ - Cloud infrastructure to host the application
124
+ - CI/CD pipeline using GitHub Actions for automated deployment
125
+
126
+ The frontend will interact with the backend server through API calls to request predictions, model training, and data storage. The backend server will manage user authentication, data storage, and model training. The machine learning model will be trained and deployed using Docker containers. The application will be hosted on Hugging Face Spaces. The CI/CD pipeline will be used to automate the deployment process.
127
+
128
+ ---
129
+
130
+ ## MLOps practices
131
+
132
+ ---
133
+
134
+ This project is designed to create an end-to-end workflow for developing and deploying a web application that performs data preprocessing, model training, model evaluation, and prediction. The pipeline leverages Docker containers for encapsulating code, artifacts, and both the frontend and backend components of the application. The application is deployed on a huggingface space to provide a cloud hosting solution.
135
+
136
+ ---
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit front end for the Predictive Maintenance project.

A sidebar radio button selects one of five pages:

* Main                 - renders frontend/main/main_page.md
* EDA                  - static exploratory-analysis questions, images and findings
* Monitoring Reports   - embeds pre-generated HTML reports
* Performance Measures - shows the summary image for each model
* Prediction           - collects process parameters and calls ``prediction``
"""
import streamlit as st
from streamlit_extras import add_vertical_space
import streamlit.components.v1 as components
from annotated_text import annotated_text


# Kept ahead of the remaining imports, as in the original layout, so that it is
# the first Streamlit command executed by the script.
st.set_page_config(layout='wide')

import pandas as pd

import json
import plotly.express as px
import plotly.graph_objs as go

from src.Predictive_Maintenance.pipelines.prediction_pipeline import prediction


def _show_html_report(path, height):
    """Read the HTML file at ``path`` and embed it as a scrollable component."""
    with open(path, "r", encoding="utf-8") as f:
        html_report = f.read()
    components.html(html_report, height=height, scrolling=True)


with st.sidebar:
    st.title("Predictive Maintenance Project")

    choice = st.radio("Choose from the below options:", ["Main","EDA","Monitoring Reports","Performance Measures","Prediction"])


if choice == "Main":
    with open("frontend/main/main_page.md", "r") as file:
        readme_contents = file.read()
    st.markdown(readme_contents)

elif choice == "EDA":
    st.title('Exploratory Data Analysis')

    st.header('Question 1')
    st.write("What is the frequency distribution of the target label 'machine failure' in the dataset? How many instances indicate a failure compared to those that do not??")
    st.image("reports/q1.png")
    st.write("**The success rate of the machine is 96.52% and the highest type of failure is HDF(Heat Dissipation Failure) with 1.15% failure rate**")

    st.header('Question 2')
    st.write("How is the 'productID' variable distributed across the dataset? Specifically, how many instances correspond to low, medium, and high-quality variants?")
    st.image("reports/q2.png")
    st.write("**Low quality varient makes up majority of the dataset with 60% of the data, followed by medium quality varient with 30% and high quality varient with 10%**")

    st.header('Question 3')
    st.write("What are the minimum, maximum, and typical values for 'air temperature', 'process temperature', 'rotational speed', 'torque', and 'tool wear'?")
    st.write("Are there any significant outliers in these variables?")
    st.image("reports/q3.png")
    st.write("**Rotational speed may or may not be actual outliers, therefore we'll keep them in the dataset for now.**")

    st.header('Question 4')
    st.write("Is there any correlation between the continuous variables and the 'machine failure' label? For example, does the tool wear increase the likelihood of machine failure?")
    st.image("reports/q4.png")
    st.write("**Null Hypothesis: There is no significant relationship between the different colums and Machine Failure**")
    st.write("**Alternate Hypothesis: There is a significant relationship between the tool wear and the machine failure label**")
    st.image("reports/h0.png")

    st.header('Question 5')
    st.write("Is there any correlation between the categorical variable 'productID' and the continuous variable? For example, is the 'rotational speed' higher for high-quality products than for low-quality products? ")
    st.image("reports/q5.png")
    st.write("**Process Temperature seems to have an effect on high quality varient machines. Therefore we can say that Process Temperature is correlated with machine type.**")

    # TODO(review): Questions 6-9 reuse the Question 5 image and conclusion text.
    # They look like placeholders; replace them once the matching plots and
    # findings are available.
    st.header('Question 6')
    st.write("Are there any significant interactions or non-linear relationships between the variables that could be important for predictive maintenance? For example, does torque increase non-linearly with rotational speed?")
    st.image("reports/q5.png")
    st.write("**Process Temperature seems to have an effect on high quality varient machines. Therefore we can say that Process Temperature is correlated with machine type.**")

    st.header('Question 7')
    st.write("Are there any discernible patterns or anomalies in the timing of machine failures?How do machine failure rates change over time? ")
    st.image("reports/q5.png")
    st.write("**Process Temperature seems to have an effect on high quality varient machines. Therefore we can say that Process Temperature is correlated with machine type.**")

    st.header('Question 8')
    st.write("Do the distributions of continuous variables differ significantly across various product types?")
    st.image("reports/q5.png")
    # st.write("**Process Temperature seems to have an effect on high quality varient machines. Therefore we can say that Process Temperature is correlated with machine type.**")

    st.header('Question 9')
    st.write("How does machine failure frequency vary with different operating conditions, such as changes in air temperature and rotational speed?")
    st.image("reports/q5.png")
    st.write("**Process Temperature seems to have an effect on high quality varient machines. Therefore we can say that Process Temperature is correlated with machine type.**")

elif choice == "Performance Measures":
    st.title("Model 1")
    annotated_text(("Best Model 1", "Random Forest Classifier"))
    st.image("reports/model1.png")

    st.title("Model 2")
    # Fixed copy-paste label: this annotation belongs to the second model.
    annotated_text(("Best Model 2", "Random Forest Classifier"))
    st.image("reports/model2.png")

elif choice == "Monitoring Reports":
    options = st.selectbox('Choose the reports: ',('Data Report', 'Model 1 report', 'Model 2 report'))

    # NOTE(review): this commit adds these reports under frontend/reports/;
    # confirm that a top-level reports/ directory with the same files exists.
    if options == 'Data Report':
        _show_html_report("reports/data_drift.html", 700)
    elif options == 'Model 1 report':
        _show_html_report("reports/classification_performance_report.html", 750)
    elif options == 'Model 2 report':
        _show_html_report("reports/classification_performance_report2.html", 750)

elif choice == "Prediction":
    st.title('Predictive Maintenance')
    st.write("**Please enter the following parameters**")

    # Named machine_type rather than `type` to avoid shadowing the builtin.
    machine_type = st.selectbox(
        'Type',('Low', 'Medium', 'High'))

    st.write('You selected:', machine_type)

    rpm = st.number_input('RPM', value=1500.0)
    st.write('The current rpm is ', rpm)

    # Float default so fractional torque values can be entered, like the other inputs.
    torque = st.number_input('Torque', value=75.0)
    st.write('The current torque is ', torque)

    tool_wear = st.number_input('Tool Wear', value=25.00)
    st.write('The current tool wear is ', tool_wear)

    # NOTE(review): the temperature defaults look like degrees Celsius while the
    # project documentation describes the dataset temperatures in Kelvin;
    # confirm which unit prediction() expects.
    air_temp = st.number_input('Air Temperature', value=35.4)
    st.write('The current air temperature is ', air_temp)

    process_temp = st.number_input('Process Temperature', value=46.65)
    st.write('The current process temperature is ', process_temp)

    if st.button("Predict"):
        # Positional argument order: type, rpm, torque, tool_wear, air_temp, process_temp
        result1, result2 = prediction(machine_type, rpm, torque, tool_wear, air_temp, process_temp)

        st.write("Machine Failure?: ", result1)
        st.write("Type of Failure: ", result2)
artifacts/eda/eda.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_five_violin_plots.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_four_correlation_matrix.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_four_ttest.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"columns":["test-statistic","p-value","Hypothesis"],"index":["Air temperature [K]","Process temperature [K]","Rotational speed [rpm]","Torque [Nm]","Tool wear [min]"],"data":[[8.2830178518,1.354800148e-16,"Reject null hypothesis"],[3.5965621887,0.0003240058,"Reject null hypothesis"],[-4.4226338712,0.0000098535,"Reject null hypothesis"],[19.4901955716,4.573804886e-83,"Reject null hypothesis"],[10.6028805888,3.976075963e-26,"Reject null hypothesis"]]}
artifacts/eda/question_one.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_three_boxplots.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_three_histograms.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/eda/question_two.html ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/model1/model_1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43d36b18fbc6eb5df7209a1c019826d26928cd0d3aa1ff51377585d55ebf19fa
3
+ size 8636293
artifacts/model2/model_2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98716f2069bd150492222c8163fd7169c48c6eed31326020503908d4337484db
3
+ size 22037299
artifacts/preprocessed_csv.csv ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/raw.csv ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19992055b94cb982f06621b5f356ebd0dc8be8699e29f9f80265a700c93b0d58
3
+ size 873
frontend/main/main_page.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Equipment Predictive Maintenance
2
+
3
+ ## Overview
4
+
5
+ ---
6
+
7
+ Predictive maintenance can help companies minimize downtime, reduce repair costs, and improve operational efficiency. Developing a web application for predictive maintenance can provide users with real-time insights into equipment performance, enabling proactive maintenance, and reducing unplanned downtime.
8
+
9
+ The important business questions that are solved by employing data driven approach using machine learning models are:
10
+
11
+ - Predict whether a piece of equipment will fail, based on its process parameters.
12
+
13
+ - Identify the type of equipment failure.
14
+
15
+ This web application is a demonstration of predictive maintenance using a synthetic dataset. The dataset comprises process parameters, ambient air and process temperatures, rotational speed, torque, and tool wear.
16
+
17
+ ---
18
+
19
+ ## Performance metrics
20
+
21
+ ---
22
+
23
+ To evaluate the performance of the ML models used in the project, the following metrics are used:
24
+
25
+ - Precision, recall, and F1 score of the machine learning models.
26
+ - Responsiveness and ease of use of the web application.
27
+
28
+ ---
29
+
30
+ The web application provides the following functionality:
31
+
32
+ - Users can provide the process parameters to the model and receive a prediction of whether the equipment will fail or not, and the type of failure.
33
+ - Users can view and infer the performance metrics of different machine learning models.
34
+ - Users can visualize the data and gain insights into the behavior of the equipment.
35
+ - The application should be built using Streamlit and deployed using Docker and Huggingface spaces.
36
+ - The cost of deployment should be minimal.
37
+
38
+ ---
39
+
40
+ ## Problem Statement
41
+
42
+ ---
43
+
44
+ The problem is to develop a machine learning model that predicts equipment failures based on process parameters.
45
+
46
+ ---
47
+ ## Dataset
48
+ ---
49
+
50
+ The dataset consists of 10,000 data points stored as rows with 14 features in columns. The features include process parameters such as air and process temperatures, rotational speed, torque, and tool wear. The target variable is a binary label indicating whether the equipment failed or not.
51
+
52
+ The dataset consists of 10,000 data points stored as rows with 14 features in columns:
53
+
54
+ UID: unique identifier ranging from 1 to 10000
55
+
56
+ product ID: consisting of a letter L, M, or H for low (50% of all products), medium (30%) and high (20%) as
57
+
58
+ product quality variants and a variant-specific serial number
59
+
60
+ air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
61
+
62
+ process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
63
+
64
+ rotational speed [rpm]: calculated from a power of 2860 W, overlaid with a normally distributed noise
65
+
66
+ torque [Nm]: torque values are normally distributed around 40 Nm
67
+
68
+ tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. The dataset also contains a 'machine failure' label that indicates whether the machine has failed at this particular data point because any of the following failure modes is true.
69
+
70
+
71
+ ---
72
+ ## ML models
73
+ ---
74
+ We will utilize both a binary classification model, and a multi-class classification model to predict equipment failures, and type of equipment failure respectively.
75
+
76
+ ### Machine failure consists of five unique modes
77
+ tool wear failure (TWF): the tool will be replaced or fail at a randomly selected tool wear time between 200 and 240 mins. At this point in time, the tool is replaced 69 times, and fails 51 times (randomly assigned).
78
+
79
+ heat dissipation failure (HDF): heat dissipation causes a process failure, if the difference between air- and process temperature is below 8.6 K and the tool's rotational speed is below 1380 rpm. This is the case for 115 data points.
80
+
81
+ power failure (PWF): the product of torque and rotational speed (in rad/s) equals the power required for the process. If this power is below 3500 W or above 9000 W, the process fails, which is the case 95 times in our dataset.
82
+
83
+ overstrain failure (OSF): if the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 M, 13,000 H), the process fails due to overstrain. This is true for 98 datapoints.
84
+
85
+ random failures (RNF): each process has a chance of 0.1% to fail regardless of its process parameters. This is the case for only 5 data points, fewer than could be expected for 10,000 data points in our dataset.
86
+
87
+ If at least one of the above failure modes is true, the process fails and the 'machine failure' label is set to 1. It is therefore not transparent to the machine learning method, which of the failure modes has caused the process to fail
88
+
89
+ The following machine learning models will be used:
90
+
91
+ - Random Forest
92
+ - Decision Tree
93
+ - Logistic Regression
94
+ - Support Vector Machine (SVM)
95
+ ---
96
+ ## Generic steps
97
+ ---
98
+ - Data preprocessing and cleaning
99
+ - Feature engineering and selection
100
+ - Model selection and training
101
+ - Hyperparameter tuning
102
+ - Model evaluation and testing
103
+ ---
104
+ ## Architecture
105
+ ---
106
+ The web application architecture will consist of the following components:
107
+
108
+ - A frontend web application built using Streamlit
109
+ - A machine learning model for equipment failure prediction
110
+ - Docker containers to run the frontend, backend, and model
111
+ - Cloud infrastructure to host the application
112
+ - CI/CD pipeline using GitHub Actions for automated deployment
113
+
114
+ The frontend will interact with the backend server through API calls to request predictions, model training, and data storage. The backend server will manage user authentication, data storage, and model training. The machine learning model will be trained and deployed using Docker containers. The application will be hosted on Hugging Face Spaces. The CI/CD pipeline will be used to automate the deployment process.
115
+
116
+ ---
117
+
118
+ ## MLOps practices
119
+
120
+ ---
121
+
122
+ This project is designed to create an end-to-end workflow for developing and deploying a web application that performs data preprocessing, model training, model evaluation, and prediction. The pipeline leverages Docker containers for encapsulating code, artifacts, and both the frontend and backend components of the application. The application is deployed on a huggingface space to provide a cloud hosting solution.
123
+
124
+ ---
frontend/reports/classification_performance_report.html ADDED
The diff for this file is too large to render. See raw diff
 
frontend/reports/classification_performance_report2.html ADDED
The diff for this file is too large to render. See raw diff
 
frontend/reports/data_drift.html ADDED
The diff for this file is too large to render. See raw diff
 
init_setup.sh ADDED
File without changes
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/MLmodel ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_path: Logistic Regression
2
+ flavors:
3
+ python_function:
4
+ env:
5
+ conda: conda.yaml
6
+ virtualenv: python_env.yaml
7
+ loader_module: mlflow.sklearn
8
+ model_path: model.pkl
9
+ predict_fn: predict
10
+ python_version: 3.11.9
11
+ sklearn:
12
+ code: null
13
+ pickled_model: model.pkl
14
+ serialization_format: cloudpickle
15
+ sklearn_version: 1.3.0
16
+ mlflow_version: 2.12.2
17
+ model_size_bytes: 1212
18
+ model_uuid: dcd249d24f0842a3ba42ccc1171c80a0
19
+ run_id: 0f148d87fa9e434e8be67038ff601c1e
20
+ utc_time_created: '2024-05-18 18:28:17.570941'
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/conda.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ channels:
2
+ - conda-forge
3
+ dependencies:
4
+ - python=3.11.9
5
+ - pip<=24.0
6
+ - pip:
7
+ - mlflow==2.12.2
8
+ - cloudpickle==3.0.0
9
+ - numpy==1.26.4
10
+ - packaging==23.2
11
+ - psutil==5.9.8
12
+ - pyyaml==6.0.1
13
+ - scikit-learn==1.3.0
14
+ - scipy==1.13.0
15
+ name: mlflow-env
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/MLmodel ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_path: Logistic Regression
2
+ flavors:
3
+ python_function:
4
+ env:
5
+ conda: conda.yaml
6
+ virtualenv: python_env.yaml
7
+ loader_module: mlflow.sklearn
8
+ model_path: model.pkl
9
+ predict_fn: predict
10
+ python_version: 3.11.9
11
+ sklearn:
12
+ code: null
13
+ pickled_model: model.pkl
14
+ serialization_format: cloudpickle
15
+ sklearn_version: 1.3.0
16
+ mlflow_version: 2.12.2
17
+ model_size_bytes: 1212
18
+ model_uuid: dcd249d24f0842a3ba42ccc1171c80a0
19
+ run_id: 0f148d87fa9e434e8be67038ff601c1e
20
+ utc_time_created: '2024-05-18 18:28:17.570941'
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/conda.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ channels:
2
+ - conda-forge
3
+ dependencies:
4
+ - python=3.11.9
5
+ - pip<=24.0
6
+ - pip:
7
+ - mlflow==2.12.2
8
+ - cloudpickle==3.0.0
9
+ - numpy==1.26.4
10
+ - packaging==23.2
11
+ - psutil==5.9.8
12
+ - pyyaml==6.0.1
13
+ - scikit-learn==1.3.0
14
+ - scipy==1.13.0
15
+ name: mlflow-env
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/python_env.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ python: 3.11.9
2
+ build_dependencies:
3
+ - pip==24.0
4
+ - setuptools
5
+ - wheel
6
+ dependencies:
7
+ - -r requirements.txt
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/metadata/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ mlflow==2.12.2
2
+ cloudpickle==3.0.0
3
+ numpy==1.26.4
4
+ packaging==23.2
5
+ psutil==5.9.8
6
+ pyyaml==6.0.1
7
+ scikit-learn==1.3.0
8
+ scipy==1.13.0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8412a6ce67b61625ad4c227666960c18954e5fa987b55da62786269711514939
3
+ size 1212
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/python_env.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ python: 3.11.9
2
+ build_dependencies:
3
+ - pip==24.0
4
+ - setuptools
5
+ - wheel
6
+ dependencies:
7
+ - -r requirements.txt
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts/Logistic Regression/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ mlflow==2.12.2
2
+ cloudpickle==3.0.0
3
+ numpy==1.26.4
4
+ packaging==23.2
5
+ psutil==5.9.8
6
+ pyyaml==6.0.1
7
+ scikit-learn==1.3.0
8
+ scipy==1.13.0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/meta.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_uri: file:///C:/Meera/Jio//15MLOps/project/Predmain/mlruns/0/0f148d87fa9e434e8be67038ff601c1e/artifacts
2
+ end_time: 1716056900937
3
+ entry_point_name: ''
4
+ experiment_id: '0'
5
+ lifecycle_stage: active
6
+ run_id: 0f148d87fa9e434e8be67038ff601c1e
7
+ run_name: Logistic Regression1
8
+ run_uuid: 0f148d87fa9e434e8be67038ff601c1e
9
+ source_name: ''
10
+ source_type: 4
11
+ source_version: ''
12
+ start_time: 1716056897546
13
+ status: 3
14
+ tags: []
15
+ user_id: kmeer
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Accuracy ADDED
@@ -0,0 +1 @@
 
 
1
+ 1716056900921 82.65561598894932 0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/F1 ADDED
@@ -0,0 +1 @@
 
 
1
+ 1716056900937 82.10479678150818 0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Precision ADDED
@@ -0,0 +1 @@
 
 
1
+ 1716056900928 81.89191093068376 0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/metrics/Recall ADDED
@@ -0,0 +1 @@
 
 
1
+ 1716056900928 82.7080868592029 0
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.log-model.history ADDED
@@ -0,0 +1 @@
 
 
1
+ [{"run_id": "0f148d87fa9e434e8be67038ff601c1e", "artifact_path": "Logistic Regression", "utc_time_created": "2024-05-18 18:28:17.570941", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.11.9", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.3.0", "serialization_format": "cloudpickle", "code": null}}, "model_uuid": "dcd249d24f0842a3ba42ccc1171c80a0", "mlflow_version": "2.12.2", "model_size_bytes": 1212}]
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.runName ADDED
@@ -0,0 +1 @@
 
 
1
+ Logistic Regression1
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.source.name ADDED
@@ -0,0 +1 @@
 
 
1
+ C:\Meera\Jio\15MLOps\project\Predmain\src\Predictive_Maintenance\pipelines\training_pipeline.py
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.source.type ADDED
@@ -0,0 +1 @@
 
 
1
+ LOCAL
mlruns/0/0f148d87fa9e434e8be67038ff601c1e/tags/mlflow.user ADDED
@@ -0,0 +1 @@
 
 
1
+ kmeer
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/MLmodel ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_path: Random Forest
2
+ flavors:
3
+ python_function:
4
+ env:
5
+ conda: conda.yaml
6
+ virtualenv: python_env.yaml
7
+ loader_module: mlflow.sklearn
8
+ model_path: model.pkl
9
+ predict_fn: predict
10
+ python_version: 3.11.9
11
+ sklearn:
12
+ code: null
13
+ pickled_model: model.pkl
14
+ serialization_format: cloudpickle
15
+ sklearn_version: 1.3.0
16
+ mlflow_version: 2.12.2
17
+ model_size_bytes: 22037299
18
+ model_uuid: 01494b63f3fe44098fc10c9b46ac379a
19
+ run_id: 25cb228506ae475fa3e381cac63bc8a6
20
+ utc_time_created: '2024-05-18 18:29:37.003417'
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/conda.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ channels:
2
+ - conda-forge
3
+ dependencies:
4
+ - python=3.11.9
5
+ - pip<=24.0
6
+ - pip:
7
+ - mlflow==2.12.2
8
+ - cloudpickle==3.0.0
9
+ - numpy==1.26.4
10
+ - packaging==23.2
11
+ - psutil==5.9.8
12
+ - pyyaml==6.0.1
13
+ - scikit-learn==1.3.0
14
+ - scipy==1.13.0
15
+ name: mlflow-env
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/MLmodel ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifact_path: Random Forest
2
+ flavors:
3
+ python_function:
4
+ env:
5
+ conda: conda.yaml
6
+ virtualenv: python_env.yaml
7
+ loader_module: mlflow.sklearn
8
+ model_path: model.pkl
9
+ predict_fn: predict
10
+ python_version: 3.11.9
11
+ sklearn:
12
+ code: null
13
+ pickled_model: model.pkl
14
+ serialization_format: cloudpickle
15
+ sklearn_version: 1.3.0
16
+ mlflow_version: 2.12.2
17
+ model_size_bytes: 22037299
18
+ model_uuid: 01494b63f3fe44098fc10c9b46ac379a
19
+ run_id: 25cb228506ae475fa3e381cac63bc8a6
20
+ utc_time_created: '2024-05-18 18:29:37.003417'
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/conda.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ channels:
2
+ - conda-forge
3
+ dependencies:
4
+ - python=3.11.9
5
+ - pip<=24.0
6
+ - pip:
7
+ - mlflow==2.12.2
8
+ - cloudpickle==3.0.0
9
+ - numpy==1.26.4
10
+ - packaging==23.2
11
+ - psutil==5.9.8
12
+ - pyyaml==6.0.1
13
+ - scikit-learn==1.3.0
14
+ - scipy==1.13.0
15
+ name: mlflow-env
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/python_env.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ python: 3.11.9
2
+ build_dependencies:
3
+ - pip==24.0
4
+ - setuptools
5
+ - wheel
6
+ dependencies:
7
+ - -r requirements.txt
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/metadata/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ mlflow==2.12.2
2
+ cloudpickle==3.0.0
3
+ numpy==1.26.4
4
+ packaging==23.2
5
+ psutil==5.9.8
6
+ pyyaml==6.0.1
7
+ scikit-learn==1.3.0
8
+ scipy==1.13.0
mlruns/0/25cb228506ae475fa3e381cac63bc8a6/artifacts/Random Forest/model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98716f2069bd150492222c8163fd7169c48c6eed31326020503908d4337484db
3
+ size 22037299