Neprox committed
Commit c33542b
1 Parent(s): fa44c48
Files changed (1)
  1. app.py +29 -23
app.py CHANGED
@@ -5,17 +5,14 @@ import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt

- #from dotenv import load_dotenv
- #load_dotenv()
+ from dotenv import load_dotenv
+ load_dotenv()

@st.experimental_memo
def load_data():
    project = hopsworks.login()
    fs = project.get_feature_store()

-     #if not os.path.isfile("./cache/batch_data.pkl"):
-     #    if not os.path.isdir("./cache"):
-     #        os.mkdir("./cache")
    posts_fg = fs.get_feature_group("reddit_posts", version=os.getenv("POSTS_FG_VERSION", default=1))
    users_fg = fs.get_feature_group("reddit_users", version=os.getenv("USERS_FG_VERSION", default=1))
    subreddits_fg = fs.get_feature_group("reddit_subreddits", version=os.getenv("SUBREDDITS_FG_VERSION", default=1))
@@ -24,13 +21,9 @@ def load_data():
        subreddits_fg.select(features=["subreddit_id", "snapshot_time"]), on=["subreddit_id", "snapshot_time"])
    df = full_join.read()

-     # df.to_pickle("./cache/batch_data.pkl")
-     #else:
-     #    df = pd.read_pickle("./cache/batch_data.pkl")
-
    # Load model including the generated images and evaluation scores
    mr = project.get_model_registry()
-     model_hsfs = mr.get_model("reddit_predict", version=16)
+     model_hsfs = mr.get_model("reddit_predict", version=18)
    model_dir = model_hsfs.download()
    print("Model directory: {}".format(model_dir))

@@ -42,13 +35,19 @@ def load_data():
            metric_rows[target].append(model_hsfs.training_metrics[f"{metric}_{target}"])
    df_metrics = pd.DataFrame(metric_rows, index=metrics_avail)

-     img_predictions = plt.imread(f"{model_dir}/prediction_error.png")
-     img_predictions_logscale = plt.imread(f"{model_dir}/prediction_error_logscale.png")
+     plots = {
+         "predictions": plt.imread(f"{model_dir}/prediction_error.png"),
+         "predictions_logscale": plt.imread(f"{model_dir}/prediction_error_logscale.png"),
+         "confusion_matrix": plt.imread(f"{model_dir}/confusion_matrix.png"),
+         "shap_numlikes": plt.imread(f"{model_dir}/shap_summary_plot_num_likes.png"),
+         "shap_upvote_ratio": plt.imread(f"{model_dir}/shap_summary_plot_upvote_ratio.png"),
+         "shap_numlikes_compact": plt.imread(f"{model_dir}/shap_summary_plot_compact.png")
+     }

-     return df, img_predictions, img_predictions_logscale, df_metrics
+     return df, plots, df_metrics


- df, img_predictions, img_predictions_logscale, df_metrics = load_data()
+ df, plots, df_metrics = load_data()

# create a distribution plot of the number of likes using seaborn
st.title("Like It or Not")
@@ -89,15 +88,22 @@ st.dataframe(df_metrics)
st.markdown("## Prediction Error Plots")
st.markdown("The green line indicates the perfect prediction, while the blue lines show point densities. Every point represents a prediction. The model is optimized for the number of likes and provides an estimate of the minimum number of likes to expect. The upvote ratio does not perform well and would benefit from dedicated modeling with another objective function if it is important.")
st.markdown("### Linear Scale")
- st.image(img_predictions)
+ st.image(plots["predictions"])
st.markdown("### Log Scale")
- st.image(img_predictions_logscale)
+ st.image(plots["predictions_logscale"])

# Confusion matrix
- #st.markdown("## Confusion Matrix")
- #st.markdown("The confusion matrix of the model is as follows:")
- #st.image("confusion_matrix.png")
-
- # display the evaluation scores table
- #st.title("Evaluation Scores")
- #st.dataframe(df[["metric1", "metric2", "metric3", "metric4"]])
+ st.markdown("## Confusion Matrix")
+ st.markdown("After mapping the predicted number of likes to categories, the following confusion matrix can be obtained:")
+ st.image(plots["confusion_matrix"])
+
+ # SHAP plots
+ st.markdown("## SHAP Evaluation")
+ st.markdown("SHAP values are an approach to machine learning explainability in which the magnitude and direction of impact (positive / negative) of every feature are computed. " +
+             "Below, you see a beeswarm plot obtained from the predictions on the test data: every point represents a sample, its color indicates whether the feature value was high or low, " +
+             "and its position indicates whether the feature had a positive or negative impact on the prediction.")
+ st.image(plots["shap_numlikes"])
+
+ st.markdown("In addition, it is possible to sum and average the absolute impact of all features over all samples. " +
+             "The result can be interpreted as feature importance. For the embedding features, we summed the values of the individual dimensions.")
+ st.image(plots["shap_numlikes_compact"])
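
The confusion matrix displayed by the app comes from a regression model, so the predicted like counts must first be bucketed into categories. A minimal sketch of such a mapping, with stand-in y_test / y_pred arrays and hypothetical bucket edges (the actual edges live in the training pipeline, not in this diff):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Stand-in data; in the real pipeline these are the test-set like counts and model predictions.
rng = np.random.default_rng(0)
y_test = rng.integers(0, 2000, size=500)
y_pred = y_test * rng.uniform(0.5, 1.5, size=500)

# Hypothetical like-count buckets (right-closed intervals).
bins = [-np.inf, 0, 10, 100, 1000, np.inf]
labels = ["0", "1-10", "11-100", "101-1000", ">1000"]

# Map both true and predicted counts onto the same categories, then plot.
y_true_cat = pd.cut(y_test, bins=bins, labels=labels).astype(str)
y_pred_cat = pd.cut(y_pred, bins=bins, labels=labels).astype(str)
ConfusionMatrixDisplay.from_predictions(y_true_cat, y_pred_cat, labels=labels)
plt.savefig("confusion_matrix.png", bbox_inches="tight")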
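
The SHAP summary images that load_data() downloads are produced by the training pipeline, which is not part of this commit. A minimal sketch of how such beeswarm and compact (mean |SHAP|) plots can be generated with the shap library, assuming a tree-based regressor; the stand-in model and data here are illustrative only:

import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Stand-in features and model; the real pipeline trains on the Hopsworks feature groups.
rng = np.random.default_rng(0)
X_test = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_test, rng.normal(size=200))

# Compute per-sample, per-feature SHAP values for the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Beeswarm plot (cf. shap_summary_plot_num_likes.png): one point per sample and feature.
shap.summary_plot(shap_values, X_test, show=False)
plt.savefig("shap_summary_plot_num_likes.png", bbox_inches="tight")
plt.close()

# Compact bar plot (cf. shap_summary_plot_compact.png): mean absolute SHAP value per feature.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.savefig("shap_summary_plot_compact.png", bbox_inches="tight")
plt.close()

Summing the per-dimension values of an embedding into a single score, as the app's text describes, is a common way to make such plots readable when many columns belong to one logical feature.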