Spaces:

ludvigolsen
/

plot_confusion_matrix

Running

App Files Files Community

Ludvig commited on May 31, 2023

Commit

87c83ab

•

1 Parent(s): 4c94861

Adds application. Progression quickly!

Browse files

Files changed (9) hide show

app.py +535 -0
cvms_version.R +1 -0
data.py +72 -0
generate_data.R +46 -0
plot.R +194 -0
requirements.txt +14 -0
small_example.csv +9 -0
text_sections.py +103 -0
utils.py +25 -0

app.py ADDED Viewed

	@@ -0,0 +1,535 @@

+"""
+App for plotting confusion matrix with `cvms::plot_confusion_matrix()`.
+TODO:
+- IMPORTANT! Allow specifying which class probabilities are of! (See plot prob_of_class)
+- Allow setting threshold - manual, max J, spec/sens
+- Add bg box around confusion matrix plot as text disappears on dark mode!
+- ggsave does not use dpi??
+- allow svg, pdf?
+- entered count -> counts (upload as well)
+- Add full reset button (empty cache on different files)
+"""
+import pathlib
+import shlex
+import tempfile
+from collections import OrderedDict
+from itertools import combinations
+from PIL import Image
+import streamlit as st  # Import last
+import pandas as pd
+from pandas.api.types import is_float_dtype
+from utils import call_subprocess, clean_string_for_non_alphanumerics
+from data import read_data, read_data_cached, DownloadHeader, generate_data
+from text_sections import (
+    intro_text,
+    columns_text,
+    upload_predictions_text,
+    upload_counts_text,
+    generate_data_text,
+    design_text,
+    enter_count_data_text,
+)
+# Create temporary directory
+@st.cache_resource
+def set_tmp_dir():
+    """
+    Must cache to avoid regenerating!
+    Must be the same throughout the iterations!
+    """
+    temp_dir = tempfile.TemporaryDirectory()
+    return temp_dir, temp_dir.name
+temp_dir, temp_dir_path = set_tmp_dir()
+gen_data_store_path = pathlib.Path(f"{temp_dir_path}/generated_data.csv")
+data_store_path = pathlib.Path(f"{temp_dir_path}/data.csv")
+conf_mat_path = pathlib.Path(f"{temp_dir_path}/confusion_matrix.png")
+def input_choice_callback():
+    """
+    Resets steps to 0.
+    Used when switching between input methods.
+    """
+    st.session_state["step"] = 0
+    st.session_state["input_type"] = None
+    # Remove old tmp files
+    if gen_data_store_path.exists():
+        gen_data_store_path.unlink()
+    if data_store_path.exists():
+        data_store_path.unlink()
+    if conf_mat_path.exists():
+        conf_mat_path.unlink()
+# Text
+intro_text()
+# Start step counter
+# Required to make dependent forms work
+if st.session_state.get("step") is None:
+    st.session_state["step"] = 0
+input_choice = st.radio(
+    label="Input",
+    options=["Upload predictions", "Upload counts", "Generate", "Enter counts"],
+    index=0,
+    horizontal=True,
+    on_change=input_choice_callback,
+)
+# Determine whether the input is raw data (predictions) or counts
+if st.session_state.get("input_type") is None:
+    if input_choice in ["Upload predictions", "Generate"]:
+        st.session_state["input_type"] = "data"
+    else:
+        st.session_state["input_type"] = "counts"
+# Load data
+if input_choice == "Upload predictions":
+    with st.form(key="data_form"):
+        upload_predictions_text()
+        data_path = st.file_uploader("Upload a dataset", type=["csv"])
+        if st.form_submit_button(label="Use data"):
+            if data_path:
+                st.session_state["step"] = 1
+            else:
+                st.session_state["step"] = 0
+                st.markdown("Please upload a file first (or **generate** some random data to try the function).")
+    if st.session_state["step"] >= 1:
+        # Read and store (tmp) data
+        df = read_data_cached(data_path)
+        with st.form(key="column_form"):
+            columns_text()
+            target_col = st.selectbox("Targets column", options=list(df.columns))
+            prediction_col = st.selectbox(
+                "Predictions column", options=list(df.columns)
+            )
+            if st.form_submit_button(label="Set columns"):
+                st.session_state["step"] = 2
+# Load data
+elif input_choice == "Upload counts":
+    with st.form(key="data_form"):
+        upload_counts_text()
+        data_path = st.file_uploader("Upload a dataset", type=["csv"])
+        if st.form_submit_button(label="Use data"):
+            if data_path:
+                st.session_state["step"] = 1
+            else:
+                st.session_state["step"] = 0
+                st.write("Please upload a file first.")
+    if st.session_state["step"] >= 1:
+        # Read and store (tmp) data
+        df = read_data_cached(data_path)
+        with st.form(key="column_form"):
+            columns_text()
+            target_col = st.selectbox("Targets column", options=list(df.columns))
+            prediction_col = st.selectbox(
+                "Predictions column", options=list(df.columns)
+            )
+            n_col = st.selectbox(
+                "Counts column", options=list(df.columns)
+            )
+            if st.form_submit_button(label="Set columns"):
+                st.session_state["step"] = 2
+# Generate data
+elif input_choice == "Generate":
+    def reset_generation_callback():
+        p = pathlib.Path(gen_data_store_path)
+        if p.exists():
+            p.unlink()
+    with st.form(key="generate_form"):
+        generate_data_text()
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            num_classes = st.number_input(
+                "# Classes",
+                value=3,
+                min_value=2,
+                help="Number of classes to generate data for.",
+            )
+        with col2:
+            num_observations = st.number_input(
+                "# Observations",
+                value=30,
+                min_value=2,
+                max_value=10000,
+                help="Number of observations to generate data for.",
+            )
+        with col3:
+            seed = st.number_input("Random Seed", value=42, min_value=0)
+        if st.form_submit_button(
+            label="Generate data", on_click=reset_generation_callback
+        ):
+            st.session_state["step"] = 2
+    if st.session_state["step"] >= 2:
+        generate_data(
+            out_path=gen_data_store_path,
+            num_classes=num_classes,
+            num_observations=num_observations,
+            seed=seed,
+        )
+        df = read_data(gen_data_store_path)
+        target_col = "Target"
+        prediction_col = "Predicted Class"
+elif input_choice == "Enter counts":
+    def repopulate_matrix_callback():
+        if "entered_counts" not in st.session_state:
+            if "entered_counts" in st.session_state:
+                st.session_state.pop("entered_counts")
+    with st.form(key="enter_classes_form"):
+        enter_count_data_text()
+        classes_joined = st.text_input("Classes (comma-separated)")
+        if st.form_submit_button(
+            label="Populate matrix", on_click=repopulate_matrix_callback
+        ):
+            # Extract class names from comma-separated list
+            st.session_state["classes"] = [
+                clean_string_for_non_alphanumerics(s) for s in classes_joined.split(",")
+            ]
+            # Calculate all pairs of predictions and targets
+            all_pairs = list(combinations(st.session_state["classes"], 2))
+            all_pairs += [(pair[1], pair[0]) for pair in all_pairs]
+            all_pairs += [(c, c) for c in st.session_state["classes"]]
+            # Prepopulate the matrix
+            st.session_state["entered_counts"] = pd.DataFrame(
+                all_pairs, columns=["Target", "Prediction"]
+            )
+            st.session_state["step"] = 1
+    if st.session_state["step"] >= 1:
+        with st.form(key="enter_counts_form"):
+            st.write("Fill in the counts for `N(Target, Prediction)` pairs.")
+            count_input_fields = OrderedDict()
+            num_cols = 3
+            cols = st.columns(num_cols)
+            for i, (targ, pred) in enumerate(
+                zip(
+                    st.session_state["entered_counts"]["Target"],
+                    st.session_state["entered_counts"]["Prediction"],
+                )
+            ):
+                count_input_fields[f"{targ}____{pred}"] = cols[
+                    i % num_cols
+                ].number_input(f"N({targ}, {pred})", step=1)
+            if st.form_submit_button(
+                label="Generate data",
+            ):
+                st.session_state["entered_counts"]["N"] = [
+                    int(val) for val in count_input_fields.values()
+                ]
+                st.session_state["step"] = 2
+    if st.session_state["step"] >= 2:
+        DownloadHeader.header_and_data_download(
+            "Entered counts",
+            data=st.session_state["entered_counts"],
+            file_name="Confusion_Matrix_Counts.csv",
+            help="Download counts",
+        )
+        st.write(st.session_state["entered_counts"])
+        target_col = "Target"
+        prediction_col = "Prediction"
+        n_col = "N"
+if st.session_state["step"] >= 2:
+    if st.session_state["input_type"] == "data":
+        # Remove unused columns
+        df = df.loc[:, [target_col, prediction_col]]
+        # Ensure targets are strings
+        df[target_col] = df[target_col].astype(str)
+        df[target_col] = df[target_col].apply(lambda x: x.replace(" ", "_"))
+        # Save to tmp directory to allow reading in R script
+        df.to_csv(data_store_path)
+        # Extract unique classes
+        st.session_state["classes"] = sorted([str(c) for c in df[target_col].unique()])
+        predictions_are_probabilities = is_float_dtype(df[prediction_col])
+        if predictions_are_probabilities and len(st.session_state["classes"]) != 2:
+            st.error(
+                "Predictions can only be probabilities in binary classification. "
+                f"Got {len(st.session_state['classes'])} classes."
+            )
+        st.subheader("The Data")
+        col1, col2, col3 = st.columns([2, 2, 2])
+        with col2:
+            st.write(df.head(5))
+            st.write(f"{df.shape} (first 5 rows).")
+    else:
+        st.session_state["entered_counts"].to_csv(data_store_path)
+    # Check the number of classes
+    num_classes = len(st.session_state["classes"])
+    print(st.session_state["classes"])
+    if num_classes < 2:
+        # TODO Handle better than throwing error?
+        raise ValueError(
+            "Uploaded data must contain 2 or more classes in `Targets column`. "
+            f"Got {num_classes} target classes."
+        )
+    with st.form(key="settings_form"):
+        design_text()
+        col1, col2 = st.columns(2)
+        with col1:
+            selected_classes = st.multiselect(
+                "Select classes (min=2, order is respected)",
+                options=st.session_state["classes"],
+                default=st.session_state["classes"],
+                help="Select the classes to create the confusion matrix for. "
+                "Any observation with either a target or prediction "
+                "of another class is excluded.",
+            )
+        with col2:
+            if st.session_state["input_type"] == "data" and predictions_are_probabilities:
+                prob_of_class = st.selectbox(
+                    "Probabilities are of (not working)",
+                    options=st.session_state["classes"],
+                    index=1,
+                )
+            else:
+                prob_of_class = None
+        default_elements = [
+            "Counts",
+            "Normalized Counts (%)",
+            "Zero Shading",
+            "Arrows",
+        ]
+        if num_classes < 6:
+            # Percentages clutter too much with many classes
+            default_elements += [
+                "Row Percentages",
+                "Column Percentages",
+            ]
+        elements_to_add = st.multiselect(
+            "Add the following elements",
+            options=[
+                "Sum Tiles",
+                "Counts",
+                "Normalized Counts (%)",
+                "Row Percentages",
+                "Column Percentages",
+                "Zero Shading",
+                "Zero Percentages",
+                "Zero Text",
+                "Arrows",
+            ],
+            default=default_elements,
+        )
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            counts_on_top = st.checkbox(
+                "Counts on top (not working)",
+                help="Whether to switch the positions of the counts and normalized counts (%). "
+                "That is, the counts become the big centralized numbers and the "
+                "normalized counts go below with a smaller font size.",
+            )
+        with col2:
+            diag_percentages_only = st.checkbox("Diagonal row/column percentages only")
+        with col3:
+            num_digits = st.number_input(
+                "Digits", value=2, help="Number of digits to round percentages to."
+            )
+        element_flags = [
+            key
+            for key, val in {
+                "--add_sums": "Sum Tiles" in elements_to_add,
+                "--add_counts": "Counts" in elements_to_add,
+                "--add_normalized": "Normalized Counts (%)" in elements_to_add,
+                "--add_row_percentages": "Row Percentages" in elements_to_add,
+                "--add_col_percentages": "Column Percentages" in elements_to_add,
+                "--add_zero_percentages": "Zero Percentages" in elements_to_add,
+                "--add_zero_text": "Zero Text" in elements_to_add,
+                "--add_zero_shading": "Zero Shading" in elements_to_add,
+                "--add_arrows": "Arrows" in elements_to_add,
+                "--counts_on_top": counts_on_top,
+                "--diag_percentages_only": diag_percentages_only,
+            }.items()
+            if val
+        ]
+        palette = st.selectbox(
+            "Color Palette",
+            options=["Blues", "Greens", "Oranges", "Greys", "Purples", "Reds"],
+        )
+        # Ask for output parameters
+        # TODO: Set default based on number of classes and sum tiles
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            width = st.number_input("Width (px)", value=1200 + 100 * (num_classes - 2))
+        with col2:
+            height = st.number_input(
+                "Height (px)", value=1200 + 100 * (num_classes - 2)
+            )
+        with col3:
+            dpi = st.number_input("DPI (not working)", value=320)
+        if st.form_submit_button(label="Apply"):
+            st.session_state["step"] = 3
+    if st.session_state["step"] >= 3:
+        plotting_args = [
+            "--data_path",
+            f"'{data_store_path}'",
+            "--out_path",
+            f"'{conf_mat_path}'",
+            "--target_col",
+            f"'{target_col}'",
+            "--prediction_col",
+            f"'{prediction_col}'",
+            "--width",
+            f"{width}",
+            "--height",
+            f"{height}",
+            "--dpi",
+            f"{dpi}",
+            "--classes",
+            f"{','.join(selected_classes)}",
+            "--digits",
+            f"{num_digits}",
+            "--palette",
+            f"{palette}",
+        ]
+        if st.session_state["input_type"] == "counts":
+            # The input data are counts
+            plotting_args += ["--n_col", f"{n_col}", "--data_are_counts"]
+        plotting_args += element_flags
+        plotting_args = " ".join(plotting_args)
+        call_subprocess(
+            f"Rscript plot.R {plotting_args}",
+            message="Plotting script",
+            return_output=True,
+            encoding="UTF-8",
+        )
+        DownloadHeader.header_and_image_download(
+            "The confusion matrix plot", filepath=conf_mat_path
+        )
+        col1, col2, col3 = st.columns([2, 8, 2])
+        with col2:
+            image = Image.open(conf_mat_path)
+            st.image(
+                image,
+                caption="Confusion Matrix",
+                # width=500,
+                use_column_width=None,
+                clamp=False,
+                channels="RGB",
+                output_format="auto",
+            )
+        # evaluation = dplyr.select(
+        #     evaluation,
+        #     "Balanced Accuracy",
+        #     "Accuracy",
+        #     "F1",
+        #     "Sensitivity",
+        #     "Specificity",
+        #     "Pos Pred Value",
+        #     "Neg Pred Value",
+        #     "AUC",
+        #     "Kappa",
+        #     "MCC",
+        # )
+        # evaluation_py = ro.conversion.rpy2py(evaluation)
+        # st.write(evaluation_py)
+    # confusion_matrix_py = ro.conversion.rpy2py(confusion_matrix)
+    # st.write(confusion_matrix_py)
+    # evaluation = dplyr.select(
+    #     evaluation,
+    #     "Balanced Accuracy",
+    #     "Accuracy",
+    #     "F1",
+    #     "Sensitivity",
+    #     "Specificity",
+    #     "Pos Pred Value",
+    #     "Neg Pred Value",
+    #     "AUC",
+    #     "Kappa",
+    #     "MCC",
+    # )
+    # evaluation_py = ro.conversion.rpy2py(evaluation)
+    # st.write(evaluation_py)
+    # temp_dir.cleanup()
+else:
+    st.write("Please upload data.")
+#   target_col = "Target",
+#   prediction_col = "Prediction",
+#   counts_col = "N",
+#   class_order = NULL,
+#   add_sums = FALSE,
+#   add_counts = TRUE,
+#   add_normalized = TRUE,
+#   add_row_percentages = TRUE,
+#   add_col_percentages = TRUE,
+#   diag_percentages_only = FALSE,
+#   rm_zero_percentages = TRUE,
+#   rm_zero_text = TRUE,
+#   add_zero_shading = TRUE,
+#   add_arrows = TRUE,
+#   counts_on_top = FALSE,
+#   palette = "Blues",
+#   intensity_by = "counts",
+#   theme_fn = ggplot2::theme_minimal,
+#   place_x_axis_above = TRUE,
+#   rotate_y_text = TRUE,
+#   digits = 1,
+#   font_counts = font(),
+#   font_normalized = font(),
+#   font_row_percentages = font(),
+#   font_col_percentages = font(),
+#   arrow_size = 0.048,
+#   arrow_nudge_from_text = 0.065,
+#   tile_border_color = NA,
+#   tile_border_size = 0.1,
+#   tile_border_linetype = "solid",
+#   sums_settings = sum_tile_settings(),
+#   darkness = 0.8
+# )

cvms_version.R ADDED Viewed

	@@ -0,0 +1 @@


1	+ print(packageVersion("cvms"))

data.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import pathlib
+import pandas as pd
+import streamlit as st
+from utils import call_subprocess
+def read_data(data):
+    if data is not None:
+        df = pd.read_csv(data)
+        return df
+    else:
+        return None
+@st.cache_data
+def read_data_cached(data):
+    """
+    Same as `read_data()`, but wrapped in `st.cache_data`.
+    """
+    return read_data(data)
+def generate_data(out_path, num_classes, num_observations, seed) -> None:
+    call_subprocess(
+        f"Rscript generate_data.R --out_path {out_path} --num_classes {num_classes} --num_observations {num_observations} --seed {seed}",
+        message="Data generation script",
+        return_output=True,
+        encoding="UTF-8",
+    )
+class DownloadHeader:
+    """
+    Class for showing header and download button in the same row.
+    Has one method for image files and one for data frames.
+    """
+    @staticmethod
+    def header_and_image_download(
+        header, filepath, key=None, label="Download", help="Download plot"
+    ):
+        """
+        Show `header` as a subheader next to a download button for the file at `filepath`.
+        The file is opened in binary mode and offered with the `image/png` mime type,
+        using the file name of `filepath` as the download name.
+        `key`, `label` and `help` are passed on to `st.download_button()`.
+        """
+        # Wide column for the header, narrow column for the button
+        col1, col2 = st.columns([9, 2])
+        with col1:
+            st.subheader(header)
+        with col2:
+            # Presumably a spacer to align the button with the header - TODO confirm
+            st.write("")
+            with open(filepath, "rb") as img:
+                st.download_button(
+                    label=label,
+                    data=img,
+                    file_name=pathlib.Path(filepath).name,
+                    mime="image/png",
+                    key=key,
+                    help=help,
+                )
+    @staticmethod
+    def _convert_df_to_csv(data, **kwargs):
+        """
+        Convert `data` to csv with `data.to_csv(**kwargs)` and encode it as UTF-8 bytes.
+        """
+        return data.to_csv(**kwargs).encode("utf-8")
+    @staticmethod
+    def header_and_data_download(
+        header, data, file_name, key=None, label="Download", help="Download data"
+    ):
+        """
+        Show `header` as a subheader next to a download button for `data`.
+        `data` is converted to csv (without the index) and offered as `file_name`.
+        `key`, `label` and `help` are passed on to `st.download_button()`.
+        """
+        # Wide column for the header, narrow column for the button
+        col1, col2 = st.columns([9, 2])
+        with col1:
+            st.subheader(header)
+        with col2:
+            # Presumably a spacer to align the button with the header - TODO confirm
+            st.write("")
+            st.download_button(
+                label=label,
+                data=DownloadHeader._convert_df_to_csv(data, index=False),
+                file_name=file_name,
+                key=key,
+                help=help,
+            )

generate_data.R ADDED Viewed

	@@ -0,0 +1,46 @@

+#!/usr/bin/env Rscript
+library(optparse)
+library(cvms)
+# Command line options of the data generation script
+option_list <- list(
+    make_option(c("--out_path"), type="character",
+                help="Path to save data at."),
+    make_option(c("--num_classes"), type="integer",
+                help="Number of classes."),
+    make_option(c("--num_observations"), type="integer",
+                help="Number of observations."),
+    # The help text was a copy of the `--num_observations` help text
+    make_option(c("--seed"), type="integer",
+                help="Random seed.")
+)
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+print(opt)
+# Set seed if given
+if (!is.null(opt$seed)){
+    set.seed(opt$seed)
+}
+# Make fairly certain predictions
+rcertain <- function(n) {
+  (runif(n, min = 1, max = 100)^1.4) / 100
+}
+# Generate data
+data <- cvms::multiclass_probability_tibble(
+  num_classes=opt$num_classes,
+  num_observations=opt$num_observations,
+  apply_softmax = TRUE,
+  FUN = rcertain,
+  class_name = "c",
+  add_predicted_classes = TRUE,
+  add_targets = TRUE
+)
+data <- data[, c("Predicted Class", "Target")]
+# Write to disk
+write.csv(data, file = opt$out_path, row.names=FALSE)

plot.R ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env Rscript
+library(optparse)
+library(cvms)
+library(dplyr)
+library(ggplot2)
+option_list <- list(
+    make_option(c("--data_path"), type="character",
+                help="Path to data file (.csv)."),
+    make_option(c("--out_path"), type="character",
+                help="Path to save confusion matrix plot at."),
+    make_option(c("--data_are_counts"), action="store_true", default=FALSE,
+                help="Indicates that `--data_path` contains counts, not predictions."),
+    make_option(c("--target_col"), type="character",
+                help="Target column"),
+    make_option(c("--prediction_col"), type="character",
+                help="Prediction column"),
+    make_option(c("--n_col"), type="character",
+                help="Count column (when `--data_are_counts`)."),
+    make_option(c("--classes"), type="character",
+                help="Comma-separated class names. Only these classes will be used - in the specified order."),
+    make_option(c("--prob_of_class"), type="character",
+                help="Name of class that probabilities are of."),
+    make_option(c("--palette"), type="character",
+                help="Color palette."),
+    make_option(c("--width"), type="integer",
+                help="Width of plot in pixels."),
+    make_option(c("--height"), type="integer",
+                help="Height of plot in pixels."),
+    make_option(c("--dpi"), type="integer",
+                help="DPI of plot."),
+    make_option(c("--add_sums"), action="store_true", default=FALSE,
+                help="Wether to add sum tiles."),
+    make_option(c("--add_counts"), action="store_true", default=FALSE,
+                help="Wether to add counts."),
+    make_option(c("--add_normalized"), action="store_true", default=FALSE,
+                help="Wether to add normalized counts (i.e. percentages)."),
+    make_option(c("--add_row_percentages"), action="store_true", default=FALSE,
+                help="Wether to add row percentages."),
+    make_option(c("--add_col_percentages"), action="store_true", default=FALSE,
+                help="Wether to add column percentages."),
+    make_option(c("--add_zero_percentages"), action="store_true", default=FALSE,
+                help="Wether to add percentages to zero-tiles."),
+    make_option(c("--add_zero_text"), action="store_true", default=FALSE,
+                help="Wether to add text to zero-tiles."),
+    make_option(c("--add_zero_shading"), action="store_true", default=FALSE,
+                help="Wether to add shading to zero-tiles."),
+    make_option(c("--add_arrows"), action="store_true", default=FALSE,
+                help="Wether to add arrows to row/sum percentages. Requires additional packages."),
+    make_option(c("--counts_on_top"), action="store_true", default=FALSE,
+                help="Wether to have the counts on top and normalized counts below."),
+    make_option(c("--diag_percentages_only"), action="store_true", default=FALSE,
+                help="Wether to only show diagonal row/column percentages."),
+    make_option(c("--digits"), type="integer",
+                help="Number of digits to show for percentages.")
+)
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+print(opt)
+data_are_counts <- opt$data_are_counts
+# read.csv turns white space into dots
+target_col <- stringr::str_squish(opt$target_col)
+target_col <- stringr::str_replace_all(target_col, " ", ".")
+prediction_col <- stringr::str_squish(opt$prediction_col)
+prediction_col  <- stringr::str_replace_all(prediction_col, " ", ".")
+n_col <- NULL
+if (!is.null(opt$n_col)){
+    n_col <- stringr::str_squish(opt$n_col)
+    n_col  <- stringr::str_replace_all(n_col, " ", ".")
+}
+# Read and prepare data frame
+df <- tryCatch({
+    read.csv(opt$data_path)
+}, error=function(e){
+    print(paste0("Failed to read data from ", opt$data_path))
+    print(e)
+    stop(e)
+})
+print(df)
+df <- dplyr::as_tibble(df)
+print(df)
+df[[target_col]] <- as.character(df[[target_col]])
+if (isTRUE(data_are_counts)){
+    df[[prediction_col]] <- as.character(df[[prediction_col]])
+}
+# Predictions can be either probabilities or
+# hard class predictions
+if (is.integer(df[[prediction_col]]) || !is.numeric(df[[prediction_col]])){
+    # Hard class predictions
+    # Convert to character, as numeric predictions
+    # would otherwise be handled as probabilities
+    df[[prediction_col]] <- as.character(df[[prediction_col]])
+    # Classes can be present in either targets or predictions
+    # `unique()` is applied to the combined vector, as a class
+    # present in both columns was previously included twice,
+    # which broke the binary check (`length(...) == 2`) further down
+    all_present_classes <- sort(
+        unique(
+            c(df[[target_col]],
+              df[[prediction_col]]
+            )
+        )
+    )
+} else {
+    # Probabilities
+    all_present_classes <- sort(
+        unique(df[[target_col]])
+    )
+}
+# Use the specified classes (separated by comma or colon)
+# or fall back to all classes present in the data
+if (!is.null(opt$classes)){
+    # `recursive` is an argument of `unlist()`, not of `as.character()`
+    classes <- as.character(unlist(strsplit(opt$classes, "[,:]"), recursive=TRUE))
+} else {
+    # The class order should not contain duplicates
+    classes <- unique(all_present_classes)
+}
+print(paste0("Selected Classes: ", paste0(classes, collapse=", ")))
+if (!isTRUE(data_are_counts)){
+    # We remove the unwanted classes from the confusion matrix
+    # (easier - possibly slower in edge cases)
+    family <- ifelse(length(all_present_classes) == 2, "binomial", "multinomial")
+    print(df)
+    # TODO : use prob_of_class to ensure probabilities are interpreted correctly!!
+    # Might need to invert them to get it to work!
+    evaluation <- tryCatch({
+        cvms::evaluate(
+            data=df,
+            target_col=target_col,
+            prediction_cols=prediction_col,
+            type=family,
+        )
+    }, error=function(e){
+        print("Failed to evaluate data.")
+        print(head(df, 5))
+        print(e)
+        stop(e)
+    })
+    confusion_matrix <- evaluation[["Confusion Matrix"]][[1]]
+} else {
+    confusion_matrix <- dplyr::rename(
+        df,
+        Target = !!target_col,
+        Prediction = !!prediction_col,
+        N = !!n_col
+    )
+}
+confusion_matrix  <- dplyr::filter(
+    confusion_matrix,
+    Prediction %in% classes,
+    Target %in% classes
+)
+confusion_matrix_plot <- tryCatch({
+    cvms::plot_confusion_matrix(
+        confusion_matrix,
+        class_order=classes,
+        add_sums=opt$add_sums,
+        add_counts=opt$add_counts,
+        add_normalized=opt$add_normalized,
+        add_row_percentages=opt$add_row_percentages,
+        add_col_percentages=opt$add_col_percentages,
+        rm_zero_percentages=!opt$add_zero_percentages,
+        rm_zero_text=!opt$add_zero_text,
+        add_zero_shading=opt$add_zero_shading,
+        add_arrows=opt$add_arrows,
+        counts_on_top=opt$counts_on_top,
+        diag_percentages_only=opt$diag_percentages_only,
+        digits=as.integer(opt$digits),
+        palette=opt$palette
+    )
+}, error=function(e){
+    print("Failed to create plot from confusion matrix.")
+    print(confusion_matrix)
+    print(e)
+    stop(e)
+})
+# Save the plot to disk
+tryCatch({
+    ggplot2::ggsave(
+        opt$out_path,
+        # Pass the plot explicitly instead of relying on `last_plot()`
+        plot=confusion_matrix_plot,
+        width=opt$width,
+        height=opt$height,
+        dpi=opt$dpi,
+        units="px"
+    )
+}, error=function(e){
+    print(paste0("Failed to ggsave plot to: ", opt$out_path))
+    print(e)
+    stop(e)
+})

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# rpy2=3.5.1
+pandas
+lazyeval
+r-cvms
+r-dplyr
+r-ggimage
+r-rsvg # Conda forge?
+r-optparse
+r-ggnewscale
+r-stringr
+# Needs:
+# conda config --add channels conda-forge
+# conda config --set channel_priority strict

small_example.csv ADDED Viewed

	@@ -0,0 +1,9 @@

+target,prediction,predicted_class
+1,0.3,1
+2,0.9,2
+1,0.2,1
+2,0.9,2
+1,0.7,2
+1,0.8,2
+2,0.5,2
+2,0.7,2

text_sections.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import streamlit as st
+from utils import call_subprocess
+@st.cache_resource
+def get_cvms_version():
+    return (
+        str(
+            call_subprocess(
+                f"Rscript cvms_version.R",
+                message="cvms versioning script",
+                return_output=True,
+                encoding="UTF-8",
+            )
+        )
+        .split("[1]")[-1]
+        .replace("‘", "")
+        .replace("’", "")
+    )
+def intro_text():
+    col1, col2 = st.columns([8, 2])
+    with col1:
+        st.title("Plot Confusion Matrix")
+        st.write(
+            "This application allows you to plot a confusion matrix based on your own data. "
+        )
+    with col2:
+        st.image(
+            "https://github.com/LudvigOlsen/cvms/raw/master/man/figures/cvms_logo_242x280_250dpi.png",
+            width=125,
+        )
+    st.write(
+        "The plot is created with the [**cvms**](https://github.com/LudvigOlsen/cvms) R package "
+        f"(v/{get_cvms_version()}, LR Olsen & HB Zachariae, 2019)."
+    )
+    st.write(
+        "DATA PRIVACY: In order to transfer the data "
+        "between python and R, it is temporarily stored on the servers. "
+        "While we, the authors, have no intention of looking at your data, we make "
+        "*no guarantees* about the privacy of your data (it is not our servers). "
+        "Please do not upload sensitive data. The application "
+        "only requires columns with predictions and targets."
+    )
+def generate_data_text():
+    st.subheader("Generate data")
+    st.write(
+        "If you just want to try out the application, you can generate a dataset with targets and predictions. "
+        "Select a number of classes and observations, and you're ready to go! "
+    )
+def enter_count_data_text():
+    st.subheader("Enter counts")
+    st.write(
+        "If you already have the confusion matrix counts and want to plot them. "
+        "Enter the counts and get designing! "
+    )
+    st.write("Start by entering the names of your classes:")
+def upload_counts_text():
+    st.subheader("Upload your counts")
+    st.write(
+        "Plot an existing confusion matrix (counts of target-prediction combinations). "
+        "The application expects a `.csv` file with: \n"
+        "1) A `target classes` column. \n\n"
+        "2) A `predicted classes` column. \n\n"
+        "3) A `combination count` column for the "
+        "combination frequency of 1 and 2. \n\n"
+        "Other columns are currently ignored. "
+        "See example of such a .csv file [here] (TODO). "
+    )
+def upload_predictions_text():
+    st.subheader("Upload your predictions")
+    st.markdown(
+        "The application expects a `.csv` file with:  \n"
+        "1) A `target` column.  \n"
+        "Targets will be converted into strings. \n\n"
+        "2) A `prediction` column.  \n"
+        "Predictions can be probabilities (binary classification only) or class predictions. \n\n"
+        "Other columns are currently ignored.  \n\n"
+        "You will have the option to select the names of these two columns, so don't "
+        "worry too much about the column names in the uploaded data."
+    )
+def columns_text():
+    st.subheader("Specify columns")
+    st.write(
+        "Please select which of the columns in the data should be used for targets and predictions."
+    )
+def design_text():
+    st.subheader("Design your plot")
+    st.write("This is where you customize the design of your confusion matrix plot.")

utils.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import subprocess
+import re, string
+def call_subprocess(call_, message, return_output=False, encoding="UTF-8"):
+    # With capturing of output
+    if return_output:
+        try:
+            out = subprocess.check_output(call_, shell=True, encoding=encoding)
+        except subprocess.CalledProcessError as e:
+            print(f"{message}: {call_}")
+            raise e
+        return out
+    # Without capturing of output
+    try:
+        subprocess.check_call(call_, shell=True)
+    except subprocess.CalledProcessError as e:
+        print(f"{message}: {call_}")
+        raise e
+def clean_string_for_non_alphanumerics(s):
+    pattern = re.compile("[\W'_']+")
+    return pattern.sub("", s)