|
from pandas import DataFrame |
|
from plotnine import ( |
|
ggplot, |
|
aes, |
|
geom_col, |
|
geom_line, |
|
annotate, |
|
theme, |
|
element_blank, |
|
labs, |
|
coord_flip, |
|
scale_x_discrete, |
|
scale_x_datetime, |
|
scale_y_continuous, |
|
theme_light, |
|
) |
|
|
|
|
|
class DataAvailabilityError(Exception):
    """Signal that there is not enough data available to build a visualization."""
|
|
|
|
|
def plot_NA(placeholder_text: str = "Not enough data available to visualize.", placeholder_size: int = 14):
    """Build a blank placeholder plot that only displays a short message.

    Args:
        placeholder_text (str, optional): Message drawn on the plot. Defaults to "Not enough data available to visualize.".
        placeholder_size (int, optional): Text size of the message. Defaults to 14.

    Returns:
        ggplot: Plot holding a single text annotation at (0, 0), with axis ticks, axis text, panel grid, and all labels blanked out.
    """
    message = annotate("text", x=0, y=0, label=placeholder_text, size=placeholder_size)
    # Strip every visual element except the annotation itself.
    bare_theme = theme(
        axis_ticks=element_blank(),
        axis_text=element_blank(),
        panel_grid=element_blank(),
    )
    empty_labels = labs(x="", y="", title="")
    return ggplot() + message + bare_theme + empty_labels
|
|
|
|
|
def generate_rule_axis_label(rule_types: list | None = None): |
|
"""Generate axis label for rules, accounting for rule type ("all", "3f1-significant", or "other-significant"). |
|
""" |
|
categories = "" |
|
if (rule_types is None) or ("all" in rule_types): |
|
pass |
|
elif all(True if cat in rule_types else False for cat in ("3f1-significant", "other-significant")): |
|
categories = "significant" |
|
elif ("3f1-significant" in rule_types) and ("other-significant" not in rule_types): |
|
categories = "Section 3(f)(1) Significant" |
|
elif ("3f1-significant" not in rule_types) and ("other-significant" in rule_types): |
|
categories = "Other Significant" |
|
return f"Number of {categories} rules".replace(" ", " ") |
|
|
|
|
|
def plot_agency(df, group_col="acronym", value_col="rules", color="#033C5A", rule_types: list | None = None):
    """Draw a horizontal bar chart of rule counts per agency.

    Args:
        df (DataFrame): Input data.
        group_col (str, optional): Column holding the group (agency) labels. Defaults to "acronym".
        value_col (str, optional): Column holding the values to plot. Defaults to "rules".
        color (str, optional): Fill color of the bars. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
        rule_types (list | None, optional): One or more rule types included in the data; used only to word the value-axis label. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.

    Returns:
        ggplot: Plotted data.
    """
    value_axis_label = generate_rule_axis_label(rule_types)

    # The discrete axis uses the data's row order reversed
    # (presumably so the first row ends up on top after coord_flip -- confirm).
    reversed_groups = list(reversed(df.loc[:, group_col].to_list()))

    base = ggplot(df, aes(x=group_col, y=value_col))
    bars = geom_col(color="#FFFFFF", fill=color)
    axis_order = scale_x_discrete(limits=reversed_groups)
    labels = labs(y=value_axis_label, x="", title="Rules Published by Agency")

    return base + bars + coord_flip() + axis_order + labels + theme_light()
|
|
|
|
|
def plot_month(
    df: DataFrame,
    group_cols: tuple = ("publication_year", "publication_month"),
    value_col: str = "rules",
    color: str = "#033C5A",
    title: str | None = None,
    y_lab: str = "",
):
    """Plot rules by month as a bar chart.

    Args:
        df (DataFrame): Input data. It is not modified; the plot is built from a copy.
        group_cols (tuple, optional): Columns on which the data are grouped, given as (year column, month column). Defaults to ("publication_year", "publication_month").
        value_col (str, optional): Column of values to be plotted. Defaults to "rules".
        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
        title (str | None, optional): Plot title. Defaults to None, which uses "Rules Published by Month".
        y_lab (str, optional): Plot y label. Defaults to "" (empty string).

    Returns:
        ggplot: Plotted data.
    """
    year_col, month_col = group_cols[0], group_cols[1]

    # Build a "<year>-<month>" label with the month left-padded to two characters.
    year_month = df[year_col].astype(str) + "-" + df[month_col].astype(str).str.pad(2, fillchar="0")

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # a "ym" column as a side effect of plotting.
    plot_df = df.assign(ym=year_month)

    # Keep the x-axis in the row order of the input data.
    order_list = plot_df["ym"].to_list()

    if title is None:
        title = "Rules Published by Month"

    plot = (
        ggplot(
            plot_df,
            aes(x="ym", y=value_col),
        )
        + geom_col(color="#FFFFFF", fill=color)
        + scale_x_discrete(limits=order_list)
        + labs(y=y_lab, x="", title=title)
        + theme_light()
    )
    return plot
|
|
|
|
|
def plot_day(
    df: DataFrame,
    group_col: str = "publication_date",
    value_col: str = "rules",
    color: str = "#033C5A",
    title: str | None = None,
    y_lab: str = "",
):
    """Draw a line chart of rule counts per day.

    The spacing of the x-axis date breaks depends on the number of days between
    the earliest and latest date in `group_col`: up to 60 days uses "1 week",
    61 to 90 days uses "2 weeks", and anything else uses "1 month".

    Args:
        df (DataFrame): Input data.
        group_col (str, optional): Column holding the dates. Defaults to "publication_date".
        value_col (str, optional): Column holding the values to plot. Defaults to "rules".
        color (str, optional): Line color. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
        title (str | None, optional): Plot title. Defaults to None, which uses "Rules Published by Date".
        y_lab (str, optional): Plot y label. Defaults to "" (empty string).

    Returns:
        ggplot: Plotted data.
    """
    if title is None:
        title = "Rules Published by Date"

    dates = df.loc[:, group_col]
    span_days = (dates.max() - dates.min()).days

    # Wider date spans get sparser breaks.
    if 0 <= span_days <= 60:
        break_width = "1 week"
    elif 61 <= span_days <= 90:
        break_width = "2 weeks"
    else:
        break_width = "1 month"

    # The y-axis runs from zero up to the largest plotted value.
    y_top = df.loc[:, value_col].max()

    base = ggplot(df, aes(x=group_col, y=value_col))
    line = geom_line(group=1, color=color)
    x_scale = scale_x_datetime(date_breaks=break_width, date_labels="%m-%d")
    y_scale = scale_y_continuous(limits=(0, y_top), expand=(0, 0, 0.1, 0))
    labels = labs(y=y_lab, x="", title=title)

    return base + line + x_scale + y_scale + labels + theme_light()
|
|
|
|
|
def plot_week(
    df: DataFrame,
    group_col: str = "week_of",
    value_col: str = "rules",
    color: str = "#033C5A",
    title: str | None = None,
    y_lab: str = "",
    show_significant: bool = False,
):
    """Draw a line chart of rule counts per week.

    To keep the x-axis readable, only every k-th week is labeled, where k grows
    with the number of rows: fewer than 8 rows label every week, 8-15 every
    2nd, 16-23 every 3rd, 24-31 every 4th, and 32 or more every 5th.

    Args:
        df (DataFrame): Input data.
        group_col (str, optional): Column holding the week dates. Defaults to "week_of".
        value_col (str, optional): Column holding the values to plot. Defaults to "rules".
        color (str, optional): Line color. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
        title (str | None, optional): Plot title. Defaults to None, which uses "Rules Published by Week".
        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
        show_significant (bool, optional): Not referenced by the function body. Defaults to False.

    Returns:
        ggplot: Plotted data.
    """
    if title is None:
        title = "Rules Published by Week"

    weeks = df[group_col].to_list()

    # One extra step for every 8 weeks of data, capped at labeling every 5th week.
    step = min(len(weeks) // 8 + 1, 5)
    tick_dates = weeks[::step]
    tick_labels = [week.strftime("%m-%d") for week in tick_dates]

    # The y-axis runs from zero up to the largest plotted value.
    y_top = df.loc[:, value_col].max()

    base = ggplot(df, aes(x=group_col, y=value_col))
    line = geom_line(group=1, color=color)
    x_scale = scale_x_datetime(breaks=tick_dates, labels=tick_labels)
    y_scale = scale_y_continuous(limits=(0, y_top), expand=(0, 0, 0.1, 0))
    labels = labs(y=y_lab, x="", title=title)

    return base + line + x_scale + y_scale + labels + theme_light()
|
|
|
|
|
def plot_tf(df: DataFrame, frequency: str, rule_types: list | None = None, **kwargs) -> ggplot:
    """Plot rules over time, dispatching to the plotting function for the given frequency.

    Args:
        df (DataFrame): Input data.
        frequency (str): Frequency of time for aggregating rules. Accepts "monthly", "daily", or "weekly".
        rule_types (list | None, optional): One or more rule types included in the data; used to word the y-axis label. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
        **kwargs: Extra keyword arguments forwarded unchanged to the selected plotting function.

    Raises:
        ValueError: Frequency parameter received invalid value.

    Returns:
        ggplot: Plotted data.
    """
    plotters = {
        "monthly": plot_month,
        "daily": plot_day,
        "weekly": plot_week,
    }

    # Reject unknown frequencies before doing any other work.
    if frequency not in plotters:
        raise ValueError(f"Frequency must be one of: {', '.join(plotters)}")

    axis_label = generate_rule_axis_label(rule_types)
    selected_plotter = plotters[frequency]
    return selected_plotter(df, y_lab=axis_label, **kwargs)
|
|