elineve's picture
Upload 301 files
07423df
raw
history blame
3.04 kB
from typing import List
import pandas as pd
from h2o_wave import data, ui
def histogram_card(
    x,
    a=0.1,
    b=0.9,
    x_axis_description="text_length",
    histogram_box="first",
    title="Text Length (split by whitespace)",
):
    """Build an h2o_wave plot card with an area histogram of the values in ``x``.

    The values are bucketed by ``compute_quantile_df(x, a, b)``; its
    ``length`` column is renamed to ``x_axis_description`` and the resulting
    rows are plotted as an area mark coloured by the ``data_type`` column.

    Args:
        x: Values to plot; forwarded to ``compute_quantile_df`` and also
            counted with ``len`` for the x-axis title.
        a: Lower quantile fraction forwarded to ``compute_quantile_df``.
        b: Upper quantile fraction forwarded to ``compute_quantile_df``.
        x_axis_description: Field name used for the x axis. Must not contain
            spaces.
        histogram_box: Value passed as the card's ``box``.
        title: Value passed as the card's ``title``.

    Returns:
        The object returned by ``ui.plot_card``.

    Raises:
        AssertionError: If ``x_axis_description`` contains a space.
    """
    assert " " not in x_axis_description, (
        "x_axis_description in histogram card must not contain spaces, "
        "as the card would not be rendered."
    )

    # Rename the generic "length" column so the plot field matches the
    # caller-supplied axis name.
    quantiles = compute_quantile_df(x, a, b).rename(
        columns={"length": x_axis_description}
    )

    plot_data = data(
        fields=quantiles.columns.tolist(),
        rows=quantiles.values.tolist(),
        pack=True,
    )
    area_mark = ui.mark(
        type="area",
        x=f"={x_axis_description}",
        x_title=f"Total samples: {len(x)}",
        y="=count",
        y_title="Count",
        color="=data_type",
        shape="circle",
    )

    return ui.plot_card(
        box=histogram_box,
        title=title,
        data=plot_data,
        plot=ui.plot(marks=[area_mark]),
    )
def compute_quantile_df(x: List[int], a: float, b: float) -> pd.DataFrame:
    """
    Compute a per-value count table of ``x`` split into three quantile bands.

    Returns a dataframe with the following columns:
        - length: length of the text
        - count: number of texts with this length
        - data_type: quantile type
            (first (a * 100)% quantile, (a * 100)%-(b * 100)% quantile,
            last (100 * (1 - b))% quantile)

    The rows are the concatenation of the first band, the middle band and the
    last band, in that order. Note that quantiles are overlapping on the
    edges: the boundary value of the first band and the boundary value of the
    last band are repeated in the middle band. Both outer bands always contain
    at least one row (the smallest / largest value), even for ``a == 0`` or
    ``b == 1``.

    Args:
        x: Non-empty list of values (text lengths).
        a: Fraction in [0, 1] marking the end of the first band.
        b: Fraction in [a, 1] marking the start of the last band.

    Returns:
        Dataframe with columns ``length``, ``count`` and ``data_type``. The
        index holds the row positions of the per-value count table, so labels
        repeat where bands overlap.

    Raises:
        ValueError: If ``x`` is empty, or if ``0 <= a <= b <= 1`` is violated.
    """
    if not x:
        raise ValueError("Input list x is empty")
    if not 0 <= a <= b <= 1:
        raise ValueError(
            "Values of a and b must be in [0, 1] "
            "and a should be less than or equal to b"
        )

    x_axis_description = "length"
    df = pd.DataFrame(x, columns=[x_axis_description])
    df["count"] = 1
    # One row per distinct value, ascending, with a fresh 0..m-1 index.
    df_quantile = (
        df.groupby([x_axis_description])
        .sum()
        .reset_index()
        .sort_values(by=x_axis_description)[[x_axis_description, "count"]]
    )

    sorted_data = sorted(x)
    n = len(sorted_data)
    # round() absorbs binary floating-point noise before truncating, e.g.
    # 10 * (1 - 0.8) == 1.9999999999999996 must count as 2, not 1.
    # Clamp to the last valid position so that a == 1 does not index past
    # the end of the list.
    first_index = min(int(round(n * a, 8)), n - 1)
    # At least one element: a count of 0 would turn into index -0 == 0 and
    # select the *smallest* value, labelling every row as "last" quantile.
    last_count = max(int(round(n * (1 - b), 8)), 1)
    first_quantile = sorted_data[first_index]
    last_quantile = sorted_data[-last_count]

    df_first = df_quantile.loc[df_quantile[x_axis_description] <= first_quantile].copy()
    df_first["data_type"] = f"first {int(a * 100)}% quantile"
    df_last = df_quantile.loc[df_quantile[x_axis_description] >= last_quantile].copy()
    df_last["data_type"] = f"last {100 - int(b * 100)}% quantile"
    df_quantile["data_type"] = f"{int(a * 100)}%-{int(b * 100)}% quantile"

    # The middle band starts on the last row of the first band and ends on the
    # first row of the last band (.loc label slicing is inclusive on both ends).
    middle_quantile_min = max(0, len(df_first) - 1)
    middle_quantile_max = len(df_quantile) - len(df_last)
    df_quantile = pd.concat(
        [
            df_first,
            df_quantile.loc[middle_quantile_min:middle_quantile_max],
            df_last,
        ]
    )
    return df_quantile