# In [8]:
import json
import numpy as np
import plotly.graph_objects as go
# Fully opaque red as an rgba() colour string; save_for_plot uses it as the
# colour of the dash-dot vertical marker line it adds to every plot layout.
RED_FULL="rgba(255, 0, 0, 1)"

# Define the function 1 - (1 - x^8)^14
def func1(x):
    """Evaluate ``1 - (1 - x^8)^14`` element-wise.

    This is the curve plotted under the label 'FineWeb: 1-(1-s^8)^14',
    where ``x`` is the document similarity ``s``.

    Args:
        x: A scalar or array-like accepted by ``np.power``.

    Returns:
        The value(s) of the expression, with the shape NumPy derives from ``x``.
    """
    inner = 1 - np.power(x, 8)
    outer = np.power(inner, 14)
    return 1 - outer

# Define the function 1 - (1 - x^20)^450
def func2(x):
    """Evaluate ``1 - (1 - x^20)^450`` element-wise.

    This is the curve plotted under the label 'RefinedWeb: 1-(1-s^20)^450',
    where ``x`` is the document similarity ``s``.

    Args:
        x: A scalar or array-like accepted by ``np.power``.

    Returns:
        The value(s) of the expression, with the shape NumPy derives from ``x``.
    """
    inner = 1 - np.power(x, 20)
    outer = np.power(inner, 450)
    return 1 - outer

# Sample the similarity axis on [0, 1] with 1000 evenly spaced points.
x = np.linspace(0, 1, 1000)

# Evaluate both curves on the shared grid.
y1, y2 = func1(x), func2(x)

# One line trace per curve, plus a dashed red vertical marker at s = 0.75.
trace1 = go.Scatter(
    x=x,
    y=y1,
    mode='lines',
    name='FineWeb: 1-(1-s^8)^14',
)
trace2 = go.Scatter(
    x=x,
    y=y2,
    mode='lines',
    name='RefinedWeb: 1-(1-s^20)^450',
)
vertical_line = go.Scatter(
    x=[0.75, 0.75],
    y=[0, 1],
    mode='lines',
    line={'color': 'red', 'dash': 'dash'},
    name='Threshold',
)

# Figure title and axis captions.
layout = {
    'title': {'text': 'MinHash parameters'},
    'xaxis': {'title': {'text': 'Document similarity (s)'}},
    'yaxis': {'title': {'text': 'Matched as dups probability'}},
}


def normalize_run_name(run_name):
    """Return ``run_name`` with every ``/`` replaced by ``_``.

    save_for_plot uses the result as a file-name stem, so the slash must not
    be interpreted as a path separator.
    """
    pieces = run_name.split("/")
    return "_".join(pieces)


def save_for_plot(dir_name, df, views, xlabel="Dataset", ylabel="Matched as dups probability", plot_name="plot name", custom_layout=None, ranges=None, x_column=None, default_metric=None):
    """Write one plot-description JSON file per view, plus an ``index.json``.

    Files are written under ``data/plots/<dir_name>/`` (relative to the
    current working directory), which is created if it does not exist.

    Args:
        dir_name: Sub-directory of ``data/plots`` to write into.
        df: DataFrame with a ``"runname"`` column, one column per entry in
            ``views`` and, when ``x_column`` is given, that column as well.
        views: Column names to export; each produces ``<view>.json`` (with
            ``/`` in the name replaced by ``_``).
        xlabel: X-axis title stored in the layout.
        ylabel: Y-axis title stored in the layout.
        plot_name: Plot title stored in the layout.
        custom_layout: Optional mapping merged into the layout last, so its
            keys override the generated ones. ``None`` means no overrides.
        ranges: Accepted for backward compatibility but currently unused.
        x_column: Column supplying the x values of each run. When falsy, the
            x values are the single-element list ``[run_name]``.
        default_metric: Stored as ``settings.defaultMetric`` in ``index.json``.

    Returns:
        dict mapping each view to ``{"file": <file name>}``, the same mapping
        that is written to ``index.json`` under ``"files"``.
    """
    import os

    # None replaces the former mutable ``{}`` default; callers that pass a
    # dict are unaffected.
    layout_overrides = {} if custom_layout is None else custom_layout
    # NOTE: ``ranges`` is intentionally not applied to the y axis; the
    # per-view range support was disabled in the original code.

    out_dir = f"data/plots/{dir_name}"
    os.makedirs(out_dir, exist_ok=True)

    # The rows belonging to each run do not depend on the view, so select
    # them once instead of once per view.
    rows_by_run = {
        run_name: df[df["runname"] == run_name]
        for run_name in df["runname"].unique()
    }

    # Vertical dash-dot marker at x = 0.75, identical for every view.
    marker_line = {
        "type": "line",
        "x0": 0.75,
        "y0": 0.0,
        "x1": 0.75,
        "y1": 1.2,
        "xref": "x",
        "yref": "y",
        "line": {
            "color": RED_FULL,
            "width": 1,
            "dash": "dashdot",
        },
        "showarrow": False,
    }

    files = {}
    for view in views:
        data = {
            run_name: {
                "x": run_rows[x_column].tolist() if x_column else [run_name],
                "y": run_rows[view].tolist(),
                "label": run_name,
            }
            for run_name, run_rows in rows_by_run.items()
        }
        file_name = f"{normalize_run_name(view)}.json"
        files[view] = {"file": file_name}
        with open(f"{out_dir}/{file_name}", "w", encoding="utf-8") as f:
            json.dump({
                "data": data,
                "layout": {
                    "title": {"text": plot_name},
                    "xaxis": {"title": {"text": xlabel}},
                    "yaxis": {"title": {"text": ylabel}},
                    "shapes": [marker_line],
                    # Spread last so caller-supplied keys win on collision.
                    **layout_overrides,
                },
            }, f)

    # The index lists every generated file together with viewer settings.
    with open(f"{out_dir}/index.json", "w", encoding="utf-8") as f:
        json.dump({
            "files": files,
            "settings": {
                "defaultMetric": default_metric,
                "slider": None,
                "autoSetXRange": False,
            },
        }, f)
    return files

import pandas as pd
# Long-format table: the full similarity grid once per curve, tagged with the
# curve's label in "runname".
df = pd.DataFrame(
    {
        "runname": [
            label
            for label in ("FineWeb: 1-(1-s^8)^14", "RefinedWeb: 1-(1-s^20)^450")
            for _ in range(len(x))
        ],
        "similarity": x.tolist() * 2,
        "prob": [*y1.tolist(), *y2.tolist()],
        "view": ["normal"] * (2 * len(x)),
    }
)

# Vertical legend anchored by its top-left corner at x=0, y=1.
custom_layout = {
    "legend": {"orientation": "v", "xanchor": "left", "yanchor": "top", "x": 0, "y": 1},
}

# Export the "prob" column as the only view, and make it the default metric.
save_for_plot(
    "minhash_params",
    df,
    ["prob"],
    xlabel="Document similarity (s)",
    plot_name="MinHash parameters",
    custom_layout=custom_layout,
    ranges={},
    x_column="similarity",
    default_metric="prob",
)

# Output of the cell above: {'prob': {'file': 'prob.json'}}