{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MinHash deduplication parameters\n",
    "\n",
    "Compares the probability that two documents with similarity $s$ are matched as duplicates, $1 - (1 - s^{h})^{b}$, for two parameter choices:\n",
    "\n",
    "- **FineWeb**: $h = 8$, $b = 14$\n",
    "- **RefinedWeb**: $h = 20$, $b = 450$\n",
    "\n",
    "The curves are previewed with Plotly and exported as JSON files under `data/plots/minhash_params/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly.graph_objects as go"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RED_FULL = \"rgba(255, 0, 0, 1)\"\n",
    "\n",
    "# Similarity at which the vertical threshold marker is drawn.\n",
    "SIMILARITY_THRESHOLD = 0.75\n",
    "\n",
    "# Root directory (relative to the working directory) for exported plot data.\n",
    "PLOTS_DIR = \"data/plots\"\n",
    "\n",
    "# Curve labels shared by the preview traces and the exported run names.\n",
    "FINEWEB_LABEL = \"FineWeb: 1-(1-s^8)^14\"\n",
    "REFINEDWEB_LABEL = \"RefinedWeb: 1-(1-s^20)^450\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match-probability curves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def match_probability(s, hashes_per_bucket, num_buckets):\n",
    "    \"\"\"Return 1 - (1 - s^hashes_per_bucket)^num_buckets, element-wise.\n",
    "\n",
    "    Args:\n",
    "        s: Document similarity; a scalar or NumPy array.\n",
    "        hashes_per_bucket: Exponent applied to s.\n",
    "        num_buckets: Exponent applied to (1 - s^hashes_per_bucket).\n",
    "    \"\"\"\n",
    "    return 1 - np.power(1 - np.power(s, hashes_per_bucket), num_buckets)\n",
    "\n",
    "\n",
    "def func1(x):\n",
    "    \"\"\"FineWeb curve: 1 - (1 - x^8)^14.\"\"\"\n",
    "    return match_probability(x, 8, 14)\n",
    "\n",
    "\n",
    "def func2(x):\n",
    "    \"\"\"RefinedWeb curve: 1 - (1 - x^20)^450.\"\"\"\n",
    "    return match_probability(x, 20, 450)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1000 evenly spaced similarity values covering [0, 1].\n",
    "x = np.linspace(0, 1, 1000)\n",
    "\n",
    "y1 = func1(x)\n",
    "y2 = func2(x)\n",
    "\n",
    "# Both curves are probabilities, so they must stay inside [0, 1].\n",
    "assert ((y1 >= 0) & (y1 <= 1)).all() and ((y2 >= 0) & (y2 <= 1)).all()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trace1 = go.Scatter(x=x, y=y1, mode=\"lines\", name=FINEWEB_LABEL)\n",
    "trace2 = go.Scatter(x=x, y=y2, mode=\"lines\", name=REFINEDWEB_LABEL)\n",
    "vertical_line = go.Scatter(\n",
    "    x=[SIMILARITY_THRESHOLD, SIMILARITY_THRESHOLD],\n",
    "    y=[0, 1],\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"red\", dash=\"dash\"),\n",
    "    name=\"Threshold\",\n",
    ")\n",
    "\n",
    "layout = {\n",
    "    \"title\": {\"text\": \"MinHash parameters\"},\n",
    "    \"xaxis\": {\"title\": {\"text\": \"Document similarity (s)\"}},\n",
    "    \"yaxis\": {\"title\": {\"text\": \"Matched as dups probability\"}},\n",
    "}\n",
    "\n",
    "# Assemble the traces and layout into a figure so the preview is actually rendered.\n",
    "fig = go.Figure(data=[trace1, trace2, vertical_line], layout=layout)\n",
    "fig"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export plot data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_run_name(run_name):\n",
    "    \"\"\"Return run_name with every '/' replaced by '_' (used to build file names).\"\"\"\n",
    "    return run_name.replace(\"/\", \"_\")\n",
    "\n",
    "\n",
    "def save_for_plot(\n",
    "    dir_name,\n",
    "    df,\n",
    "    views,\n",
    "    xlabel=\"Dataset\",\n",
    "    ylabel=\"Matched as dups probability\",\n",
    "    plot_name=\"plot name\",\n",
    "    custom_layout=None,\n",
    "    ranges=None,\n",
    "    x_column=None,\n",
    "    default_metric=None,\n",
    "    threshold=SIMILARITY_THRESHOLD,\n",
    "):\n",
    "    \"\"\"Write one JSON plot file per view, plus an index.json, under PLOTS_DIR/<dir_name>.\n",
    "\n",
    "    Args:\n",
    "        dir_name: Sub-directory of PLOTS_DIR to write into (created if missing).\n",
    "        df: DataFrame with a 'runname' column, one column per entry of `views`,\n",
    "            and the `x_column` column when one is given.\n",
    "        views: Column names to export; each one is written to '<view>.json'\n",
    "            (with '/' replaced by '_' in the file name).\n",
    "        xlabel: X-axis title stored in the layout.\n",
    "        ylabel: Y-axis title stored in the layout.\n",
    "        plot_name: Plot title stored in the layout.\n",
    "        custom_layout: Extra top-level layout entries; on key collisions they\n",
    "            override the defaults. None means no extras.\n",
    "        ranges: Accepted for backward compatibility; currently unused (no y-axis\n",
    "            range is written).\n",
    "        x_column: Column providing the x values. When falsy, each run gets the\n",
    "            single x value [run_name].\n",
    "        default_metric: Stored as settings.defaultMetric in index.json.\n",
    "        threshold: x position of the vertical dash-dot marker line; None omits it.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each view to {'file': <file name>}.\n",
    "    \"\"\"\n",
    "    # None instead of a shared mutable {} default.\n",
    "    extra_layout = {} if custom_layout is None else custom_layout\n",
    "\n",
    "    out_dir = Path(f\"{PLOTS_DIR}/{dir_name}\")\n",
    "    out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    shapes = []\n",
    "    if threshold is not None:\n",
    "        shapes.append({\n",
    "            \"type\": \"line\",\n",
    "            \"x0\": threshold,\n",
    "            \"y0\": 0.0,\n",
    "            \"x1\": threshold,\n",
    "            \"y1\": 1.2,\n",
    "            \"xref\": \"x\",\n",
    "            \"yref\": \"y\",\n",
    "            \"line\": {\"color\": RED_FULL, \"width\": 1, \"dash\": \"dashdot\"},\n",
    "            # NOTE(review): confirm the consumer of this JSON needs showarrow on a shape.\n",
    "            \"showarrow\": False,\n",
    "        })\n",
    "\n",
    "    files = {}\n",
    "    for view in views:\n",
    "        data = {}\n",
    "        # One series per run, in order of first appearance.\n",
    "        for run_name in df[\"runname\"].unique():\n",
    "            run_rows = df[df[\"runname\"] == run_name]\n",
    "            data[run_name] = {\n",
    "                \"x\": run_rows[x_column].tolist() if x_column else [run_name],\n",
    "                \"y\": run_rows[view].tolist(),\n",
    "                \"label\": run_name,\n",
    "            }\n",
    "\n",
    "        file_name = f\"{normalize_run_name(view)}.json\"\n",
    "        files[view] = {\"file\": file_name}\n",
    "        payload = {\n",
    "            \"data\": data,\n",
    "            \"layout\": {\n",
    "                \"title\": {\"text\": plot_name},\n",
    "                \"xaxis\": {\"title\": {\"text\": xlabel}},\n",
    "                \"yaxis\": {\"title\": {\"text\": ylabel}},\n",
    "                \"shapes\": shapes,\n",
    "                # Spread last so caller-supplied entries win over the defaults.\n",
    "                **extra_layout,\n",
    "            },\n",
    "        }\n",
    "        with open(out_dir / file_name, \"w\", encoding=\"utf-8\") as f:\n",
    "            json.dump(payload, f)\n",
    "\n",
    "    index = {\n",
    "        \"files\": files,\n",
    "        \"settings\": {\n",
    "            \"defaultMetric\": default_metric,\n",
    "            \"slider\": None,\n",
    "            \"autoSetXRange\": False,\n",
    "        },\n",
    "    }\n",
    "    with open(out_dir / \"index.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(index, f)\n",
    "    return files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prob': {'file': 'prob.json'}}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Long format: one row per (run, similarity) pair.\n",
    "curves_df = pd.DataFrame({\n",
    "    \"runname\": [FINEWEB_LABEL] * len(x) + [REFINEDWEB_LABEL] * len(x),\n",
    "    \"similarity\": x.tolist() + x.tolist(),\n",
    "    \"prob\": y1.tolist() + y2.tolist(),\n",
    "    \"view\": [\"normal\"] * 2 * len(x),\n",
    "})\n",
    "\n",
    "# Vertical legend anchored to the top-left corner of the plot.\n",
    "custom_layout = {\n",
    "    \"legend\": {\n",
    "        \"orientation\": \"v\",\n",
    "        \"xanchor\": \"left\",\n",
    "        \"yanchor\": \"top\",\n",
    "        \"x\": 0,\n",
    "        \"y\": 1,\n",
    "    },\n",
    "}\n",
    "\n",
    "save_for_plot(\n",
    "    \"minhash_params\",\n",
    "    curves_df,\n",
    "    [\"prob\"],\n",
    "    xlabel=\"Document similarity (s)\",\n",
    "    plot_name=\"MinHash parameters\",\n",
    "    custom_layout=custom_layout,\n",
    "    ranges={},\n",
    "    x_column=\"similarity\",\n",
    "    default_metric=\"prob\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "datatrove",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}