"""Gradio demo for exploring filtering thresholds on per-dataset text statistics.

For every dataset a handful of per-document statistics is stored in
``dataset_data``.  The UI lets the user pick a dataset and a threshold and
shows a histogram of the chosen statistic with the threshold marked, together
with the share of documents that fall below it.

The statistics are currently random placeholder values, not real measurements.
"""

from functools import partial

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np

# Sub-directories of the "CarperAI/pile-v2-small" dataset this demo covers.
# The real data for each entry was previously sketched (commented out) as
#     load_dataset("CarperAI/pile-v2-small", data_dir=f"data/{name}")
# NOTE(review): `load_dataset` is not imported in this file; presumably it is
# `datasets.load_dataset` -- confirm before wiring in the real data.
DATASET_NAMES = (
    "AI4Code",
    "AMPS",
    "ASFPublicMail",
    "Books3",
    "CPDataset",
    "DMMath",
    "Discourse",
    "Enwiki",
    "EuroParliamentProceedings",
    "FreeLaw_Options",
    "GitHubDiff",
    "GitHubIssues",
    "Gutenberg",
    "LeetCode",
    "PileOfLaw",
    "PubMed",
    "S2ORC",
    "StackExchange",
    "USENET",
    "USPTO",
    "UbuntuIRC",
    "arXiv",
)

# Number of placeholder samples generated per statistic.
NUM_FAKE_SAMPLES = 1000


def _make_fake_stats(num_samples=NUM_FAKE_SAMPLES):
    """Return placeholder statistics for one dataset.

    Args:
        num_samples: Number of values to generate for each statistic.

    Returns:
        A dict with the keys ``"word_rep_ratios"``, ``"char_rep_ratios"`` and
        ``"flagged_word_ratios"`` (each ``np.random.randn(num_samples)``) and
        ``"num_words"`` (``np.random.randint(0, 1000, num_samples)``).
    """
    return {
        "word_rep_ratios": np.random.randn(num_samples),
        "char_rep_ratios": np.random.randn(num_samples),
        "flagged_word_ratios": np.random.randn(num_samples),
        "num_words": np.random.randint(0, 1000, num_samples),
    }


# Maps dataset name -> statistic name -> 1-D array of per-document values.
dataset_data = {name: _make_fake_stats() for name in DATASET_NAMES}


def plt_plot(ratio, dataset, threshold):
    """Plot a histogram of one statistic of one dataset with a threshold line.

    Args:
        ratio: Key of the statistic inside ``dataset_data[dataset]``
            (e.g. ``"char_rep_ratios"``).
        dataset: Key of the dataset inside ``dataset_data``.
        threshold: Value at which the vertical marker is drawn.

    Returns:
        The matplotlib figure containing the histogram.

    Raises:
        KeyError: If ``dataset`` or ``ratio`` is not present in
            ``dataset_data``.
    """
    x = dataset_data[dataset][ratio]

    # Share of values strictly below the threshold, shown in the title as
    # "removed".
    # NOTE(review): this treats values *below* the threshold as removed;
    # confirm that this is the intended filter direction for every statistic.
    perc = np.sum(x < threshold) / len(x) if len(x) else 0.0

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(x, bins=50, color="black")
    # Red dashed marker at the chosen threshold.
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    ax.set_title(f"{dataset} (removed {perc:.2%})")

    # Use the Axes/Figure methods rather than the pyplot "current figure"
    # helpers so that the calls always act on the figure created above.
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    fig.tight_layout()

    # Figures created through pyplot stay registered until closed explicitly;
    # unregister this one so repeated clicks do not accumulate figures.  The
    # returned Figure object itself remains usable for rendering.
    plt.close(fig)
    return fig


# One entry per tab: (tab title, statistic key, slider maximum).
# NOTE(review): the first tab's slider runs from 0 to 100 while the others run
# from 0 to 1 -- kept as found; confirm whether 100 is intentional.
RATIO_TABS = (
    ("Character Repetition Ratio", "char_rep_ratios", 100),
    ("Word Repetition Ratio", "word_rep_ratios", 1),
    ("Flagged Word Ratio", "flagged_word_ratios", 1),
)

with gr.Blocks() as demo:
    dataset = gr.Radio(list(dataset_data.keys()), label="Dataset", value="arXiv")
    print(dataset.value)

    for tab_title, ratio_key, max_threshold in RATIO_TABS:
        with gr.Tab(tab_title):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=max_threshold, label="Threshold")
            calculate = gr.Button("Calculate")
            # Bind the statistic now; the click passes (dataset, threshold).
            plot_fn = partial(plt_plot, ratio_key)
            calculate.click(plot_fn, [dataset, threshold], plot)


if __name__ == "__main__":
    demo.launch(share=True)