# pilev2_pipeline/app.py
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from functools import partial
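# The per-dataset statistics below are placeholders; the real numbers would be computed
# from the pile-v2-small subsets (loaded with `from datasets import load_dataset`, the
# Hugging Face `datasets` library), as sketched in the commented-out calls: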
# ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
# amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
# apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
# books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
# cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
# dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
# discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
# wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
# euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
# freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
# ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
# ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
# gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
# leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
# pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
# pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
# s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
# se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
# usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
# uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
# ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
# arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")
# Placeholder statistics for every subset: random values standing in for the real
# word-repetition, character-repetition, and flagged-word ratios per document.
subset_names = [
    "AI4Code", "AMPS", "ASFPublicMail", "Books3", "CPDataset", "DMMath",
    "Discourse", "Enwiki", "EuroParliamentProceedings", "FreeLaw_Options",
    "GitHubDiff", "GitHubIssues", "Gutenberg", "LeetCode", "PileOfLaw", "PubMed",
    "S2ORC", "StackExchange", "USENET", "USPTO", "UbuntuIRC", "arXiv",
]
dataset_data = {
    name: {
        # fake data for the different ratios
        "word_rep_ratios": np.random.randn(1000),
        "char_rep_ratios": np.random.randn(1000),
        "flagged_word_ratios": np.random.randn(1000),
        "num_words": np.random.randint(0, 1000, 1000),
    }
    for name in subset_names
}
def plt_plot(ratio, dataset, threshold):
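    """Histogram of one statistic for one subset, with the filter threshold marked.

    ratio: key into dataset_data (e.g. "char_rep_ratios").
    dataset: name of the pile-v2 subset to plot.
    threshold: cutoff drawn as a dashed red line; values below it count as removed.
    """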
x = dataset_data[dataset][ratio]
    # fraction of documents that would be removed at this threshold
    # (here: the share of values falling below the cutoff)
    perc = np.sum(x < threshold) / len(x)
# create a figure
fig = plt.figure()
# add a subplot
ax = fig.add_subplot(111)
    # histogram of the selected statistic
ax.hist(x, bins=50, color="black")
# plot red dashed line at threshold
ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
    # title includes the fraction of documents removed at this threshold
    ax.set_title(f"{dataset} (removed {perc:.2%})")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
# make it look nice
plt.tight_layout()
return fig
with gr.Blocks() as demo:
dataset = gr.Radio(list(dataset_data.keys()), label="Dataset", value="arXiv")
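    # One tab per filtering statistic; each "Calculate" button redraws the histogram
    # for the selected dataset and threshold through a partial application of plt_plot.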
with gr.Tab("Character Repetition Ratio"):
# plot some random data
plot = gr.Plot()
        threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
calculate = gr.Button("Calculate")
plot_fn = partial(plt_plot, "word_rep_ratios")
calculate.click(plot_fn, [dataset, threshold], plot)
with gr.Tab("Word Repetition Ratio"):# plot some random data
plot = gr.Plot()
threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
calculate = gr.Button("Calculate")
plot_fn = partial(plt_plot, "char_rep_ratios")
calculate.click(plot_fn, [dataset, threshold], plot)
with gr.Tab("Flagged Word Ratio"):# plot some random data
plot = gr.Plot()
threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
calculate = gr.Button("Calculate")
plot_fn = partial(plt_plot, "flagged_word_ratios")
calculate.click(plot_fn, [dataset, threshold], plot)
if __name__ == "__main__":
demo.launch(share=True)