HugoLaurencon HF staff committed on
Commit
4bbaeac
1 Parent(s): f622ed0

first draft

Browse files
Files changed (1) hide show
  1. app.py +61 -13
app.py CHANGED
@@ -1,23 +1,71 @@
1
  import streamlit as st
2
- import datasets
3
- from functools import partial
 
4
 
5
 
6
- data = datasets.load_dataset("json", data_files="small_test_data.jsonl")["train"].select(range(100))
7
 
8
- bad_cutoff = st.slider('Bad words cutoff', 0, 1)
9
- stp_cutoff = st.slider('Stop words cutoff', 0, 1)
10
- ppl_cutoff = st.slider('ppl cutoff', 0, 1)
11
 
 
 
 
12
 
13
- def filter_ppl(examples, invert=False):
14
- return [ppl < ppl_cutoff for ppl in examples["ppl"]]
15
 
16
- def filter_bad(examples, invert=False):
17
- return [bad < bad_cutoff for bad in examples["bad_words"]]
 
 
 
18
 
19
- def filter_stp(examples, invert=False):
20
- return [stp > stp_cutoff for stp in examples["stop_words"]]
 
 
 
 
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- st.table(data)
 
1
  import streamlit as st
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
 
6
 
7
+ st.title('5k English documents from Oscar with their stats.')
8
 
9
+ path_data = "./10K_english_examples_with_stats.json"
10
+ with open(path_data) as json_file:
11
+ data = json.load(json_file)
12
 
13
+ data = data[:5000]
14
+ data = pd.DataFrame(data)
15
+ del data["len_words"]
16
 
17
+ st.header('Parameters of the filtering')
 
18
 
19
+ cutoff_special_characters_ratio = st.slider("Max cutoff special characters ratio", 0., 1., 1., step=0.01)
20
+ cutoff_stopwords_ratio = st.slider("Min cutoff stopwords ratio", 0., 1., 0., step=0.01)
21
+ cutoff_badwords_ratio = st.slider("Max cutoff badwords ratio", 0., 1., 1., step=0.001)
22
+ cutoff_lang_id_score = st.slider("Min cutoff lang id score", 0., 1., 0., step=0.01)
23
+ cutoff_perplexity_score = st.slider("Perplexity cutoff perplexity score", 0, 14000000, 14000000)
24
 
25
+ keys = [
26
+ ("special_characters_ratio", cutoff_special_characters_ratio, True),
27
+ ("stopwords_ratio", cutoff_stopwords_ratio, False),
28
+ ("badwords_ratio", cutoff_badwords_ratio, True),
29
+ ("lang_id_score", cutoff_lang_id_score, False),
30
+ ("perplexity_score", cutoff_perplexity_score, True),
31
+ ]
32
 
33
+ cond = [(data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff) for key, cutoff, max_cutoff in keys]
34
+ cond = np.all(cond, axis=0)
35
+
36
+ data_keep = data.loc[cond]
37
+ st.header('Data that we keep')
38
+ st.markdown("Click on a column to sort by it.")
39
+ st.markdown("Place the cursor on the text to display it.")
40
+ st.dataframe(data_keep)
41
+
42
+ data_not_keep = data.loc[np.invert(cond)]
43
+ st.header('Data that is thrown away')
44
+ st.markdown("Click on a column to sort by it.")
45
+ st.markdown("Place the cursor on the text to display it.")
46
+ st.dataframe(data_not_keep)
47
+
48
+ def plot_hist(key, num_bins=50):
49
+ st.header(" ".join(key.split("_")))
50
+ hist_values = data[key].values
51
+ max_range = np.max(hist_values)
52
+ hist_values = np.histogram(
53
+ hist_values,
54
+ bins=num_bins,
55
+ range=(0,max_range)
56
+ )[0]
57
+ st.bar_chart(hist_values)
58
+ st.markdown(f"Each bin is of size: {max_range/num_bins}.")
59
+
60
+ for key, _, _ in keys:
61
+ plot_hist(key)
62
+
63
+ st.header('Download data')
64
+
65
+ with open(path_data) as json_file:
66
+ btn = st.download_button(
67
+ label="Download data as json",
68
+ data=json_file,
69
+ file_name='data.json',
70
+ )
71