HugoLaurencon HF staff committed on
Commit
94680d6
1 Parent(s): dc95951

first commit

Browse files
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
1
+ *.DS_Store
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
# Directory holding the plot images; display_plot() looks for files named
# "<language code>_<filter name with underscores>.png" inside it.
PATH_PLOTS = "./plots"

# Display name shown in the sidebar -> language code used in plot file names.
LANGUAGES = dict(
    Arabic="ar",
    Basque="eu",
    Bengali="bn",
    Catalan="ca",
    Chinese="zh",
    English="en",
    French="fr",
    Hindi="hi",
    Indonesian="id",
    Portuguese="pt",
    Spanish="es",
    Urdu="ur",
    Vietnamese="vi",
)

# Filter names as shown in the sidebar; spaces are replaced by underscores
# when the plot file name is built.
FILTERS = [
    "number of words",
    "character repetition ratio",
    "word repetition ratio",
    "special character ratio",
    "closed class word ratio",
    "flagged word ratio",
    "perplexity score",
]
31
+
32
+
33
class Visualization:
    """Streamlit page that shows one pre-rendered filter-distribution plot.

    The sidebar offers a language and a filter; the PNG whose name matches
    that pair is looked up under ``PATH_PLOTS`` and displayed in the page.
    """

    def __init__(self):
        # Populated by choose_language() / choose_filter(); defined here so
        # the attributes always exist on the instance.
        self.chosen_language = None
        self.chosen_filter = None

    def set_title(self):
        """Write the page title."""
        st.title("Visualization of the distributions of the filter values for the BigScience Corpus")

    def choose_language(self):
        """Show the language selector in the sidebar and store the language code."""
        chosen_language = st.sidebar.selectbox(
            "Language",
            options=list(LANGUAGES.keys()),
            index=0,
        )
        # The selector displays names; plot files are keyed by the code.
        self.chosen_language = LANGUAGES[chosen_language]

    def choose_filter(self):
        """Show the filter selector in the sidebar and store the file-name form."""
        chosen_filter = st.sidebar.selectbox(
            "Filter on the",
            options=FILTERS,
            index=0,
        )
        # Plot file names use underscores where the displayed name has spaces.
        self.chosen_filter = chosen_filter.replace(" ", "_")

    def display_plot(self):
        """Display the plot for the current selection, or a warning if it is missing."""
        # Local import so this class does not depend on a file-level import.
        import os

        path_image = f"{PATH_PLOTS}/{self.chosen_language}_{self.chosen_filter}.png"

        # Not every language/filter pair necessarily has a plot on disk;
        # tell the user instead of letting st.image fail on a missing file.
        if not os.path.isfile(path_image):
            st.warning(f"No plot is available for this selection (expected file: {path_image}).")
            return

        # Three columns with a wide middle one, so the image sits in the
        # centre of the page; the outer columns only receive an empty write.
        col1, col2, col3 = st.columns([1, 6, 1])
        with col1:
            st.write("")
        with col2:
            st.image(path_image)
        with col3:
            st.write("")

    def visualization(self):
        """Render the page: title, the two sidebar selectors, then the plot."""
        self.set_title()
        self.choose_language()
        self.choose_filter()
        self.display_plot()
72
+
73
+
74
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    visualization = Visualization()
    visualization.visualization()
plots/eu_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: e528e29511ab985a398c99e969e08c1f9e48dc80df455b3ed7510e476be8c750
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB
plots/eu_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: 92313ec14a1442d628b882171f878b07b907706052c465cf34d12739910e4021
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
plots/eu_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 58028fad5bde422407445ad7cae53226c63b2cc502d2cc93fb6ff71084729566
  • Pointer size: 131 Bytes
  • Size of remote file: 131 kB
plots/eu_number_of_words.png ADDED

Git LFS Details

  • SHA256: 9a8e1ce28ca3cf1769acfe9d81a968bc2b5f29818241def20f66e40626966314
  • Pointer size: 131 Bytes
  • Size of remote file: 126 kB
plots/eu_perplexity_score.png ADDED

Git LFS Details

  • SHA256: b4d46d783558259f5c6832f553da688211cc0c9f4f27177dad610deb86742729
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
plots/eu_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: f57cc31dfaa3fe3ef788843c119d20da431f006e0017484573c79e34efd06f13
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
plots/eu_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: d5744ec3407eef88c708a6d1baee6a94dff3c8984c0617f78bf185d896d70b9b
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB