diff --git a/.gitattributes b/.gitattributes index ac481c8eb05e4d2496fbe076a38a7b4835dd733d..1ea554a480d0b5b87e83c33975b921551a4770d1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c17485e497a5a240d121479c1fafc6a615d57a75 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*DS_Store diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..352c90c54f554703fab7acec3bb24b26de808e02 --- /dev/null +++ b/app.py @@ -0,0 +1,77 @@ +import streamlit as st + + +PATH_PLOTS = "./plots" + +LANGUAGES = { + "Arabic": "ar", + "Basque": "eu", + "Bengali": "bn", + "Catalan": "ca", + "Chinese": "zh", + "English": "en", + "French": "fr", + "Hindi": "hi", + "Indonesian": "id", + "Portuguese": "pt", + "Spanish": "es", + "Urdu": "ur", + "Vietnamese": "vi", +} + +FILTERS = [ + "number of words", + "character repetition ratio", + "word repetition ratio", + "special character ratio", + "closed class word ratio", + "flagged word ratio", + "perplexity score", +] + + +class Visualization: + def __init__(self): + pass + + def set_title(self): + st.title("Visualization of the distributions of the filter values for the BigScience Corpus") + + def choose_language(self): + chosen_language = st.sidebar.selectbox( + "Language", + options=list(LANGUAGES.keys()), + index=5 # English + ) + self.chosen_language = LANGUAGES[chosen_language] + + def choose_filter(self): + chosen_filter = st.sidebar.selectbox( + "Filter on the", + options=FILTERS, + index=0 + ) + self.chosen_filter = chosen_filter.replace(" ", "_") + + def display_plot(self): + path_image = f"{PATH_PLOTS}/{self.chosen_language}_{self.chosen_filter}.png" + + col1, col2, col3 = st.columns([1,6,1]) + with col1: + st.write("") + with col2: + st.image(path_image) + with col3: + st.write("") + + def visualization(self): + self.set_title() + self.choose_language() + self.choose_filter() + self.display_plot() + + +if __name__ == "__main__": + st.set_page_config(layout="wide") + visualization = Visualization() + visualization.visualization() diff --git a/plots/ar_character_repetition_ratio.png b/plots/ar_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..14b0de87c0a5abff91f06cd1bbdc4629b17c884e --- /dev/null +++ b/plots/ar_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf81a5a0cc829dbe1e2fad3742a576d907945e38612790132e2f84d5daafb34 +size 173621 diff --git a/plots/ar_closed_class_word_ratio.png b/plots/ar_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..b5e3975cd71939fb1bc2821c2a266e014ae7120f --- /dev/null +++ b/plots/ar_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a354e4c26a61f69f7a547ba2e73882e7f3334c48c3cc38eafdf66d74aa2ce060 +size 168956 diff --git a/plots/ar_flagged_word_ratio.png b/plots/ar_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..33f8495a47a8a430ad96a0a05bcf84e3f398821e --- /dev/null +++ b/plots/ar_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904de90ab7b5411433642513eb24455d051b498778fcc1fdd842780b59b0cd74 +size 135737 diff --git a/plots/ar_number_of_words.png b/plots/ar_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..2725b502cea447211488e80700bc30cb05aa1906 --- /dev/null +++ b/plots/ar_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06fe8e274916534a3169a4c014d33fbae6506d4873016629ecf383581f67ac36 +size 158083 diff --git a/plots/ar_perplexity_score.png b/plots/ar_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..86f8bfebc1017acd01ed2dd07ff8f4aaa1cd1d2c --- /dev/null +++ b/plots/ar_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bfaf281edad1587ced695535f957a68b0ada096b4cc89eb45ed0afe03df731e +size 179482 diff --git a/plots/ar_special_character_ratio.png b/plots/ar_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..1b0b20614debef98be0ca1e9713ef712ad34a7f6 --- /dev/null +++ b/plots/ar_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be6fe4bb706e4ef2755384c9b004f573c09407b5ac5f4eeec2126fcfefdeb16 +size 156036 diff --git a/plots/ar_word_repetition_ratio.png b/plots/ar_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..e0073219ca2be56ede490e6cd37b4f83256678eb --- /dev/null +++ b/plots/ar_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0777d3e8cf7a099031bc4ba62106ca3e9ef7b77860b11c23281739e006ff11f3 +size 123827 diff --git a/plots/bn_character_repetition_ratio.png b/plots/bn_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..cf617daf30784af23533ae17b0c7721c61d24243 --- /dev/null +++ b/plots/bn_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c1ef996fa3dfdd18f72ae6d8a6af5a2f310576b35f066cbaec152e22e4b8ef +size 144644 diff --git a/plots/bn_closed_class_word_ratio.png b/plots/bn_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..ccae3ed5c8c8286d9a1ad2c301fcb3e9044b43e9 --- /dev/null +++ b/plots/bn_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d29968260309af1bbdb3321a4dabe1cbef8feafd3eebfd999cca6219867b4c +size 124211 diff --git a/plots/bn_flagged_word_ratio.png b/plots/bn_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..6325c42a48a75731c539b357b7ce1253bc0f5099 --- /dev/null +++ b/plots/bn_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32680c70a68f6894db37abfd93e61f6ef5ec45478197baf8993f81077eaaa1b7 +size 93407 diff --git a/plots/bn_number_of_words.png b/plots/bn_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..577d848544101dc96b3581ddbf5d9366d270e430 --- /dev/null +++ b/plots/bn_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89dc8ae8a078c06daf5d90c5a3399a93f8d6dc6f0f1e741de89d228ddd42930b +size 121482 diff --git a/plots/bn_perplexity_score.png b/plots/bn_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..90cffc9d9880f098e8f981c902cf3214c42026d4 --- /dev/null +++ b/plots/bn_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da594778413eb7fd1ff8f792aaf624a2269f5b6a993b89cff8b93be79d4dcbcc +size 149074 diff --git a/plots/bn_special_character_ratio.png b/plots/bn_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..e4858ab5df2f681dd1d50dce6652848accc43873 --- /dev/null +++ b/plots/bn_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb4f5ba2090f0ac28ffc6b44dd16caa4f8c1d038678e53df6c795ff438eb127 +size 127342 diff --git a/plots/bn_word_repetition_ratio.png b/plots/bn_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..0b5996aee995badaa82dde76916f971bd6f3a965 --- /dev/null +++ b/plots/bn_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:039f065d3bfb23288f19f5fdf0adffd2e708e6f02da0a8d29252cc8ff8b227a5 +size 108183 diff --git a/plots/ca_character_repetition_ratio.png b/plots/ca_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..a24a3947da750031b8a2283b8975342402168a9f --- /dev/null +++ b/plots/ca_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8812dd8abb9a6a5e7f3a43942ef57275cba538cd33284afb29a6472428fa3c63 +size 142847 diff --git a/plots/ca_closed_class_word_ratio.png b/plots/ca_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..6a20b82d1e429fc1d68d40741ad1b50fe12769bf --- /dev/null +++ b/plots/ca_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2eb8d0bf4cbcfdeb4b8a7339a70958d0a3ef6b9a29713ea8757f339a7f67ce6 +size 130055 diff --git a/plots/ca_flagged_word_ratio.png b/plots/ca_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..c75be4630d763cb6abf2d4147bb73a63e43a062c --- /dev/null +++ b/plots/ca_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eace8e80e653296e1fc5db380a43424b30a9e2f82d7f26c362b8f7735e0b677 +size 115831 diff --git a/plots/ca_number_of_words.png b/plots/ca_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..7c340bb090067115610adb104c7870ab74b8c9c3 --- /dev/null +++ b/plots/ca_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c598b50d5a2a471830f26c780035848ee52fc5401c92e917507bfeeb2535ce2 +size 131659 diff --git a/plots/ca_perplexity_score.png b/plots/ca_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..01b2a24bcb750e58424a22d5e794f225e77d654d --- /dev/null +++ b/plots/ca_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf0530f78f3c7bf0001b5ca99bf7a38786b807916ab9659a36f638a01d01314d +size 158136 diff --git a/plots/ca_special_character_ratio.png b/plots/ca_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..6440240d461e29518677a95d8803b07850c4ca9b --- /dev/null +++ b/plots/ca_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:649605a63565597f49af5e046c3220ed2f77e1417d4e5019d95bc64956add00c +size 125503 diff --git a/plots/ca_word_repetition_ratio.png b/plots/ca_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..136861291c800269c5a2730bec0872cb8f1cffcb --- /dev/null +++ b/plots/ca_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baddc170825ec53dd34d6f6b33dd475b640629311fa4bed7df188552c3549448 +size 106133 diff --git a/plots/en_character_repetition_ratio.png b/plots/en_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..145dc1a37b906c02053f31a49997e5db0319691e --- /dev/null +++ b/plots/en_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17cf5a75d99d8cf415e103ddfd499220418e5643fd9181e502ee7e9a494c0dec +size 163879 diff --git a/plots/en_closed_class_word_ratio.png b/plots/en_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..0fba1d346163e012dd725b6d38554979f88911a1 --- /dev/null +++ b/plots/en_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c5f0b06125a0d0078c66a7beb5e49df17244244b646dbc3f5c1f533803492f +size 170783 diff --git a/plots/en_flagged_word_ratio.png b/plots/en_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..3070f018763423096bf30454c973714fd1fd9bee --- /dev/null +++ b/plots/en_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16d318728e392dd4bbfcce24eb58b6adbc7970f2081cdff23c7243085c96e272 +size 124373 diff --git a/plots/en_number_of_words.png b/plots/en_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..9e6301640f6fbd1008387bdd9779e43574a2949f --- /dev/null +++ b/plots/en_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f4cb273978e5ca2604be4900b4fd741a6909eaa4361cafcf603feb0688748e +size 138062 diff --git a/plots/en_perplexity_score.png b/plots/en_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..68579d267940da047279a3e52149e88dbc966a31 --- /dev/null +++ b/plots/en_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d560e61df00d1a13bd7841fcf94b8544f9976fbc5d4fd12a1f868d7287ebc402 +size 167673 diff --git a/plots/en_special_character_ratio.png b/plots/en_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..c0c7b7da77ad6d616d68a4bc2d9e46752459604a --- /dev/null +++ b/plots/en_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fa5ca7109c84a26d7667ac7d22c2bcf08d426c86d53243cf70ed3871df7de73 +size 143933 diff --git a/plots/en_word_repetition_ratio.png b/plots/en_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..b9a796af9d8dfb32cdec5ae02018e56f47888e39 --- /dev/null +++ b/plots/en_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9a565010dc67a9655c8d5c78b75fc14537898eea5b41a96f977ec81a7eee18 +size 120704 diff --git a/plots/es_character_repetition_ratio.png b/plots/es_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..b4a74deab047e69d01140f9cb64360f6e1a7bb74 --- /dev/null +++ b/plots/es_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b13f3d5c9f5ee4fcb451acfe10445e8be9804c25244332e7471c4e56e80639 +size 167007 diff --git a/plots/es_closed_class_word_ratio.png b/plots/es_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..b002b9173aa6d6315d1a7dc4a0c30f831f865516 --- /dev/null +++ b/plots/es_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f182f4ea9c48b6449c93c80b5efe50c7d12db25edbafb84bae0d8e46bbee5f +size 160577 diff --git a/plots/es_flagged_word_ratio.png b/plots/es_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..2d426270dfc880946ed6d4ea96662fb6d23ed0e7 --- /dev/null +++ b/plots/es_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5729f9d8968cec508756a523e875fea51d86905573552f8c4bdebdcfc9f5de9b +size 112623 diff --git a/plots/es_number_of_words.png b/plots/es_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..afa99754de61e637949cd76baae725f609d3e638 --- /dev/null +++ b/plots/es_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060c08dd8b6791ebdba01dba035039ecdc5f278f52106aeab7730beb7ae7f818 +size 129494 diff --git a/plots/es_perplexity_score.png b/plots/es_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..07641683c1e5533490126f3e2cf199ad8d40a408 --- /dev/null +++ b/plots/es_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f79d8e9248b211527c93f82c4e63b18804d8d765e6ac9314a65b10d9bf310c +size 185849 diff --git a/plots/es_special_character_ratio.png b/plots/es_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..a88b2bfed8e65a5f075899f58c31b920fc071349 --- /dev/null +++ b/plots/es_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b40dd9627369aafbc1749410b73629078f38d8abbdde81abdce134608f2b90b +size 147501 diff --git a/plots/es_word_repetition_ratio.png b/plots/es_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..cfdb8d93012d07c3ea9dd46e0b31d9925c3c6d6e --- /dev/null +++ b/plots/es_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1064d62b8fc32db42271f6cf26cc74242e3772449bc49479082ad23bf1392350 +size 114317 diff --git a/plots/eu_character_repetition_ratio.png b/plots/eu_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..43be4f117397e8ab3d7296a7767f7f5d47991273 --- /dev/null +++ b/plots/eu_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3d825009e273f2f3167f10f78e2f2bc5e69e92965846b113f08386c7bdee17f +size 158696 diff --git a/plots/eu_closed_class_word_ratio.png b/plots/eu_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..92fc5ac9d1fba765be55d3e220a41b6779f751bb --- /dev/null +++ b/plots/eu_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ceaa4fdc65dc19f5e18f04bf73bb6532559f432b7823182323718e5df2c1f9 +size 187877 diff --git a/plots/eu_flagged_word_ratio.png b/plots/eu_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..d876a7514f86673e53f40c48eeac72ab65471500 --- /dev/null +++ b/plots/eu_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce750c5e272c6dfef45852c4facd82fafac81d771de9c8818cd81312880e953b +size 118246 diff --git a/plots/eu_number_of_words.png b/plots/eu_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..335712e6526c9eabdf0b174dafaab5bcff1341b9 --- /dev/null +++ b/plots/eu_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e44bdae7266b683577a7ffb8f9ce4dbe93eb841f45fd3d0909ee04afffa999 +size 132606 diff --git a/plots/eu_perplexity_score.png b/plots/eu_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..6f20a4652b33e129f611ce57e5679ab1d7e87f1a --- /dev/null +++ b/plots/eu_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79212bb07f2f3aa88f07f6512c58a2c6628b28ac63d9e6fcb54623d68ac48d27 +size 158453 diff --git a/plots/eu_special_character_ratio.png b/plots/eu_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..6c21ef26277a8327fc44b9065141c915aea953a7 --- /dev/null +++ b/plots/eu_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d994f85ccab727aa0d13bc61a14f817a7cb2f73c96c10463f2c420406247925d +size 156089 diff --git a/plots/eu_word_repetition_ratio.png b/plots/eu_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..032603e5acd11e160bb84b1add374f0054e17332 --- /dev/null +++ b/plots/eu_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1909e1c7780196f01e86c211cadb55889307a7f30a3def9e2d94c6486a338351 +size 108896 diff --git a/plots/fr_character_repetition_ratio.png b/plots/fr_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..4e7c81a1951cfe4305ffac8b8fb7009e9e24e534 --- /dev/null +++ b/plots/fr_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb72a7cc99bf105f89bec95278793984a1c56817cbb6a39b435f50255f038318 +size 157924 diff --git a/plots/fr_closed_class_word_ratio.png b/plots/fr_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..e5617f5d2cac28233808c0b53423fac2d46d89e1 --- /dev/null +++ b/plots/fr_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eab9d238667126eca14243c58ee679c681dafa1db59f69a79f7ddb6135f9952 +size 149680 diff --git a/plots/fr_flagged_word_ratio.png b/plots/fr_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..408fb01c0aeb6950269670e1135d50d1bdb310e8 --- /dev/null +++ b/plots/fr_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76137a58b8389671c5a8d44592609095f12a48e27dc821733ad9c0a7a846187 +size 125404 diff --git a/plots/fr_number_of_words.png b/plots/fr_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..952d0d3a7d46da3772d21eb0358bf44542684e36 --- /dev/null +++ b/plots/fr_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0706293ebc11343405826da43a83afc658217562e5acdde4b44a34b8d2244234 +size 123960 diff --git a/plots/fr_perplexity_score.png b/plots/fr_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..e7ed49e2e489c7871d5817de5e5039718c19b1b0 --- /dev/null +++ b/plots/fr_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf3406197464ee54623e5c3fb96af20107b073c638fb58c2d97d8e7930b4eb7 +size 148864 diff --git a/plots/fr_special_character_ratio.png b/plots/fr_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..7ddcdf62778c0362258ae78a0876ab32e9cdf17f --- /dev/null +++ b/plots/fr_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8676569f0620ec07bd843c8d8f30d4b7c94154af22ce1301cc227ff2800d68e +size 147657 diff --git a/plots/fr_word_repetition_ratio.png b/plots/fr_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..3072eb1f1f1277dac8e9f595b5a16ac202014f2b --- /dev/null +++ b/plots/fr_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66aa8111587c721b04ba40a692c0210f0b24151c1f07deba91ede27d72e258bb +size 112400 diff --git a/plots/hi_character_repetition_ratio.png b/plots/hi_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..b26a540ec8352b5f795e14cae5d9407b3d25c371 --- /dev/null +++ b/plots/hi_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a3b7d595ec76cd69e798f83d0e00ec55182e5783840c1f331d6f15eb3915b2e +size 158549 diff --git a/plots/hi_closed_class_word_ratio.png b/plots/hi_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..6e637b630090e063c44128abf1e92dc3c455ee18 --- /dev/null +++ b/plots/hi_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1196692b0190846ae9a9fbe4d18f58473efde1c33604c6f93d0bf255cdbe56e4 +size 168159 diff --git a/plots/hi_flagged_word_ratio.png b/plots/hi_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..d1b87080c576ded8d24db3bc6e75adc5e6d35772 --- /dev/null +++ b/plots/hi_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87ebbcc5e76d796a850b94227b1d64c62ff24efdfda2bafb11b8e60319c047ec +size 102635 diff --git a/plots/hi_number_of_words.png b/plots/hi_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..de95642d8cc856bf379667c22f9b309904f0dbae --- /dev/null +++ b/plots/hi_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3460c68756a156a2a67d1837ecea2edf64c199dd60433a59bcc9b89aeddf1ab3 +size 128726 diff --git a/plots/hi_perplexity_score.png b/plots/hi_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..d9ca1ed93dc72cdbd2f650718e87603248ffe43d --- /dev/null +++ b/plots/hi_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bfb546fdfd36f1868d1c769e07933c8c0d291f5e67af717d4aeb391427853c9 +size 180328 diff --git a/plots/hi_special_character_ratio.png b/plots/hi_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..7681c112dc1963f7eec6bd54808e20f8590e8fd8 --- /dev/null +++ b/plots/hi_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97c76659155c9827e9d862641e6501c4bdd6de0ae23c92faebe77490fa79c808 +size 137083 diff --git a/plots/hi_word_repetition_ratio.png b/plots/hi_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..bf46c1a2d912c0f7d8b89f84d22cbdf12a69b486 --- /dev/null +++ b/plots/hi_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d67bcf86b3ee5ee506ea29b81e3e0d5ade532a8a6558cf55f5c752d287c984a +size 102535 diff --git a/plots/id_character_repetition_ratio.png b/plots/id_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..5a0b6a8d516ed262da34bc62620cdfb2096acebf --- /dev/null +++ b/plots/id_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f0904a67af7da7047855c4bef4b2fde6c622a8ef46c6d6393afc110abcfac5e +size 170094 diff --git a/plots/id_closed_class_word_ratio.png b/plots/id_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..2fe4343be47be1edf9888c6c35a8411014ce848c --- /dev/null +++ b/plots/id_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225384662fe04dcc8503c885004203f61fd4c90b35835e33e0711cc72213499c +size 178841 diff --git a/plots/id_flagged_word_ratio.png b/plots/id_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..f44c7bc9946bd4137aef764a801404ec6c0e6289 --- /dev/null +++ b/plots/id_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5583bdf37b00bcfab3b30914bab7a993ab3c94c95ef6f48c891dd51e7ca1b8 +size 141307 diff --git a/plots/id_number_of_words.png b/plots/id_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..92c30345f4ed342fb027281412c64743258e1415 --- /dev/null +++ b/plots/id_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43d13acad75ea301a5ea28276ba7055bd8c01390fea58a7f4aedd003a7aa90f +size 157669 diff --git a/plots/id_perplexity_score.png b/plots/id_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..e088106f35b0858b014fc50341f9102cd7e60524 --- /dev/null +++ b/plots/id_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:632ab1b27b0d4c2b32e9da92f352714140567a000554a6c275cb7919abac5bf3 +size 185645 diff --git a/plots/id_special_character_ratio.png b/plots/id_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..97b32c2d9389f16f8cdf3047157dfbb64df7bf64 --- /dev/null +++ b/plots/id_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e71294b95a3021192ffd2e389f9d78d6b56b25f4f815628376ffd77b36ef9ef +size 151627 diff --git a/plots/id_word_repetition_ratio.png b/plots/id_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..59640b406dab53185da88b6c3efa446d183d606e --- /dev/null +++ b/plots/id_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae7f7039789819848e67af56242603be1a67e22be247edcc9c8cf86fd25843a +size 123190 diff --git a/plots/pt_character_repetition_ratio.png b/plots/pt_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..ee3384b398dedd217213a6105693b9314e3fc4e9 --- /dev/null +++ b/plots/pt_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc65459da9465a0375dc5066f32444bf51ad1a80d0fa0525141987456018957 +size 168370 diff --git a/plots/pt_closed_class_word_ratio.png b/plots/pt_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..75b87796180300e08f3a5d3f770082548a12f403 --- /dev/null +++ b/plots/pt_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f8e16ab07f6c94a692d58f16a49ac2ddc51beed01bee41c322811a611257130 +size 176287 diff --git a/plots/pt_flagged_word_ratio.png b/plots/pt_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..ceebb3f001e6eb06ed9e8c7881b595635aeba998 --- /dev/null +++ b/plots/pt_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e565ee15480a32d3104e407f79b4cbd50b3e53e43d4a97552d7a7178e9575a +size 119123 diff --git a/plots/pt_number_of_words.png b/plots/pt_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..3233073e61ffe6cac10c85b8cfec564650e1138a --- /dev/null +++ b/plots/pt_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb39972af989e7c673fc752eff8b2a72244cc756c5ecef0dd755879e9ecb1dd +size 142510 diff --git a/plots/pt_perplexity_score.png b/plots/pt_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..948a7d3d9a1e3206733c7a1556b19ce0204d492a --- /dev/null +++ b/plots/pt_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b198c30074bc112725c15aca79f99c83da3a8eeebdc84cc117bca9913ff1732d +size 180941 diff --git a/plots/pt_special_character_ratio.png b/plots/pt_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..9f563ffe72b508e4b63d826b31f9fb2266d9b341 --- /dev/null +++ b/plots/pt_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6274ad882fac16593a84dc798d38162c0f774fbe6b7fe114316bbfee9f2db04 +size 153363 diff --git a/plots/pt_word_repetition_ratio.png b/plots/pt_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..0aaaca736f4d31fa0a45268747a975b18aaf5204 --- /dev/null +++ b/plots/pt_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501981f2ab9b946e4593cc5769664813971af17cac63e61a46362d169d119e6b +size 119477 diff --git a/plots/ur_character_repetition_ratio.png b/plots/ur_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..0047461dc735b4c89306e3794ba6fb5e37503b9e --- /dev/null +++ b/plots/ur_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958c1668ad15fe43d94eb7abf19f4fb5388fa8e0ce3ea0e021a3a4d8989d8349 +size 125276 diff --git a/plots/ur_closed_class_word_ratio.png b/plots/ur_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..50614ccea1faa4eb0ee9f6db0e9dbed9b7e8f5c4 --- /dev/null +++ b/plots/ur_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba44afd1fd2d568bce1af7fba1ca64cd48f12e0895aa251c30294079ca26d5e9 +size 166078 diff --git a/plots/ur_flagged_word_ratio.png b/plots/ur_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..d05a20d1ab8275f6ecfcad8ca4eb7950e19271d8 --- /dev/null +++ b/plots/ur_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc2bf3431ea98f2db3a04e5af254637bd2926d92a44452f7e7ac165d13e70246 +size 106455 diff --git a/plots/ur_number_of_words.png b/plots/ur_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..fef82de6adab5761ee702eb9024e7947b54fd7b4 --- /dev/null +++ b/plots/ur_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b6b8290914816fa4a7ca0f8c5840b2b1b1e06b6127013656003e188acf1497 +size 105619 diff --git a/plots/ur_perplexity_score.png b/plots/ur_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..2fd79c38bb97493731f78c1c80377539dfe8dec2 --- /dev/null +++ b/plots/ur_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f52532ad31f7c315b48e39a89cfebb39545665212ee15471cb709645bfc0c8c +size 160741 diff --git a/plots/ur_special_character_ratio.png b/plots/ur_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..23116f24a27697ad3586333a706172a880a4934d --- /dev/null +++ b/plots/ur_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43bed1f6decd8f94c7b1d6300aa82d88e29af5280135f902801bba062541618 +size 126502 diff --git a/plots/ur_word_repetition_ratio.png b/plots/ur_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..c7cdbc1b885d4a771337eeaff868137f1b0fb47c --- /dev/null +++ b/plots/ur_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad517ee0d88070475a871546038d27dee9ae9cfcdf7aecb69db57050fce1ddb1 +size 102442 diff --git a/plots/vi_character_repetition_ratio.png b/plots/vi_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..c5f2138e29a933064fee8f609497e36a5dd7bb74 --- /dev/null +++ b/plots/vi_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6acd3f46f8b2a30550f37edb9c14570628464a37fb5b595e8602278faddacd9 +size 132811 diff --git a/plots/vi_closed_class_word_ratio.png b/plots/vi_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..5e4ba1ae4526b67cf1ab69590a3249e929f0815b --- /dev/null +++ b/plots/vi_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c246fd9fc0fe5d428c424281f76464ae15dfd22d736be795b6c1b9c82fd7fdd8 +size 152334 diff --git a/plots/vi_flagged_word_ratio.png b/plots/vi_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..73203c637b8fbb2f0862093abb9d170d790ab211 --- /dev/null +++ b/plots/vi_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:786f1645e9c238ed4124a8040efb700139dc4304d2280b16922241e5d0562816 +size 106870 diff --git a/plots/vi_number_of_words.png b/plots/vi_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..836f67ef38bb750abe1d4b1d95acdd8fd276b066 --- /dev/null +++ b/plots/vi_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c692d7a02d857ecfcdaf46a3cf6292c4a439a61efdcb5b3775786930e382aa21 +size 106236 diff --git a/plots/vi_perplexity_score.png b/plots/vi_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..e984633cb5b4f55bba30a4a55b6d2162ce904421 --- /dev/null +++ b/plots/vi_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0723d7b20157fe324e579dd4b85c0825eaa25cf6f9a02b02e4112cb8bcf15c34 +size 151749 diff --git a/plots/vi_special_character_ratio.png b/plots/vi_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..91f0f7dfd73b3f95aea830ff33b7995247ab98f0 --- /dev/null +++ b/plots/vi_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c5f78a322b27b95221612b0177a83d920cf26d361070d54be58586b4091a27 +size 133729 diff --git a/plots/vi_word_repetition_ratio.png b/plots/vi_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..bc2d13ba488481c4d79a92b6f9afe7a5a67ba1f8 --- /dev/null +++ b/plots/vi_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c0056741059289f3cc116883f3d3390d8141de67f49bbc5ff5ba2d3c26dc15 +size 108754 diff --git a/plots/zh_character_repetition_ratio.png b/plots/zh_character_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..1978517c15294037bb82e0668a98e45e20cd4a75 --- /dev/null +++ b/plots/zh_character_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e214b2d426b9ab0bc2a24cc9a529978c411a2e17b744fbeb3296e7b0ff8e1e4e +size 160035 diff --git a/plots/zh_closed_class_word_ratio.png b/plots/zh_closed_class_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..99048d04906baa8bfb2f972951a15417b800efb1 --- /dev/null +++ b/plots/zh_closed_class_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605646f10050a8454ac366b0dd4eeaf09763f1b34101ecf708066a4af530c157 +size 170979 diff --git a/plots/zh_flagged_word_ratio.png b/plots/zh_flagged_word_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..90cb5a91917375b13a1c9bfff4ee65d329cbb810 --- /dev/null +++ b/plots/zh_flagged_word_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04e048de1bbba58c6d142ebe646e1c6aee50ec12274b8864d14dde4c33f3540e +size 132581 diff --git a/plots/zh_number_of_words.png b/plots/zh_number_of_words.png new file mode 100644 index 0000000000000000000000000000000000000000..3c0ee7947e96bb9753afedf30ada61206b89a65a --- /dev/null +++ b/plots/zh_number_of_words.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686d20d6c542db07b6742f75b7a3bf50144b2d99391bf02366cf904b10a0f930 +size 151241 diff --git a/plots/zh_perplexity_score.png b/plots/zh_perplexity_score.png new file mode 100644 index 0000000000000000000000000000000000000000..4791a65bebfa739bda32bb236dbe409ce9ab3941 --- /dev/null +++ b/plots/zh_perplexity_score.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59af7f4a6863f773b15a6ae7838ed01d84760478734915175ec8d0894777c4ec +size 176268 diff --git a/plots/zh_special_character_ratio.png b/plots/zh_special_character_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..41ab1b9c689be153dccc5980eec6ca94c7b93d33 --- /dev/null +++ b/plots/zh_special_character_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fee9251c2063733e2794185b20f34252a02f110f88718b5fd15c0e4434722c6 +size 157665 diff --git a/plots/zh_word_repetition_ratio.png b/plots/zh_word_repetition_ratio.png new file mode 100644 index 0000000000000000000000000000000000000000..8f4310eb4b837c943c1f0c334c151988e00f5ccf --- /dev/null +++ b/plots/zh_word_repetition_ratio.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9efb53f99b39b99c5945bfbd6f28121880aedf85efdf27b498b3c4313053e448 +size 122307