prateekagrawal committed on
Commit
2434dff
1 Parent(s): 666b7aa

Changed Layout

Browse files
.gitignore DELETED
@@ -1,131 +0,0 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- pip-wheel-metadata/
24
- share/python-wheels/
25
- *.egg-info/
26
- .installed.cfg
27
- *.egg
28
- MANIFEST
29
-
30
- # PyInstaller
31
- # Usually these files are written by a python script from a template
32
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
- *.manifest
34
- *.spec
35
-
36
- # Installer logs
37
- pip-log.txt
38
- pip-delete-this-directory.txt
39
-
40
- # Unit test / coverage reports
41
- htmlcov/
42
- .tox/
43
- .nox/
44
- .coverage
45
- .coverage.*
46
- .cache
47
- nosetests.xml
48
- coverage.xml
49
- *.cover
50
- *.py,cover
51
- .hypothesis/
52
- .pytest_cache/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- target/
76
-
77
- # Jupyter Notebook
78
- .ipynb_checkpoints
79
-
80
- # IPython
81
- profile_default/
82
- ipython_config.py
83
-
84
- # pyenv
85
- .python-version
86
-
87
- # pipenv
88
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
- # install all needed dependencies.
92
- #Pipfile.lock
93
-
94
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
- __pypackages__/
96
-
97
- # Celery stuff
98
- celerybeat-schedule
99
- celerybeat.pid
100
-
101
- # SageMath parsed files
102
- *.sage.py
103
-
104
- # Environments
105
- .env
106
- .venv
107
- env/
108
- venv/
109
- ENV/
110
- env.bak/
111
- venv.bak/
112
-
113
- # Spyder project settings
114
- .spyderproject
115
- .spyproject
116
-
117
- # Rope project settings
118
- .ropeproject
119
-
120
- # mkdocs documentation
121
- /site
122
-
123
- # mypy
124
- .mypy_cache/
125
- .dmypy.json
126
- dmypy.json
127
-
128
- # Pyre type checker
129
- .pyre/
130
-
131
- .vscode/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,86 +1,14 @@
1
- import json
2
- import random
3
-
4
- import pandas as pd
5
  import streamlit as st
6
- from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
7
-
8
- with open("config.json") as f:
9
- cfg = json.loads(f.read())
10
-
11
-
12
- @st.cache(show_spinner=False, persist=True)
13
- def load_model(masked_text, model_name):
14
-
15
- model = AutoModelForMaskedLM.from_pretrained(model_name)
16
- tokenizer = AutoTokenizer.from_pretrained(model_name)
17
- nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
18
-
19
- MASK_TOKEN = tokenizer.mask_token
20
-
21
- masked_text = masked_text.replace("<mask>", MASK_TOKEN)
22
- result_sentence = nlp(masked_text)
23
-
24
- return result_sentence[0]["sequence"], result_sentence[0]["token_str"]
25
 
26
 
27
  def main():
28
-
29
- st.title("RoBERTa Hindi")
30
- st.markdown(
31
- "This demo uses the below pretrained BERT variants for Mask Language Modeling (MLM):\n"
32
- "- [RoBERTa Hindi](https://huggingface.co/flax-community/roberta-hindi)\n"
33
- "- [Indic Transformers Hindi](https://huggingface.co/neuralspace-reverie/indic-transformers-hi-bert)\n"
34
- "- [HindiBERTa](https://huggingface.co/mrm8488/HindiBERTa)\n"
35
- "- [RoBERTa Hindi Guj San](https://huggingface.co/surajp/RoBERTa-hindi-guj-san)"
36
- )
37
-
38
- models_list = list(cfg["models"].keys())
39
-
40
- models = st.multiselect(
41
- "Choose models",
42
- models_list,
43
- models_list[0],
44
- )
45
-
46
- target_text_path = "./mlm_custom/mlm_targeted_text.csv"
47
- target_text_df = pd.read_csv(target_text_path)
48
-
49
- texts = target_text_df["text"]
50
-
51
- st.sidebar.title("Hindi MLM")
52
-
53
- pick_random = st.sidebar.checkbox("Pick any random text")
54
-
55
- results_df = pd.DataFrame(columns=["Model Name", "Filled Token", "Filled Text"])
56
-
57
- model_names = []
58
- filled_masked_texts = []
59
- filled_tokens = []
60
-
61
- if pick_random:
62
- random_text = texts[random.randint(0, texts.shape[0] - 1)]
63
- masked_text = st.text_area("Please type a masked sentence to fill", random_text)
64
- else:
65
- select_text = st.sidebar.selectbox("Select any of the following text", texts)
66
- masked_text = st.text_area("Please type a masked sentence to fill", select_text)
67
-
68
- # pd.set_option('max_colwidth',30)
69
- if st.button("Fill the Mask!"):
70
- with st.spinner("Filling the Mask..."):
71
-
72
- for selected_model in models:
73
-
74
- filled_sentence, filled_token = load_model(masked_text, cfg["models"][selected_model])
75
- model_names.append(selected_model)
76
- filled_tokens.append(filled_token)
77
- filled_masked_texts.append(filled_sentence)
78
-
79
- results_df["Model Name"] = model_names
80
- results_df["Filled Token"] = filled_tokens
81
- results_df["Filled Text"] = filled_masked_texts
82
-
83
- st.table(results_df)
84
 
85
 
86
  if __name__ == "__main__":
 
 
 
 
 
1
  import streamlit as st
2
+ from multiapp import MultiApp
3
+ from apps import about, credits, inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  def main():
7
+ app = MultiApp()
8
+ app.add_app("Inference", inference.app)
9
+ app.add_app("About", about.app)
10
+ app.add_app("Credits", credits.app)
11
+ app.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
14
  if __name__ == "__main__":
apps/__pycache__/about.cpython-38.pyc ADDED
Binary file (1.64 kB). View file
 
apps/__pycache__/credits.cpython-38.pyc ADDED
Binary file (311 Bytes). View file
 
apps/__pycache__/inference.cpython-38.pyc ADDED
Binary file (1.72 kB). View file
 
apps/about.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def app():
    """Render the "About" page.

    Writes a centred page heading, a short introduction to the RoBERTa-hindi
    project and the list of datasets used for pretraining. The function takes
    no arguments and returns nothing; it only emits Streamlit elements.
    """
    # An HTML heading is used so that the title can be centred via inline CSS.
    st.markdown("<h1 style='text-align: center;'>About</h1>", unsafe_allow_html=True)

    st.markdown("""## Introduction""")
    st.markdown(
        """**RoBERTa-hindi** is one of the many projects in the Flax/JAX community week organized by HuggingFace in collaboration with Google to make compute-intensive projects more practicable."""
    )
    # The model is a Hindi model; the previous text said "English data", which
    # was carried over from the English RoBERTa description.
    st.markdown(
        """It is a monolingual transformers model pretrained on a large corpus of Hindi data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts."""
    )

    st.markdown("""## Datasets used""")
    st.markdown(
        """RoBERTa-Hindi has been pretrained on a huge corpus consisting of multiple datasets. The entire list of datasets used is mentioned below : """
    )
    st.markdown(
        """
        1. OSCAR
        2. mC4
        3. Indic-glue
        4. Hindi-wikipedia-articles-172k
        5. Hindi-text-short-summarization corpus
        6. Hindi-text-short-and-large-summarization corpus
        7. Oldnewspaperhindi
        8. Samanantar
        """
    )

    st.markdown(
        """
        ***NOTE: Some of the datasets are readily available on the HuggingFace Datasets while the team developed the rest as per the docs.***
        """
    )
37
+
apps/credits.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def app():
    """Render the "Credits" page, which currently consists of a title only."""
    page_title = ' Credits'
    st.title(page_title)
apps/inference.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pandas.io.formats.format import return_docstring
2
+ import streamlit as st
3
+ import pandas as pd
4
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
5
+ from transformers import pipeline
6
+ import os
7
+ import json
8
+ import random
9
+
10
+ with open("config.json") as f:
11
+ cfg = json.loads(f.read())
12
+
13
+
14
@st.cache(allow_output_mutation=True, show_spinner=False)
def _load_pipeline(model_name):
    """Build the fill-mask pipeline for ``model_name``, cached per model.

    Kept separate from ``load_model`` so that the expensive
    ``from_pretrained`` calls are keyed on the model name only; a new input
    sentence must not trigger a fresh model/tokenizer instantiation.
    ``allow_output_mutation=True`` stops Streamlit from hashing the returned
    pipeline object on every access.
    """
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("fill-mask", model=model, tokenizer=tokenizer)


@st.cache(show_spinner=False, persist=True)
def load_model(masked_text, model_name):
    """Fill the mask in ``masked_text`` using the model ``model_name``.

    Args:
        masked_text: Sentence containing the generic ``<mask>`` placeholder.
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(sequence, token_str)`` tuple taken from the first candidate the
        fill-mask pipeline returns: the completed sentence and the token that
        was inserted.
    """
    nlp = _load_pipeline(model_name)

    # Each tokenizer defines its own mask token, so the generic placeholder
    # is rewritten to whatever this model expects.
    masked_text = masked_text.replace("<mask>", nlp.tokenizer.mask_token)
    result_sentence = nlp(masked_text)

    # NOTE(review): indexing assumes exactly one mask in the text; with
    # several masks the pipeline output is presumably nested differently.
    return result_sentence[0]["sequence"], result_sentence[0]["token_str"]
27
+
28
+
29
def app():
    """Render the inference page of the demo.

    The page lets the user choose one or more models from ``cfg["models"]``,
    pick or type a sentence containing a mask, and, once "Fill the Mask!" is
    pressed, shows one table row per selected model with the predicted token
    and the completed sentence. Takes no arguments and returns nothing.
    """
    st.markdown(
        "<h1 style='text-align: center; color: green;'>RoBERTa Hindi</h1>",
        unsafe_allow_html=True,
    )
    st.markdown(
        "This demo uses multiple hindi transformer models for Masked Language Modelling (MLM)."
    )

    models_list = list(cfg["models"].keys())
    models = st.multiselect("Choose models", models_list, models_list[0])

    target_text_path = "./mlm_custom/mlm_targeted_text.csv"
    texts = pd.read_csv(target_text_path)["text"]

    st.sidebar.title("Hindi MLM")
    pick_random = st.sidebar.checkbox("Pick any random text")

    # Streamlit re-executes this function on every interaction, including the
    # click on "Fill the Mask!". Drawing a fresh random sentence on each run
    # would replace the sentence (and any edits) right when the user submits
    # it, so the drawn position is pinned in session state while the checkbox
    # stays ticked.
    random_key = "inference_random_text_index"
    if pick_random:
        if random_key not in st.session_state:
            st.session_state[random_key] = random.randrange(texts.shape[0])
        default_text = texts.iloc[st.session_state[random_key]]
    else:
        # Forget the previous draw so re-ticking the box picks a new sentence.
        if random_key in st.session_state:
            del st.session_state[random_key]
        default_text = st.sidebar.selectbox("Select any of the following text", texts)

    masked_text = st.text_area("Please type a masked sentence to fill", default_text)

    if st.button("Fill the Mask!"):
        if not models:
            st.warning("Please choose at least one model.")
            return

        rows = []
        with st.spinner("Filling the Mask..."):
            for selected_model in models:
                filled_sentence, filled_token = load_model(
                    masked_text, cfg["models"][selected_model]
                )
                rows.append((selected_model, filled_token, filled_sentence))

        results_df = pd.DataFrame(
            rows, columns=["Model Name", "Filled Token", "Filled Text"]
        )
        st.table(results_df)
multiapp.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Frameworks for running multiple Streamlit applications as a single app.
2
+ """
3
+ import streamlit as st
4
+
5
+
6
class MultiApp:
    """Combine several Streamlit page functions into one app.

    Pages are registered with ``add_app`` and ``run`` renders a sidebar
    selector plus the page that is currently selected.
    """

    def __init__(self):
        # Registered pages, in display order; each entry is a dict with the
        # keys "title" and "function".
        self.apps = []

    def add_app(self, title, func):
        """Register a page.

        Args:
            title: Text shown for this page in the sidebar selector.
            func: Zero-argument callable that renders the page.
        """
        self.apps.append({"title": title, "function": func})

    def run(self):
        """Render the navigation selector and the selected page.

        Does nothing when no page has been registered: with an empty option
        list the selector yields no selection, and indexing that result
        would raise ``TypeError``.
        """
        if not self.apps:
            return

        st.sidebar.header("Navigation")
        selected = st.sidebar.selectbox(
            "", self.apps, format_func=lambda entry: entry["title"]
        )

        selected["function"]()
18
+