hassiahk committed on
Commit 666b7aa
1 Parent(s): 3f6b043

Model changes and code formatting

Files changed (5)
  1. .gitignore +131 -0
  2. app.py +52 -48
  3. config.json +8 -0
  4. mlm_custom/test_mlm.py +6 -5
  5. requirements.txt +1 -4
.gitignore ADDED
@@ -0,0 +1,131 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ .vscode/
app.py CHANGED
@@ -1,83 +1,87 @@
- from pandas.io.formats.format import return_docstring
- import streamlit as st
- import pandas as pd
- from transformers import AutoTokenizer,AutoModelForMaskedLM
- from transformers import pipeline
- import os
  import json
  import random
- import numpy as np
+
+ import pandas as pd
+ import streamlit as st
+ from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
+
+ with open("config.json") as f:
+ cfg = json.loads(f.read())


- @st.cache(show_spinner=False,persist=True)
- def load_model(masked_text,model_name):
+ @st.cache(show_spinner=False, persist=True)
+ def load_model(masked_text, model_name):

- model = AutoModelForMaskedLM.from_pretrained(model_name, from_flax=True)
+ model = AutoModelForMaskedLM.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
- nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer)
-
+ nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
  MASK_TOKEN = tokenizer.mask_token
-
- masked_text = masked_text.replace("<mask>",MASK_TOKEN)
+
+ masked_text = masked_text.replace("<mask>", MASK_TOKEN)
  result_sentence = nlp(masked_text)

- return result_sentence[0]['sequence'], result_sentence[0]['token_str']
+ return result_sentence[0]["sequence"], result_sentence[0]["token_str"]
+

  def main():

  st.title("RoBERTa Hindi")
  st.markdown(
- "This demo uses pretrained RoBERTa variants for Mask Language Modeling (MLM)"
+ "This demo uses the below pretrained BERT variants for Mask Language Modeling (MLM):\n"
+ "- [RoBERTa Hindi](https://huggingface.co/flax-community/roberta-hindi)\n"
+ "- [Indic Transformers Hindi](https://huggingface.co/neuralspace-reverie/indic-transformers-hi-bert)\n"
+ "- [HindiBERTa](https://huggingface.co/mrm8488/HindiBERTa)\n"
+ "- [RoBERTa Hindi Guj San](https://huggingface.co/surajp/RoBERTa-hindi-guj-san)"
  )

+ models_list = list(cfg["models"].keys())
+
  models = st.multiselect(
- "Choose models",
- ['flax-community/roberta-hindi','mrm8488/HindiBERTa',\
- 'neuralspace-reverie/indic-transformers-hi-bert',
- 'surajp/RoBERTa-hindi-guj-san'],
- ["flax-community/roberta-hindi"]
- )
-
- target_text_path = './mlm_custom/mlm_targeted_text.csv'
+ "Choose models",
+ models_list,
+ models_list[0],
+ )
+
+ target_text_path = "./mlm_custom/mlm_targeted_text.csv"
  target_text_df = pd.read_csv(target_text_path)
-
- texts = target_text_df['text']
-
+
+ texts = target_text_df["text"]
+
  st.sidebar.title("Hindi MLM")
-
+
  pick_random = st.sidebar.checkbox("Pick any random text")
-
- results_df = pd.DataFrame(columns = ['Model Name','Filled Token','Filled Text'])
-
+
+ results_df = pd.DataFrame(columns=["Model Name", "Filled Token", "Filled Text"])
+
  model_names = []
  filled_masked_texts = []
  filled_tokens = []
-
+
  if pick_random:
- random_text = texts[random.randint(0,texts.shape[0]-1)]
- masked_text = st.text_area("Please type a masked sentence to fill",random_text)
+ random_text = texts[random.randint(0, texts.shape[0] - 1)]
+ masked_text = st.text_area("Please type a masked sentence to fill", random_text)
  else:
- select_text = st.sidebar.selectbox('Select any of the following text',\
- texts)
- masked_text = st.text_area("Please type a masked sentence to fill",select_text)
-
- #pd.set_option('max_colwidth',30)
- if st.button('Fill the Mask!'):
+ select_text = st.sidebar.selectbox("Select any of the following text", texts)
+ masked_text = st.text_area("Please type a masked sentence to fill", select_text)
+
+ # pd.set_option('max_colwidth',30)
+ if st.button("Fill the Mask!"):
  with st.spinner("Filling the Mask..."):

  for selected_model in models:

- filled_sentence,filled_token = load_model(masked_text,selected_model)
+ filled_sentence, filled_token = load_model(masked_text, cfg["models"][selected_model])
  model_names.append(selected_model)
  filled_tokens.append(filled_token)
  filled_masked_texts.append(filled_sentence)

- results_df['Model Name'] = model_names
- results_df['Filled Token'] = filled_tokens
- results_df['Filled Text'] = filled_masked_texts
-
- #st.table(results_df)
- st.write(results_df)
+ results_df["Model Name"] = model_names
+ results_df["Filled Token"] = filled_tokens
+ results_df["Filled Text"] = filled_masked_texts
+
+ st.table(results_df)
+

  if __name__ == "__main__":
- main()
+ main()
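
For reference, a minimal sketch of the mask-filling flow that the updated load_model() performs, assuming the transformers fill-mask pipeline and a checkpoint that defines a mask token; the example sentence is hypothetical (the app reads its texts from mlm_targeted_text.csv):

# Sketch of the updated load_model() flow (assumption: the checkpoint ships
# standard weights and a mask token; the example sentence is made up).
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

model_id = "flax-community/roberta-hindi"  # any checkpoint listed in config.json
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)

masked_text = "यह एक <mask> वाक्य है"  # hypothetical input sentence
masked_text = masked_text.replace("<mask>", tokenizer.mask_token)  # normalize the placeholder
top = nlp(masked_text)[0]  # best prediction: dict with "sequence", "token_str", "score"
print(top["sequence"], top["token_str"])
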
config.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "models": {
+ "RoBERTa Hindi": "flax-community/roberta-hindi",
+ "Indic Transformers Hindi": "neuralspace-reverie/indic-transformers-hi-bert",
+ "HindiBERTa": "mrm8488/HindiBERTa",
+ "RoBERTa Hindi Guj San": "surajp/RoBERTa-hindi-guj-san"
+ }
+ }
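
A short sketch of how app.py consumes this mapping, mirroring the diff above: the keys become the display names in the Streamlit multiselect, and the values are the Hugging Face checkpoint IDs passed to load_model().

# Sketch (assumes config.json sits next to app.py, as in this commit).
import json

with open("config.json") as f:
    cfg = json.load(f)

display_names = list(cfg["models"].keys())   # options shown in st.multiselect
checkpoint = cfg["models"]["RoBERTa Hindi"]  # -> "flax-community/roberta-hindi"
print(display_names)
print(checkpoint)
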
mlm_custom/test_mlm.py CHANGED
@@ -1,9 +1,10 @@
- import pandas as pd
- import numpy as np
- from transformers import AutoTokenizer, RobertaModel, AutoModel, AutoModelForMaskedLM
- from transformers import pipeline
- import os
  import json
+ import os
+
+ import numpy as np
+ import pandas as pd
+ from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer,
+ RobertaModel, pipeline)


  class MLMTest():
requirements.txt CHANGED
@@ -1,6 +1,3 @@
  streamlit
  torch
- transformers
- jax
- jaxlib
- flax
+ transformers
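
The trimmed dependencies line up with the app.py change above: with from_flax=True dropped, the models are presumably loaded from standard PyTorch weights, so only streamlit, torch, and transformers are needed at inference time. A minimal check under that assumption:

# Assumption: each listed checkpoint provides PyTorch weights, so jax/jaxlib/flax
# are no longer required to load it.
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("flax-community/roberta-hindi")
print(type(model).__name__)  # a torch.nn.Module subclass, e.g. RobertaForMaskedLM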