Spaces:
Runtime error
Runtime error
prateekagrawal
committed on
Commit
•
2434dff
1
Parent(s):
666b7aa
Changed Layout
Browse files
- .gitignore +0 -131
- app.py +7 -79
- apps/__pycache__/about.cpython-38.pyc +0 -0
- apps/__pycache__/credits.cpython-38.pyc +0 -0
- apps/__pycache__/inference.cpython-38.pyc +0 -0
- apps/about.py +37 -0
- apps/credits.py +4 -0
- apps/inference.py +81 -0
- multiapp.py +18 -0
.gitignore
DELETED
@@ -1,131 +0,0 @@
|
|
1 |
-
# Byte-compiled / optimized / DLL files
|
2 |
-
__pycache__/
|
3 |
-
*.py[cod]
|
4 |
-
*$py.class
|
5 |
-
|
6 |
-
# C extensions
|
7 |
-
*.so
|
8 |
-
|
9 |
-
# Distribution / packaging
|
10 |
-
.Python
|
11 |
-
build/
|
12 |
-
develop-eggs/
|
13 |
-
dist/
|
14 |
-
downloads/
|
15 |
-
eggs/
|
16 |
-
.eggs/
|
17 |
-
lib/
|
18 |
-
lib64/
|
19 |
-
parts/
|
20 |
-
sdist/
|
21 |
-
var/
|
22 |
-
wheels/
|
23 |
-
pip-wheel-metadata/
|
24 |
-
share/python-wheels/
|
25 |
-
*.egg-info/
|
26 |
-
.installed.cfg
|
27 |
-
*.egg
|
28 |
-
MANIFEST
|
29 |
-
|
30 |
-
# PyInstaller
|
31 |
-
# Usually these files are written by a python script from a template
|
32 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
-
*.manifest
|
34 |
-
*.spec
|
35 |
-
|
36 |
-
# Installer logs
|
37 |
-
pip-log.txt
|
38 |
-
pip-delete-this-directory.txt
|
39 |
-
|
40 |
-
# Unit test / coverage reports
|
41 |
-
htmlcov/
|
42 |
-
.tox/
|
43 |
-
.nox/
|
44 |
-
.coverage
|
45 |
-
.coverage.*
|
46 |
-
.cache
|
47 |
-
nosetests.xml
|
48 |
-
coverage.xml
|
49 |
-
*.cover
|
50 |
-
*.py,cover
|
51 |
-
.hypothesis/
|
52 |
-
.pytest_cache/
|
53 |
-
|
54 |
-
# Translations
|
55 |
-
*.mo
|
56 |
-
*.pot
|
57 |
-
|
58 |
-
# Django stuff:
|
59 |
-
*.log
|
60 |
-
local_settings.py
|
61 |
-
db.sqlite3
|
62 |
-
db.sqlite3-journal
|
63 |
-
|
64 |
-
# Flask stuff:
|
65 |
-
instance/
|
66 |
-
.webassets-cache
|
67 |
-
|
68 |
-
# Scrapy stuff:
|
69 |
-
.scrapy
|
70 |
-
|
71 |
-
# Sphinx documentation
|
72 |
-
docs/_build/
|
73 |
-
|
74 |
-
# PyBuilder
|
75 |
-
target/
|
76 |
-
|
77 |
-
# Jupyter Notebook
|
78 |
-
.ipynb_checkpoints
|
79 |
-
|
80 |
-
# IPython
|
81 |
-
profile_default/
|
82 |
-
ipython_config.py
|
83 |
-
|
84 |
-
# pyenv
|
85 |
-
.python-version
|
86 |
-
|
87 |
-
# pipenv
|
88 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
89 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
90 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
91 |
-
# install all needed dependencies.
|
92 |
-
#Pipfile.lock
|
93 |
-
|
94 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
95 |
-
__pypackages__/
|
96 |
-
|
97 |
-
# Celery stuff
|
98 |
-
celerybeat-schedule
|
99 |
-
celerybeat.pid
|
100 |
-
|
101 |
-
# SageMath parsed files
|
102 |
-
*.sage.py
|
103 |
-
|
104 |
-
# Environments
|
105 |
-
.env
|
106 |
-
.venv
|
107 |
-
env/
|
108 |
-
venv/
|
109 |
-
ENV/
|
110 |
-
env.bak/
|
111 |
-
venv.bak/
|
112 |
-
|
113 |
-
# Spyder project settings
|
114 |
-
.spyderproject
|
115 |
-
.spyproject
|
116 |
-
|
117 |
-
# Rope project settings
|
118 |
-
.ropeproject
|
119 |
-
|
120 |
-
# mkdocs documentation
|
121 |
-
/site
|
122 |
-
|
123 |
-
# mypy
|
124 |
-
.mypy_cache/
|
125 |
-
.dmypy.json
|
126 |
-
dmypy.json
|
127 |
-
|
128 |
-
# Pyre type checker
|
129 |
-
.pyre/
|
130 |
-
|
131 |
-
.vscode/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -1,86 +1,14 @@
|
|
1 |
-
import json
|
2 |
-
import random
|
3 |
-
|
4 |
-
import pandas as pd
|
5 |
import streamlit as st
|
6 |
-
from
|
7 |
-
|
8 |
-
with open("config.json") as f:
|
9 |
-
cfg = json.loads(f.read())
|
10 |
-
|
11 |
-
|
12 |
-
@st.cache(show_spinner=False, persist=True)
|
13 |
-
def load_model(masked_text, model_name):
|
14 |
-
|
15 |
-
model = AutoModelForMaskedLM.from_pretrained(model_name)
|
16 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
17 |
-
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
18 |
-
|
19 |
-
MASK_TOKEN = tokenizer.mask_token
|
20 |
-
|
21 |
-
masked_text = masked_text.replace("<mask>", MASK_TOKEN)
|
22 |
-
result_sentence = nlp(masked_text)
|
23 |
-
|
24 |
-
return result_sentence[0]["sequence"], result_sentence[0]["token_str"]
|
25 |
|
26 |
|
27 |
def main():
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
"- [Indic Transformers Hindi](https://huggingface.co/neuralspace-reverie/indic-transformers-hi-bert)\n"
|
34 |
-
"- [HindiBERTa](https://huggingface.co/mrm8488/HindiBERTa)\n"
|
35 |
-
"- [RoBERTa Hindi Guj San](https://huggingface.co/surajp/RoBERTa-hindi-guj-san)"
|
36 |
-
)
|
37 |
-
|
38 |
-
models_list = list(cfg["models"].keys())
|
39 |
-
|
40 |
-
models = st.multiselect(
|
41 |
-
"Choose models",
|
42 |
-
models_list,
|
43 |
-
models_list[0],
|
44 |
-
)
|
45 |
-
|
46 |
-
target_text_path = "./mlm_custom/mlm_targeted_text.csv"
|
47 |
-
target_text_df = pd.read_csv(target_text_path)
|
48 |
-
|
49 |
-
texts = target_text_df["text"]
|
50 |
-
|
51 |
-
st.sidebar.title("Hindi MLM")
|
52 |
-
|
53 |
-
pick_random = st.sidebar.checkbox("Pick any random text")
|
54 |
-
|
55 |
-
results_df = pd.DataFrame(columns=["Model Name", "Filled Token", "Filled Text"])
|
56 |
-
|
57 |
-
model_names = []
|
58 |
-
filled_masked_texts = []
|
59 |
-
filled_tokens = []
|
60 |
-
|
61 |
-
if pick_random:
|
62 |
-
random_text = texts[random.randint(0, texts.shape[0] - 1)]
|
63 |
-
masked_text = st.text_area("Please type a masked sentence to fill", random_text)
|
64 |
-
else:
|
65 |
-
select_text = st.sidebar.selectbox("Select any of the following text", texts)
|
66 |
-
masked_text = st.text_area("Please type a masked sentence to fill", select_text)
|
67 |
-
|
68 |
-
# pd.set_option('max_colwidth',30)
|
69 |
-
if st.button("Fill the Mask!"):
|
70 |
-
with st.spinner("Filling the Mask..."):
|
71 |
-
|
72 |
-
for selected_model in models:
|
73 |
-
|
74 |
-
filled_sentence, filled_token = load_model(masked_text, cfg["models"][selected_model])
|
75 |
-
model_names.append(selected_model)
|
76 |
-
filled_tokens.append(filled_token)
|
77 |
-
filled_masked_texts.append(filled_sentence)
|
78 |
-
|
79 |
-
results_df["Model Name"] = model_names
|
80 |
-
results_df["Filled Token"] = filled_tokens
|
81 |
-
results_df["Filled Text"] = filled_masked_texts
|
82 |
-
|
83 |
-
st.table(results_df)
|
84 |
|
85 |
|
86 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from multiapp import MultiApp
|
3 |
+
from apps import about, credits, inference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
def main():
|
7 |
+
app = MultiApp()
|
8 |
+
app.add_app("Inference", inference.app)
|
9 |
+
app.add_app("About", about.app)
|
10 |
+
app.add_app("Credits", credits.app)
|
11 |
+
app.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
if __name__ == "__main__":
|
apps/__pycache__/about.cpython-38.pyc
ADDED
Binary file (1.64 kB). View file
|
|
apps/__pycache__/credits.cpython-38.pyc
ADDED
Binary file (311 Bytes). View file
|
|
apps/__pycache__/inference.cpython-38.pyc
ADDED
Binary file (1.72 kB). View file
|
|
apps/about.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def app():
|
5 |
+
# st.title("About")
|
6 |
+
st.markdown("<h1 style='text-align: center;'>About</h1>", unsafe_allow_html=True)
|
7 |
+
st.markdown("""## Introduction""")
|
8 |
+
st.markdown(
|
9 |
+
"""**RoBERTa-hindi** is one of the many projects in the Flax/JAX community week organized by HuggingFace in collaboration with Google to make compute-intensive projects more practicable."""
|
10 |
+
)
|
11 |
+
st.markdown(
|
12 |
+
"""It is a monolingual transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts."""
|
13 |
+
)
|
14 |
+
|
15 |
+
st.markdown("""## Datasets used""")
|
16 |
+
st.markdown(
|
17 |
+
"""RoBERTa-Hindi has been pretrained on a huge corpus consisting of multiple datasets. The entire list of datasets used is mentioned below : """
|
18 |
+
)
|
19 |
+
st.markdown(
|
20 |
+
"""
|
21 |
+
1. OSCAR
|
22 |
+
2. mC4
|
23 |
+
3. Indic-glue
|
24 |
+
4. Hindi-wikipedia-articles-172k
|
25 |
+
5. Hindi-text-short-summarization corpus
|
26 |
+
6. Hindi-text-short-and-large-summarization corpus
|
27 |
+
7. Oldnewspaperhindi
|
28 |
+
8. Samanantar
|
29 |
+
"""
|
30 |
+
)
|
31 |
+
|
32 |
+
st.markdown(
|
33 |
+
"""
|
34 |
+
***NOTE: Some of the datasets are readily available on the HuggingFace Datasets while the team developed the rest as per the docs.***
|
35 |
+
"""
|
36 |
+
)
|
37 |
+
|
apps/credits.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def app():
|
4 |
+
st.title(' Credits')
|
apps/inference.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pandas.io.formats.format import return_docstring
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
5 |
+
from transformers import pipeline
|
6 |
+
import os
|
7 |
+
import json
|
8 |
+
import random
|
9 |
+
|
10 |
+
with open("config.json") as f:
|
11 |
+
cfg = json.loads(f.read())
|
12 |
+
|
13 |
+
|
14 |
+
@st.cache(show_spinner=False, persist=True)
|
15 |
+
def load_model(masked_text, model_name):
|
16 |
+
|
17 |
+
model = AutoModelForMaskedLM.from_pretrained(model_name)
|
18 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
19 |
+
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
20 |
+
|
21 |
+
MASK_TOKEN = tokenizer.mask_token
|
22 |
+
|
23 |
+
masked_text = masked_text.replace("<mask>", MASK_TOKEN)
|
24 |
+
result_sentence = nlp(masked_text)
|
25 |
+
|
26 |
+
return result_sentence[0]["sequence"], result_sentence[0]["token_str"]
|
27 |
+
|
28 |
+
|
29 |
+
def app():
|
30 |
+
st.markdown(
|
31 |
+
"<h1 style='text-align: center; color: green;'>RoBERTa Hindi</h1>",
|
32 |
+
unsafe_allow_html=True,
|
33 |
+
)
|
34 |
+
st.markdown(
|
35 |
+
"This demo uses multiple hindi transformer models for Masked Language Modelling (MLM)."
|
36 |
+
)
|
37 |
+
|
38 |
+
models_list = list(cfg["models"].keys())
|
39 |
+
|
40 |
+
models = st.multiselect("Choose models", models_list, models_list[0],)
|
41 |
+
|
42 |
+
target_text_path = "./mlm_custom/mlm_targeted_text.csv"
|
43 |
+
target_text_df = pd.read_csv(target_text_path)
|
44 |
+
|
45 |
+
texts = target_text_df["text"]
|
46 |
+
|
47 |
+
st.sidebar.title("Hindi MLM")
|
48 |
+
|
49 |
+
pick_random = st.sidebar.checkbox("Pick any random text")
|
50 |
+
|
51 |
+
results_df = pd.DataFrame(columns=["Model Name", "Filled Token", "Filled Text"])
|
52 |
+
|
53 |
+
model_names = []
|
54 |
+
filled_masked_texts = []
|
55 |
+
filled_tokens = []
|
56 |
+
|
57 |
+
if pick_random:
|
58 |
+
random_text = texts[random.randint(0, texts.shape[0] - 1)]
|
59 |
+
masked_text = st.text_area("Please type a masked sentence to fill", random_text)
|
60 |
+
else:
|
61 |
+
select_text = st.sidebar.selectbox("Select any of the following text", texts)
|
62 |
+
masked_text = st.text_area("Please type a masked sentence to fill", select_text)
|
63 |
+
|
64 |
+
# pd.set_option('max_colwidth',30)
|
65 |
+
if st.button("Fill the Mask!"):
|
66 |
+
with st.spinner("Filling the Mask..."):
|
67 |
+
|
68 |
+
for selected_model in models:
|
69 |
+
|
70 |
+
filled_sentence, filled_token = load_model(
|
71 |
+
masked_text, cfg["models"][selected_model]
|
72 |
+
)
|
73 |
+
model_names.append(selected_model)
|
74 |
+
filled_tokens.append(filled_token)
|
75 |
+
filled_masked_texts.append(filled_sentence)
|
76 |
+
|
77 |
+
results_df["Model Name"] = model_names
|
78 |
+
results_df["Filled Token"] = filled_tokens
|
79 |
+
results_df["Filled Text"] = filled_masked_texts
|
80 |
+
|
81 |
+
st.table(results_df)
|
multiapp.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Frameworks for running multiple Streamlit applications as a single app.
|
2 |
+
"""
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
|
6 |
+
class MultiApp:
|
7 |
+
def __init__(self):
|
8 |
+
self.apps = []
|
9 |
+
|
10 |
+
def add_app(self, title, func):
|
11 |
+
self.apps.append({"title": title, "function": func})
|
12 |
+
|
13 |
+
def run(self):
|
14 |
+
st.sidebar.header("Navigation")
|
15 |
+
app = st.sidebar.selectbox("", self.apps, format_func=lambda app: app["title"])
|
16 |
+
|
17 |
+
app["function"]()
|
18 |
+
|