Spaces:

prateekagrawal
/

roberta-testing

Runtime error

App Files Files Community

prateekagrawal committed on Jul 16, 2021

Commit

6554f2c

•

1 Parent(s): b6bfb5a

Initial commit

Browse files

Files changed (13) hide show

app.py +21 -0
apps/__pycache__/about.cpython-38.pyc +0 -0
apps/__pycache__/credits.cpython-38.pyc +0 -0
apps/__pycache__/inference.cpython-38.pyc +0 -0
apps/about.py +51 -0
apps/credits.py +51 -0
apps/inference.py +52 -0
mlm_custom/mlm_full_text.csv +19 -0
mlm_custom/mlm_targeted_text.csv +18 -0
mlm_custom/mlm_test_config.csv +6 -0
mlm_custom/test_mlm.py +142 -0
multiapp.py +22 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from pandas.io.formats.format import return_docstring
+import streamlit as st
+import pandas as pd
+from transformers import AutoTokenizer,AutoModelForMaskedLM
+from transformers import pipeline
+import os
+import json
+from multiapp import MultiApp
+from apps import about,credits,inference
+def main():
+    """Build the multi-page Streamlit app and start it.
+
+    Pages are registered in the order Inference, About, Credits.
+    """
+    pages = (
+        ("Inference", inference.app),
+        ("About", about.app),
+        ("Credits", credits.app),
+    )
+    multi_app = MultiApp()
+    for page_title, page_func in pages:
+        multi_app.add_app(page_title, page_func)
+    multi_app.run()
+
+
+if __name__ == "__main__":
+    main()

apps/__pycache__/about.cpython-38.pyc ADDED Viewed

Binary file (2.95 kB). View file

apps/__pycache__/credits.cpython-38.pyc ADDED Viewed

Binary file (1.96 kB). View file

apps/__pycache__/inference.cpython-38.pyc ADDED Viewed

Binary file (1.68 kB). View file

apps/about.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# -*- coding: utf-8 -*-
+# @Author: prateek
+# @Date:   2021-03-02 02:23:36
+# @Last Modified by:   prateek
+# @Last Modified time: 2021-03-02 23:04:21
+import streamlit as st
+import numpy as np
+import pandas as pd
+from sklearn import datasets
+from PIL import Image
+def app():
+    """Render the About page: a title followed by three text sections."""
+    st.title('About')
+    overview = """
+    ## What is diabetes
+    According to the NIH, "Diabetes is a disease that occurs when your **blood glucose**,
+     also called blood sugar, is **too high**. Blood **glucose** is your main source of
+      energy and **comes from the food you eat**. **Insulin**, a hormone made from the pancreas,
+       **helps glucose** from food get into your cells to be used for energy. Sometimes your
+       body doesn’t make enough or any insulin or doesn’t use insulin well. Glucose then stays
+       in your blood and doesn’t reach your cells.
+    Over time, **having too much glucose in your blood** can cause health problems. """
+    health_impact = """
+        ### Health impact
+Over time, diabetes can damage the heart, blood vessels, eyes, kidneys, and nerves.
+* Adults with diabetes have a two- to three-fold increased risk of heart attacks and strokes(1).
+* Combined with reduced blood flow, neuropathy (nerve damage) in the feet increases the chance of foot ulcers, infection and eventual need for limb amputation.
+* Diabetic retinopathy is an important cause of blindness, and occurs as a result of long-term accumulated damage to the small blood vessels in the retina. Diabetes is the cause of 2.6% of global blindness(2).
+* Diabetes is among the leading causes of kidney failure(3).
+        """
+    key_facts = """
+        ### Key facts
+* The number of people with diabetes rose from 108 million in 1980 to 422 million in 2014.
+* The global prevalence of diabetes* among adults over 18 years of age rose from 4.7% in 1980 to 8.5% in 2014 (1).
+* Between 2000 and 2016, there was a 5% increase in premature mortality from diabetes.
+* Diabetes prevalence has been rising more rapidly in low- and middle-income countries than in high-income countries.
+* Diabetes is a major cause of blindness, kidney failure, heart attacks, stroke and lower limb amputation.
+* In 2016, an estimated 1.6 million deaths were directly caused by diabetes. Another 2.2 million deaths were attributable to high blood glucose in 2012.
+* Almost half of all deaths attributable to high blood glucose occur before the age of 70 years. WHO estimates that diabetes was the seventh leading cause of death in 2016.
+* A healthy diet, regular physical activity, maintaining a normal body weight and avoiding tobacco use are ways to prevent or delay the onset of type 2 diabetes.
+* Diabetes can be treated and its consequences avoided or delayed with diet, physical activity, medication and regular screening and treatment for complications.
+        """
+    # Each section is written with its own st.write call, in this order.
+    for section in (overview, health_impact, key_facts):
+        st.write(section)

apps/credits.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# -*- coding: utf-8 -*-
+# @Author: prateek
+# @Date:   2021-03-02 22:37:41
+# @Last Modified by:   prateek
+# @Last Modified time: 2021-03-02 23:38:33
+import streamlit as st
+def app():
+    """Render the Credits page: title, maintainer note, data description and reference lists."""
+    st.title(' Credits')
+    maintainer_note = """The following web application is built and maintained by **Prateek Agrawal** for the sole purpose of learning and displaying the power and usage of machine learning in the field of healthcare. He believes that the Artificial Intelligence and Machine Learning can truly help in making the world a better place to live in."""
+    data_section = """
+    ## Data
+    The datasets consist of several medical predictor (independent) variables and one target (dependent)
+    variable, Outcome. Independent variables include the number of pregnancies the patient has had,
+    their BMI, insulin level, age, and so on.
+    [link of data in kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database)"""
+    columns_and_links = """
+    ## Columns
+    |Columns|Description|
+    |-------|------------|
+    |Pregnancies|Number of times pregnant|
+    |Glucose|Plasma glucose concentration for 2 hours in an oral glucose tolerance test|
+    |BloodPressure|Diastolic blood pressure (mm Hg)|
+    |SkinThickness|Triceps skin fold thickness (mm)|
+    |Insulin|2-Hour serum insulin (mu U/ml)|
+    |BMI|Body mass index (weight in kg/(height in m)^2)|
+    |DiabetesPedigreeFunction|Diabetes pedigree function|
+    |Age|Age (years)|
+    |Outcome|Class variable (0 or 1) 268 of 768 are 1, the others are 0|
+    ## Information
+    ### WHO Website
+    * https://www.who.int/health-topics/diabetes#tab=tab_1
+    * https://www.who.int/news-room/fact-sheets/detail/diabetes
+    ### Machine Learning related info
+    * https://www.kaggle.com/uciml/pima-indians-diabetes-database/code
+    * https://towardsdatascience.com/streamlit-101-an-in-depth-introduction-fc8aad9492f2
+    """
+    # Each block is written with its own st.write call, in this order.
+    for block in (maintainer_note, data_section, columns_and_links):
+        st.write(block)

apps/inference.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from pandas.io.formats.format import return_docstring
+import streamlit as st
+import pandas as pd
+from transformers import AutoTokenizer,AutoModelForMaskedLM
+from transformers import pipeline
+import os
+import json
+# Checkpoints flagged from_flax=TRUE in mlm_custom/mlm_test_config.csv. Every other
+# model offered by the UI is flagged FALSE there and is loaded directly.
+_FLAX_MODELS = frozenset({'flax-community/roberta-hindi'})
+
+
+@st.cache(show_spinner=False,persist=True)
+def load_model(masked_text,model_name,from_flax=None):
+    """Run the fill-mask pipeline of ``model_name`` on ``masked_text``.
+
+    Args:
+        masked_text: Sentence containing the ``<mask>`` placeholder.
+        model_name: Hub identifier of the masked-language model.
+        from_flax: Whether the checkpoint must be converted from Flax weights.
+            ``None`` (default) decides by membership in ``_FLAX_MODELS``.
+
+    Returns:
+        Whatever the ``fill-mask`` pipeline returns for the sentence.
+    """
+    if from_flax is None:
+        from_flax = model_name in _FLAX_MODELS
+    if from_flax:
+        # Convert the Flax checkpoint and export it so the pipeline can load it from disk.
+        model = AutoModelForMaskedLM.from_pretrained(model_name, from_flax=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        tokenizer.save_pretrained('exported_pytorch_model')
+        model.save_pretrained('exported_pytorch_model')
+        nlp = pipeline('fill-mask', model="exported_pytorch_model")
+    else:
+        # Same loading path that mlm_custom/test_mlm.py uses for non-Flax models.
+        nlp = pipeline('fill-mask', model=model_name)
+    # As in test_mlm.py: swap the "<mask>" placeholder for the tokenizer's own
+    # mask token, which may differ between models.
+    prepared_text = masked_text.replace("<mask>", nlp.tokenizer.mask_token)
+    result_sentence = nlp(prepared_text)
+    return result_sentence
+def app():
+    """Render the inference page.
+
+    Shows the sample sentences from ``./mlm_custom/mlm_targeted_text.csv`` (path is
+    relative to the working directory), lets the user pick one sentence and a
+    model, and writes the result of ``load_model`` when the button is pressed.
+    """
+    st.markdown("<h1 style='text-align: center; color: green;'>RoBERTa Hindi</h1>", unsafe_allow_html=True)
+    st.markdown(
+    "This demo uses pretrained RoBERTa variants for Mask Language Modelling (MLM)"
+    )
+    target_text_path = './mlm_custom/mlm_targeted_text.csv'
+    target_text_df = pd.read_csv(target_text_path)
+    texts = target_text_df['text']
+    st.markdown("""## Select any of the following text : """)
+    masked_text = st.selectbox('',
+     texts)
+    st.write('You selected:', masked_text)
+    models = st.multiselect(
+        "Choose models",
+        ['flax-community/roberta-hindi','mrm8488/HindiBERTa','ai4bharat/indic-bert',
+         'neuralspace-reverie/indic-transformers-hi-bert',
+         'surajp/RoBERTa-hindi-guj-san'],
+        ["flax-community/roberta-hindi"]
+    )
+    # The multiselect can be emptied by the user; indexing an empty list would
+    # raise IndexError on every rerun, so ask for a selection instead.
+    if not models:
+        st.warning('Please choose at least one model.')
+        return
+    # Only the first selected model is queried.
+    selected_model = models[0]
+    if st.button('Fill the Mask!'):
+        with st.spinner("Filling the Mask..."):
+            filled_sentence = load_model(masked_text,selected_model)
+            st.write(filled_sentence)

mlm_custom/mlm_full_text.csv ADDED Viewed

	@@ -0,0 +1,19 @@

+user_id,text
+dk-crazydiv,हम आपके सुखद यात्रा की कामना करते हैं
+dk-crazydiv,मुझे उनसे बात करना बहुत अच्छा लगा
+dk-crazydiv,"बार बार देखो, हज़ार बार देखो, ये देखने की चीज़ है"
+dk-crazydiv,ट्रंप कल अहमदाबाद में प्रधानमंत्री मोदी से मुलाकात करने जा रहे हैं
+dk-crazydiv,बॉम्बे से बैंगलोर की दूरी 500 किलोमीटर है
+amankhandelia,मधु घट फूटा ही करते हैं लघु जीवन लेकर आए हैं प्याले टूटा ही करते हैं
+amankhandelia,वर्त्तमान के मोहजाल में आने वाला कल न भुलाएं
+amankhandelia,भारत में हुए अन्यायों के गवाह है मुंशी प्रेमचंद के उपन्यास
+amankhandelia,"एक लेखक अपनी कलम तभी उठाता है, जब उसकी संवेदनाओं पर किसी ने चोट की हो"
+amankhandelia,"मेरा कुछ कहना तब उचित है, जब मुझे सुनना तुम्हारी प्राथमिकता हो, औपचारिकता नहीं"
+amankhandelia,मरना लगा रहेगा यहाँ जी तो लीजिए
+amankhandelia,बहुत कम लोग जानते हैं कि वो बहुत कम जानते हैं
+hassiahk,"जल्दी सोना और जल्दी उठना इंसान को स्वस्थ ,समृद्ध और बुद्धिमान बनाता है"
+hassiahk,बात ये है कि आप इसे पहले से ही जानते हैं
+hassiahk,"रोज एक सेब खाओ, डॉक्टर से दूर रहो"
+hassiahk,किसी पुस्तक को उसके आवरण से मत आंकिए
+hassiahk,जहा चाह वहा राह
+hassiahk,सभी अच्छी चीजों का एक अंत होता है

mlm_custom/mlm_targeted_text.csv ADDED Viewed

	@@ -0,0 +1,18 @@

+user_id,text,output,multi
+dk-crazydiv,हम आपके <mask> यात्रा की कामना करते हैं,सुखद,
+dk-crazydiv,मुझे उनसे बात करना बहुत <mask> लगा,अच्छा,
+dk-crazydiv,"बार बार देखो, हज़ार बार देखो, ये देखने की <mask> है","[""चीज़"",""बात""]",TRUE
+dk-crazydiv,ट्रंप कल अहमदाबाद में प्रधानमंत्री मोदी से <mask> करने जा रहे हैं,"[""मुलाकात"",""मिल्ने""]",TRUE
+dk-crazydiv,बॉम्बे से बैंगलोर की <mask> 500 किलोमीटर है,दूरी,
+dk-crazydiv,कहने को साथ अपने ये <mask> चलती है,दुनिया,
+dk-crazydiv,"ये इश्क़ नहीं आसान बस इतना समझ लीजिये, एक आग का दरिया है और <mask> के जाना है",डूब,
+prateekagrawal,आपका दिन <mask> हो,"[""शुभ"",""अच्छा""]",TRUE
+prateekagrawal,हिंदी भारत में <mask> जाने वाली भाषाओं में से एक है,"[""बोली"",""सिखाई"",""आधिकारिक""]",TRUE
+prateekagrawal,शुभ <mask>,"[""प्रभात"",""रात्रि"",""यात्रा"",""अवसर""]",TRUE
+prateekagrawal,इंसान को कभी बुरा नहीं <mask> चाहिए,"[""बोलना"",""देखना"",""सुनाना"",""करना""]",TRUE
+hassiahk,बात ये है कि आप इसे <mask> से ही जानते हैं,पहले,
+hassiahk,<mask> पूर्व में उगता है,सूरज,
+hassiahk,"जल्दी सोना और जल्दी उठना इंसान को स्वस्थ ,समृद्ध और बुद्धिमान <mask> है",बनाता,
+hassiahk,"रोज एक सेब खाओ, <mask> से दूर रहो",डॉक्टर,
+hassiahk,किसी पुस्तक को उसके <mask> से मत आंकिए,आवरण,
+hassiahk,सभी <mask> चीजों का एक अंत होता है,"[""अच्छी"", ""बुरी""]",TRUE

mlm_custom/mlm_test_config.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+model_name,display_name,revision,from_flax,use_fast,add_prefix_space
+flax-community/roberta-hindi,,,TRUE,TRUE,TRUE
+mrm8488/HindiBERTa,,,FALSE,TRUE,TRUE
+ai4bharat/indic-bert,,,FALSE,FALSE,FALSE
+neuralspace-reverie/indic-transformers-hi-bert,,,FALSE,TRUE,TRUE
+surajp/RoBERTa-hindi-guj-san,,,FALSE,TRUE,TRUE

mlm_custom/test_mlm.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import pandas as pd
+import numpy as np
+from transformers import AutoTokenizer, RobertaModel, AutoModel, AutoModelForMaskedLM
+from transformers import pipeline
+import os
+import json
+class MLMTest():
+    """Score fill-mask models against the CSV fixtures stored next to this module.
+
+    ``config_df`` lists the models to evaluate, ``full_text_df`` holds complete
+    sentences (every word is masked in turn) and ``targeted_text_df`` holds
+    sentences with one ``<mask>`` placeholder plus the expected answer(s).
+    """
+
+    def __init__(self, config_file="mlm_test_config.csv", full_text_file="mlm_full_text.csv", targeted_text_file="mlm_targeted_text.csv"):
+        """Load the model configuration and both text fixtures.
+
+        All three file names are resolved relative to this module's directory.
+        """
+        base_dir = os.path.dirname(os.path.realpath(__file__))
+        # Empty cells become "" so optional columns can be tested for truthiness.
+        self.config_df = pd.read_csv(os.path.join(base_dir, config_file)).fillna("")
+        self.full_text_df = pd.read_csv(os.path.join(base_dir, full_text_file))
+        self.targeted_text_df = pd.read_csv(os.path.join(base_dir, targeted_text_file))
+        self.full_text_results = []
+        self.targeted_text_results = []
+
+    def _load_pipeline(self, model_name, revision, from_flax):
+        """Return a fill-mask pipeline for ``model_name``.
+
+        Flax checkpoints are converted and exported to ``exported_pytorch_model``
+        first; ``revision`` is only used on that path.
+        """
+        if not from_flax:
+            return pipeline('fill-mask', model=model_name)
+        model = AutoModelForMaskedLM.from_pretrained(model_name, from_flax=True, revision=revision)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        tokenizer.save_pretrained('exported_pytorch_model')
+        model.save_pretrained('exported_pytorch_model')
+        return pipeline('fill-mask', model="exported_pytorch_model")
+
+    def _run_full_test_row(self, text, print_debug=False):
+        """Mask every whitespace-separated word of ``text`` in turn and predict it.
+
+        Returns a list of ``{"prediction", "true_output"}`` dicts, one per word,
+        and appends a record per word to ``self.full_text_results``.
+        """
+        return_data = []
+        words = text.split()
+        mask_token = self.nlp.tokenizer.mask_token
+        for i, expected_result in enumerate(words):
+            masked_text = " ".join(words[:i]) + " " + mask_token + " " + " ".join(words[i+1:])
+            result = self.nlp(masked_text)
+            top_token = result[0]["token_str"]
+            self.full_text_results.append({"text": masked_text, "result": top_token, "true_output": expected_result})
+            if print_debug:
+                print(masked_text)
+                print([x["token_str"] for x in result])
+                print("-"*20)
+            return_data.append({"prediction": top_token, "true_output": expected_result})
+        return return_data
+
+    def _run_targeted_test_row(self, text, expected_result, print_debug=False):
+        """Predict the ``<mask>`` placeholder of ``text``.
+
+        Returns a one-element list with a ``{"prediction", "true_output"}`` dict
+        and appends a record to ``self.targeted_text_results``.
+        """
+        # The fixtures use "<mask>"; swap in the tokenizer's own mask token.
+        result = self.nlp(text.replace("<mask>", self.nlp.tokenizer.mask_token))
+        top_token = result[0]["token_str"]
+        self.targeted_text_results.append({"text": text, "result": top_token, "true_output": expected_result})
+        if print_debug:
+            print(text)
+            print([x["token_str"] for x in result])
+            print("-"*20)
+        return [{"prediction": top_token, "true_output": expected_result}]
+
+    def _compute_acc(self, results):
+        """Return the fraction of ``results`` whose prediction is correct.
+
+        ``true_output`` is either a plain value or a JSON-encoded list of
+        accepted answers. Returns ``0.0`` for an empty ``results``.
+        """
+        if not results:
+            return 0.0
+        correct = 0
+        for row in results:
+            prediction = row["prediction"]
+            expected = row["true_output"]
+            try:
+                parsed = json.loads(expected)
+            except (TypeError, ValueError):
+                # Not JSON (plain word) or not a string at all (e.g. NaN cell).
+                parsed = None
+            if isinstance(parsed, list):
+                if prediction in parsed:
+                    correct += 1
+            elif prediction == expected:
+                # Also covers values that parse as non-list JSON, such as "500".
+                correct += 1
+        return float(correct / len(results))
+
+    def _run_suite(self, text_df, run_row, results_attr, output_csv, exclude_user_ids):
+        """Evaluate every configured model on ``text_df`` and save the predictions.
+
+        Args:
+            text_df: Fixture frame with a ``user_id`` column.
+            run_row: Callable taking one fixture row and returning the row results.
+            results_attr: Name of the attribute the row runner appends records to.
+            output_csv: File the combined predictions are written to.
+            exclude_user_ids: Container of user ids to skip, or ``None``.
+        """
+        import traceback
+
+        excluded = () if exclude_user_ids is None else exclude_user_ids
+        df = pd.DataFrame()
+        for _, config in self.config_df.iterrows():
+            setattr(self, results_attr, [])
+            model_name = config["model_name"]
+            display_name = config["display_name"] or model_name
+            revision = config["revision"] or "main"
+            self.nlp = self._load_pipeline(model_name, revision, config["from_flax"])
+            accs = []
+            try:
+                for _, sample in text_df.iterrows():
+                    if sample["user_id"] in excluded:
+                        continue
+                    results = run_row(sample)
+                    if not results:
+                        # Nothing was predicted for this row; do not let it skew the mean.
+                        continue
+                    accs.append(self._compute_acc(results))
+            except Exception:
+                # Best effort per model: report the failure and move on to the next one.
+                print(traceback.format_exc())
+                print("Error for", display_name)
+                continue
+            if accs:
+                print(display_name, " Average acc:", sum(accs)/len(accs))
+            else:
+                print(display_name, " No rows evaluated")
+            model_results = getattr(self, results_attr)
+            if df.empty:
+                df = pd.DataFrame(model_results)
+                df.rename(columns={"result": display_name}, inplace=True)
+            else:
+                df[display_name] = [x["result"] for x in model_results]
+        df.to_csv(output_csv, index=False)
+        print("Results saved to " + output_csv)
+
+    def run_full_test(self, exclude_user_ids=None, print_debug=False):
+        """Run the mask-every-word test for all models; writes ``full_text_results.csv``."""
+        self._run_suite(
+            self.full_text_df,
+            lambda sample: self._run_full_test_row(sample["text"], print_debug=print_debug),
+            "full_text_results",
+            "full_text_results.csv",
+            exclude_user_ids,
+        )
+
+    def run_targeted_test(self, exclude_user_ids=None, print_debug=False):
+        """Run the targeted-mask test for all models; writes ``targeted_text_results.csv``."""
+        self._run_suite(
+            self.targeted_text_df,
+            lambda sample: self._run_targeted_test_row(sample["text"], sample["output"], print_debug=print_debug),
+            "targeted_text_results",
+            "targeted_text_results.csv",
+            exclude_user_ids,
+        )

multiapp.py ADDED Viewed

	@@ -0,0 +1,22 @@

+"""Frameworks for running multiple Streamlit applications as a single app.
+"""
+import streamlit as st
+class MultiApp:
+    """Keeps a list of titled page functions and runs the one picked in the sidebar."""
+
+    def __init__(self):
+        """Start with no registered pages."""
+        self.apps = []
+
+    def add_app(self, title, func):
+        """Register ``func`` as a page listed under ``title``."""
+        entry = dict(title=title, function=func)
+        self.apps.append(entry)
+
+    def run(self):
+        """Draw the sidebar navigation and call the selected page's function."""
+        def page_title(entry):
+            return entry['title']
+
+        st.sidebar.header('Navigation')
+        selected = st.sidebar.radio('', self.apps, format_func=page_title)
+        selected['function']()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit
+torch
+transformers
+jax
+flax
+pandas
+scikit-learn