wzkariampuzha committed
Commit fc731db
1 Parent(s): 98ac1da

upload-nltk-data
app.py CHANGED
@@ -1,6 +1,7 @@
 import nltk
-nltk.download('stopwords')
-nltk.download('punkt')
+nltk.data.path.append("./nltk_data/")
+#nltk.download('stopwords')
+#nltk.download('punkt')
 import pandas as pd
 import classify_abs
 import extract_abs
@@ -20,7 +21,7 @@ st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4GARD/re
 
 #### TITLE ####
 st.title("Epidemiology Information Extraction Pipeline for Rare Diseases")
-#st.subheader("National Center for Advancing Translational Sciences (NIH/NCATS)")
+#st.subheader("National Center for Advancing Translational Sciences (NIH/NCATS)")
 
 #### CHANGE SIDEBAR WIDTH ###
 st.markdown(
@@ -58,21 +59,21 @@ def load_models_experimental():
     NER_pipeline, entity_classes = extract_abs.init_NER_pipeline()
     GARD_dict, max_length = extract_abs.load_GARD_diseases()
     return classify_model_vars, NER_pipeline, entity_classes, GARD_dict, max_length
-
+
 #### DOWNLOAD FUNCTION ####
 
 @st.cache
 def convert_df(df):
     # IMPORTANT: Cache the conversion to prevent computation on every rerun
     return df.to_csv().encode('utf-8')
-
+
 #### SANKEY FUNCTION ####
 
 #@st.cache(allow_output_mutation=True)
 @st.experimental_singleton()
 def epi_sankey(sankey_data, disease_or_gard_id):
     gathered, relevant, epidemiologic = sankey_data
-
+
     fig = go.Figure(data=[go.Sankey(
         node = dict(
             pad = 15,
@@ -92,7 +93,7 @@ def epi_sankey(sankey_data, disease_or_gard_id):
         title="Search for the Epidemiology of "+disease_or_gard_id,
         font=dict(size = 10, color = 'black'),
     )
-
+
     return fig
 
 #### BEGIN APP ####
@@ -111,8 +112,8 @@ st.markdown("A full list of rare diseases tracked by GARD can be found [here](ht
 
 if disease_or_gard_id:
     df, sankey_data = extract_abs.streamlit_extraction(disease_or_gard_id, max_results, filtering,
-                                            NER_pipeline, entity_classes,
-                                            extract_diseases,GARD_dict, max_length,
+                                            NER_pipeline, entity_classes,
+                                            extract_diseases,GARD_dict, max_length,
                                             classify_model_vars)
     st.dataframe(df, height=100)
     csv = convert_df(df)
@@ -124,7 +125,7 @@ if disease_or_gard_id:
     )
     #st.dataframe(data=None, width=None, height=None)
     fig = epi_sankey(sankey_data,disease_or_gard_id)
-
+
     #if st.button('Display Sankey Diagram of Automated Search'):
     st.plotly_chart(fig, use_container_width=True)
-    # st.code(body, language="python")
+    # st.code(body, language="python")
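The net effect of the change above is that app.py no longer downloads 'punkt' and 'stopwords' at startup; instead it appends the repo-local nltk_data/ directory (added in this commit) to NLTK's resource search path. A minimal sketch of that pattern follows. It assumes the data sits under the tokenizers/ and corpora/ subdirectories that NLTK's lookup normally expects, and the ensure_nltk_data helper is illustrative, not part of app.py:

import nltk

# Search the repo-local directory first (the same call app.py now makes).
nltk.data.path.append("./nltk_data/")

def ensure_nltk_data():
    # Hypothetical helper: check that the bundled resources can be found,
    # and fall back to a one-time download only if a local copy is missing.
    for pkg, resource in [("punkt", "tokenizers/punkt"),
                          ("stopwords", "corpora/stopwords")]:
        try:
            nltk.data.find(resource)  # raises LookupError if the resource is absent
        except LookupError:
            nltk.download(pkg, download_dir="./nltk_data/")
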
nltk_data/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File   Language   Source   Contents   Size of training corpus (in tokens)   Model contributed by
+ ====================================================================================================
+ czech.pickle   Czech   Multilingual Corpus 1 (ECI)   Lidove Noviny   ~345,000   Jan Strunk / Tibor Kiss
+     Literarni Noviny
+ ----------------------------------------------------------------------------------------------------
+ danish.pickle   Danish   Avisdata CD-Rom Ver. 1.1. 1995   Berlingske Tidende   ~550,000   Jan Strunk / Tibor Kiss
+     (Berlingske Avisdata, Copenhagen)   Weekend Avisen
+ ----------------------------------------------------------------------------------------------------
+ dutch.pickle   Dutch   Multilingual Corpus 1 (ECI)   De Limburger   ~340,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ english.pickle   English   Penn Treebank (LDC)   Wall Street Journal   ~469,000   Jan Strunk / Tibor Kiss
+     (American)
+ ----------------------------------------------------------------------------------------------------
+ estonian.pickle   Estonian   University of Tartu, Estonia   Eesti Ekspress   ~359,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ finnish.pickle   Finnish   Finnish Parole Corpus, Finnish   Books and major national   ~364,000   Jan Strunk / Tibor Kiss
+     Text Bank (Suomen Kielen   newspapers
+     Tekstipankki)
+     Finnish Center for IT Science
+     (CSC)
+ ----------------------------------------------------------------------------------------------------
+ french.pickle   French   Multilingual Corpus 1 (ECI)   Le Monde   ~370,000   Jan Strunk / Tibor Kiss
+     (European)
+ ----------------------------------------------------------------------------------------------------
+ german.pickle   German   Neue Zürcher Zeitung AG   Neue Zürcher Zeitung   ~847,000   Jan Strunk / Tibor Kiss
+     (Switzerland)   CD-ROM
+     (Uses "ss"
+     instead of "ß")
+ ----------------------------------------------------------------------------------------------------
+ greek.pickle   Greek   Efstathios Stamatatos   To Vima (TO BHMA)   ~227,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ italian.pickle   Italian   Multilingual Corpus 1 (ECI)   La Stampa, Il Mattino   ~312,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ norwegian.pickle   Norwegian   Centre for Humanities   Bergens Tidende   ~479,000   Jan Strunk / Tibor Kiss
+     (Bokmål and   Information Technologies,
+     Nynorsk)   Bergen
+ ----------------------------------------------------------------------------------------------------
+ polish.pickle   Polish   Polish National Corpus   Literature, newspapers, etc.   ~1,000,000   Krzysztof Langner
+     (http://www.nkjp.pl/)
+ ----------------------------------------------------------------------------------------------------
+ portuguese.pickle   Portuguese   CETENFolha Corpus   Folha de São Paulo   ~321,000   Jan Strunk / Tibor Kiss
+     (Brazilian)   (Linguateca)
+ ----------------------------------------------------------------------------------------------------
+ slovene.pickle   Slovene   TRACTOR   Delo   ~354,000   Jan Strunk / Tibor Kiss
+     Slovene Academy for Arts
+     and Sciences
+ ----------------------------------------------------------------------------------------------------
+ spanish.pickle   Spanish   Multilingual Corpus 1 (ECI)   Sur   ~353,000   Jan Strunk / Tibor Kiss
+     (European)
+ ----------------------------------------------------------------------------------------------------
+ swedish.pickle   Swedish   Multilingual Corpus 1 (ECI)   Dagens Nyheter   ~339,000   Jan Strunk / Tibor Kiss
+     (and some other texts)
+ ----------------------------------------------------------------------------------------------------
+ turkish.pickle   Turkish   METU Turkish Corpus   Milliyet   ~333,000   Jan Strunk / Tibor Kiss
+     (Türkçe Derlem Projesi)
+     University of Ankara
+ ----------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
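The pickled models above can be read back the same way the training code dumps them. A short usage sketch (not part of this commit) that loads the English model bundled at nltk_data/punkt/PY3/english.pickle and splits text into sentences; the sample sentence is only an example:

import pickle

# Load the pretrained English Punkt tokenizer shipped in this commit.
with open("nltk_data/punkt/PY3/english.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Split raw text into sentences with the unpickled PunktSentenceTokenizer.
sentences = tokenizer.tokenize("Prevalence is about 1 in 50,000. Incidence is unknown.")
print(sentences)
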
nltk_data/punkt/PY3/english.pickle ADDED
Binary file (407 kB).
 
nltk_data/punkt/README ADDED
(File contents are identical to nltk_data/punkt/PY3/README above.)
nltk_data/punkt/english.pickle ADDED
The diff for this file is too large to render.
 
nltk_data/stopwords/README ADDED
@@ -0,0 +1,32 @@
+ Stopwords Corpus
+
+ This corpus contains lists of stop words for several languages. These
+ are high-frequency grammatical words which are usually ignored in text
+ retrieval applications.
+
+ They were obtained from:
+ http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
+
+ The stop words for the Romanian language were obtained from:
+ http://arlc.ro/resources/
+
+ The English list has been augmented
+ https://github.com/nltk/nltk_data/issues/22
+
+ The German list has been corrected
+ https://github.com/nltk/nltk_data/pull/49
+
+ A Kazakh list has been added
+ https://github.com/nltk/nltk_data/pull/52
+
+ A Nepali list has been added
+ https://github.com/nltk/nltk_data/pull/83
+
+ An Azerbaijani list has been added
+ https://github.com/nltk/nltk_data/pull/100
+
+ A Greek list has been added
+ https://github.com/nltk/nltk_data/pull/103
+
+ An Indonesian list has been added
+ https://github.com/nltk/nltk_data/pull/112
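The list is normally consumed through nltk.corpus.stopwords.words('english'), which requires the corpus to sit under a corpora/ directory on NLTK's search path. A small sketch (not part of this commit) that instead reads the bundled file directly, since nltk_data/stopwords/english is plain text with one word per line:

# Read the bundled English stop word list (one word per line).
with open("nltk_data/stopwords/english", encoding="utf-8") as f:
    stop_words = set(f.read().split())

# Example: drop stop words from a tokenized phrase.
tokens = [t for t in "the prevalence of the disease".split() if t not in stop_words]
print(tokens)  # ['prevalence', 'disease']
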
nltk_data/stopwords/english ADDED
@@ -0,0 +1,179 @@
+ i
+ me
+ my
+ myself
+ we
+ our
+ ours
+ ourselves
+ you
+ you're
+ you've
+ you'll
+ you'd
+ your
+ yours
+ yourself
+ yourselves
+ he
+ him
+ his
+ himself
+ she
+ she's
+ her
+ hers
+ herself
+ it
+ it's
+ its
+ itself
+ they
+ them
+ their
+ theirs
+ themselves
+ what
+ which
+ who
+ whom
+ this
+ that
+ that'll
+ these
+ those
+ am
+ is
+ are
+ was
+ were
+ be
+ been
+ being
+ have
+ has
+ had
+ having
+ do
+ does
+ did
+ doing
+ a
+ an
+ the
+ and
+ but
+ if
+ or
+ because
+ as
+ until
+ while
+ of
+ at
+ by
+ for
+ with
+ about
+ against
+ between
+ into
+ through
+ during
+ before
+ after
+ above
+ below
+ to
+ from
+ up
+ down
+ in
+ out
+ on
+ off
+ over
+ under
+ again
+ further
+ then
+ once
+ here
+ there
+ when
+ where
+ why
+ how
+ all
+ any
+ both
+ each
+ few
+ more
+ most
+ other
+ some
+ such
+ no
+ nor
+ not
+ only
+ own
+ same
+ so
+ than
+ too
+ very
+ s
+ t
+ can
+ will
+ just
+ don
+ don't
+ should
+ should've
+ now
+ d
+ ll
+ m
+ o
+ re
+ ve
+ y
+ ain
+ aren
+ aren't
+ couldn
+ couldn't
+ didn
+ didn't
+ doesn
+ doesn't
+ hadn
+ hadn't
+ hasn
+ hasn't
+ haven
+ haven't
+ isn
+ isn't
+ ma
+ mightn
+ mightn't
+ mustn
+ mustn't
+ needn
+ needn't
+ shan
+ shan't
+ shouldn
+ shouldn't
+ wasn
+ wasn't
+ weren
+ weren't
+ won
+ won't
+ wouldn
+ wouldn't