wzkariampuzha committed
Commit fc731db
1 Parent(s): 98ac1da

upload-nltk-data
app.py CHANGED
@@ -1,6 +1,7 @@
 import nltk
-nltk.download('stopwords')
-nltk.download('punkt')
+nltk.data.path.append("./nltk_data/")
+#nltk.download('stopwords')
+#nltk.download('punkt')
 import pandas as pd
 import classify_abs
 import extract_abs
@@ -20,7 +21,7 @@ st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4GARD/re
 
 #### TITLE ####
 st.title("Epidemiology Information Extraction Pipeline for Rare Diseases")
-#st.subheader("National Center for Advancing Translational Sciences (NIH/NCATS)")
+#st.subheader("National Center for Advancing Translational Sciences (NIH/NCATS)")
 
 #### CHANGE SIDEBAR WIDTH ###
 st.markdown(
@@ -58,21 +59,21 @@ def load_models_experimental():
     NER_pipeline, entity_classes = extract_abs.init_NER_pipeline()
     GARD_dict, max_length = extract_abs.load_GARD_diseases()
     return classify_model_vars, NER_pipeline, entity_classes, GARD_dict, max_length
-
+
 #### DOWNLOAD FUNCTION ####
 
 @st.cache
 def convert_df(df):
     # IMPORTANT: Cache the conversion to prevent computation on every rerun
     return df.to_csv().encode('utf-8')
-
+
 #### SANKEY FUNCTION ####
 
 #@st.cache(allow_output_mutation=True)
 @st.experimental_singleton()
 def epi_sankey(sankey_data, disease_or_gard_id):
     gathered, relevant, epidemiologic = sankey_data
-
+
     fig = go.Figure(data=[go.Sankey(
         node = dict(
             pad = 15,
@@ -92,7 +93,7 @@ def epi_sankey(sankey_data, disease_or_gard_id):
         title="Search for the Epidemiology of "+disease_or_gard_id,
         font=dict(size = 10, color = 'black'),
     )
-
+
     return fig
 
 #### BEGIN APP ####
@@ -111,8 +112,8 @@ st.markdown("A full list of rare diseases tracked by GARD can be found [here](ht
 
 if disease_or_gard_id:
     df, sankey_data = extract_abs.streamlit_extraction(disease_or_gard_id, max_results, filtering,
-                                            NER_pipeline, entity_classes,
-                                            extract_diseases,GARD_dict, max_length,
+                                            NER_pipeline, entity_classes,
+                                            extract_diseases,GARD_dict, max_length,
                                             classify_model_vars)
     st.dataframe(df, height=100)
     csv = convert_df(df)
@@ -124,7 +125,7 @@ if disease_or_gard_id:
     )
     #st.dataframe(data=None, width=None, height=None)
     fig = epi_sankey(sankey_data,disease_or_gard_id)
-
+
     #if st.button('Display Sankey Diagram of Automated Search'):
     st.plotly_chart(fig, use_container_width=True)
-    # st.code(body, language="python")
+    # st.code(body, language="python")
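The net effect of the change above is that app.py no longer downloads 'punkt' and 'stopwords' at startup; instead it appends the repo-local nltk_data/ directory (added in this commit) to NLTK's resource search path. A minimal sketch of that pattern follows. It assumes the data sits under the tokenizers/ and corpora/ subdirectories that NLTK's lookup normally expects, and the ensure_nltk_data helper is illustrative, not part of app.py:

import nltk

# Search the repo-local directory first (the same call app.py now makes).
nltk.data.path.append("./nltk_data/")

def ensure_nltk_data():
    # Hypothetical helper: check that the bundled resources can be found,
    # and fall back to a one-time download only if a local copy is missing.
    for pkg, resource in [("punkt", "tokenizers/punkt"),
                          ("stopwords", "corpora/stopwords")]:
        try:
            nltk.data.find(resource)  # raises LookupError if the resource is absent
        except LookupError:
            nltk.download(pkg, download_dir="./nltk_data/")
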
nltk_data/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File   Language   Source   Contents   Size of training corpus (in tokens)   Model contributed by
+ ====================================================================================================
+ czech.pickle   Czech   Multilingual Corpus 1 (ECI)   Lidove Noviny   ~345,000   Jan Strunk / Tibor Kiss
+     Literarni Noviny
+ ----------------------------------------------------------------------------------------------------
+ danish.pickle   Danish   Avisdata CD-Rom Ver. 1.1. 1995   Berlingske Tidende   ~550,000   Jan Strunk / Tibor Kiss
+     (Berlingske Avisdata, Copenhagen)   Weekend Avisen
+ ----------------------------------------------------------------------------------------------------
+ dutch.pickle   Dutch   Multilingual Corpus 1 (ECI)   De Limburger   ~340,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ english.pickle   English   Penn Treebank (LDC)   Wall Street Journal   ~469,000   Jan Strunk / Tibor Kiss
+     (American)
+ ----------------------------------------------------------------------------------------------------
+ estonian.pickle   Estonian   University of Tartu, Estonia   Eesti Ekspress   ~359,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ finnish.pickle   Finnish   Finnish Parole Corpus, Finnish   Books and major national   ~364,000   Jan Strunk / Tibor Kiss
+     Text Bank (Suomen Kielen   newspapers
+     Tekstipankki)
+     Finnish Center for IT Science
+     (CSC)
+ ----------------------------------------------------------------------------------------------------
+ french.pickle   French   Multilingual Corpus 1 (ECI)   Le Monde   ~370,000   Jan Strunk / Tibor Kiss
+     (European)
+ ----------------------------------------------------------------------------------------------------
+ german.pickle   German   Neue Zürcher Zeitung AG   Neue Zürcher Zeitung   ~847,000   Jan Strunk / Tibor Kiss
+     (Switzerland)   CD-ROM
+     (Uses "ss"
+     instead of "ß")
+ ----------------------------------------------------------------------------------------------------
+ greek.pickle   Greek   Efstathios Stamatatos   To Vima (TO BHMA)   ~227,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ italian.pickle   Italian   Multilingual Corpus 1 (ECI)   La Stampa, Il Mattino   ~312,000   Jan Strunk / Tibor Kiss
+ ----------------------------------------------------------------------------------------------------
+ norwegian.pickle   Norwegian   Centre for Humanities   Bergens Tidende   ~479,000   Jan Strunk / Tibor Kiss
+     (Bokmål and   Information Technologies,
+     Nynorsk)   Bergen
+ ----------------------------------------------------------------------------------------------------
+ polish.pickle   Polish   Polish National Corpus   Literature, newspapers, etc.   ~1,000,000   Krzysztof Langner
+     (http://www.nkjp.pl/)
+ ----------------------------------------------------------------------------------------------------
+ portuguese.pickle   Portuguese   CETENFolha Corpus   Folha de São Paulo   ~321,000   Jan Strunk / Tibor Kiss
+     (Brazilian)   (Linguateca)
+ ----------------------------------------------------------------------------------------------------
+ slovene.pickle   Slovene   TRACTOR   Delo   ~354,000   Jan Strunk / Tibor Kiss
+     Slovene Academy for Arts
+     and Sciences
+ ----------------------------------------------------------------------------------------------------
+ spanish.pickle   Spanish   Multilingual Corpus 1 (ECI)   Sur   ~353,000   Jan Strunk / Tibor Kiss
+     (European)
+ ----------------------------------------------------------------------------------------------------
+ swedish.pickle   Swedish   Multilingual Corpus 1 (ECI)   Dagens Nyheter   ~339,000   Jan Strunk / Tibor Kiss
+     (and some other texts)
+ ----------------------------------------------------------------------------------------------------
+ turkish.pickle   Turkish   METU Turkish Corpus   Milliyet   ~333,000   Jan Strunk / Tibor Kiss
+     (Türkçe Derlem Projesi)
+     University of Ankara
+ ----------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
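The pickled models above can be read back the same way the training code dumps them. A short usage sketch (not part of this commit) that loads the English model bundled at nltk_data/punkt/PY3/english.pickle and splits text into sentences; the sample sentence is only an example:

import pickle

# Load the pretrained English Punkt tokenizer shipped in this commit.
with open("nltk_data/punkt/PY3/english.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Split raw text into sentences with the unpickled PunktSentenceTokenizer.
sentences = tokenizer.tokenize("Prevalence is about 1 in 50,000. Incidence is unknown.")
print(sentences)
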
nltk_data/punkt/PY3/english.pickle ADDED
Binary file (407 kB).
 
nltk_data/punkt/README ADDED
(File contents are identical to nltk_data/punkt/PY3/README above.)
nltk_data/punkt/english.pickle ADDED
The diff for this file is too large to render.
 
nltk_data/stopwords/README ADDED
@@ -0,0 +1,32 @@
+ Stopwords Corpus
+
+ This corpus contains lists of stop words for several languages. These
+ are high-frequency grammatical words which are usually ignored in text
+ retrieval applications.
+
+ They were obtained from:
+ http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
+
+ The stop words for the Romanian language were obtained from:
+ http://arlc.ro/resources/
+
+ The English list has been augmented
+ https://github.com/nltk/nltk_data/issues/22
+
+ The German list has been corrected
+ https://github.com/nltk/nltk_data/pull/49
+
+ A Kazakh list has been added
+ https://github.com/nltk/nltk_data/pull/52
+
+ A Nepali list has been added
+ https://github.com/nltk/nltk_data/pull/83
+
+ An Azerbaijani list has been added
+ https://github.com/nltk/nltk_data/pull/100
+
+ A Greek list has been added
+ https://github.com/nltk/nltk_data/pull/103
+
+ An Indonesian list has been added
+ https://github.com/nltk/nltk_data/pull/112
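The list is normally consumed through nltk.corpus.stopwords.words('english'), which requires the corpus to sit under a corpora/ directory on NLTK's search path. A small sketch (not part of this commit) that instead reads the bundled file directly, since nltk_data/stopwords/english is plain text with one word per line:

# Read the bundled English stop word list (one word per line).
with open("nltk_data/stopwords/english", encoding="utf-8") as f:
    stop_words = set(f.read().split())

# Example: drop stop words from a tokenized phrase.
tokens = [t for t in "the prevalence of the disease".split() if t not in stop_words]
print(tokens)  # ['prevalence', 'disease']
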
nltk_data/stopwords/english ADDED
@@ -0,0 +1,179 @@
+ i
+ me
+ my
+ myself
+ we
+ our
+ ours
+ ourselves
+ you
+ you're
+ you've
+ you'll
+ you'd
+ your
+ yours
+ yourself
+ yourselves
+ he
+ him
+ his
+ himself
+ she
+ she's
+ her
+ hers
+ herself
+ it
+ it's
+ its
+ itself
+ they
+ them
+ their
+ theirs
+ themselves
+ what
+ which
+ who
+ whom
+ this
+ that
+ that'll
+ these
+ those
+ am
+ is
+ are
+ was
+ were
+ be
+ been
+ being
+ have
+ has
+ had
+ having
+ do
+ does
+ did
+ doing
+ a
+ an
+ the
+ and
+ but
+ if
+ or
+ because
+ as
+ until
+ while
+ of
+ at
+ by
+ for
+ with
+ about
+ against
+ between
+ into
+ through
+ during
+ before
+ after
+ above
+ below
+ to
+ from
+ up
+ down
+ in
+ out
+ on
+ off
+ over
+ under
+ again
+ further
+ then
+ once
+ here
+ there
+ when
+ where
+ why
+ how
+ all
+ any
+ both
+ each
+ few
+ more
+ most
+ other
+ some
+ such
+ no
+ nor
+ not
+ only
+ own
+ same
+ so
+ than
+ too
+ very
+ s
+ t
+ can
+ will
+ just
+ don
+ don't
+ should
+ should've
+ now
+ d
+ ll
+ m
+ o
+ re
+ ve
+ y
+ ain
+ aren
+ aren't
+ couldn
+ couldn't
+ didn
+ didn't
+ doesn
+ doesn't
+ hadn
+ hadn't
+ hasn
+ hasn't
+ haven
+ haven't
+ isn
+ isn't
+ ma
+ mightn
+ mightn't
+ mustn
+ mustn't
+ needn
+ needn't
+ shan
+ shan't
+ shouldn
+ shouldn't
+ wasn
+ wasn't
+ weren
+ weren't
+ won
+ won't
+ wouldn
+ wouldn't