ranajoy98 committed on
Commit
f380bb8
•
1 Parent(s): a8426b5

Upload hunflair_test.ipynb

Browse files
Files changed (1) hide show
  1. hunflair_test.ipynb +318 -0
hunflair_test.ipynb ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "C:\\Users\\ranajoy.bhattacharya\\Anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
13
+ " warnings.warn(msg)\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "from flair.data import Sentence\n",
19
+ "from flair.models import MultiTagger"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from flair.tokenization import SciSpacyTokenizer"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 7,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "sentence = Sentence(\"Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome\",\n",
38
+ " use_tokenizer=SciSpacyTokenizer())\n",
39
+ "\n"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 4,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "name": "stdout",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "2022-10-27 15:38:14,268 https://nlp.informatik.hu-berlin.de/resources/models/hunflair_smallish_models/cellline/hunflair-celline-v1.0.pt not found in cache, downloading to C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpsn3vv9l1\n"
52
+ ]
53
+ },
54
+ {
55
+ "name": "stderr",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "100%|██████████████████████████████████████████████████████████████| 1104886372/1104886372 [02:45<00:00, 6695019.03B/s]"
59
+ ]
60
+ },
61
+ {
62
+ "name": "stdout",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "2022-10-27 15:40:59,814 copying C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpsn3vv9l1 to cache at C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-celline-v1.0.pt\n"
66
+ ]
67
+ },
68
+ {
69
+ "name": "stderr",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "\n"
73
+ ]
74
+ },
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "2022-10-27 15:41:02,302 removing temp file C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpsn3vv9l1\n",
80
+ "2022-10-27 15:41:02,536 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-celline-v1.0.pt\n",
81
+ "2022-10-27 15:41:08,505 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-CellLine, B-CellLine, I-CellLine, E-CellLine, <START>, <STOP>\n",
82
+ "2022-10-27 15:41:09,549 https://nlp.informatik.hu-berlin.de/resources/models/hunflair_allcorpus_models/huner-chemical/hunflair-chemical-full-v1.0.pt not found in cache, downloading to C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpw4ze4yrc\n"
83
+ ]
84
+ },
85
+ {
86
+ "name": "stderr",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "100%|██████████████████████████████████████████████████████████████| 1104888221/1104888221 [02:06<00:00, 8735803.50B/s]"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stdout",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "2022-10-27 15:43:16,584 copying C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpw4ze4yrc to cache at C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-chemical-full-v1.0.pt\n"
97
+ ]
98
+ },
99
+ {
100
+ "name": "stderr",
101
+ "output_type": "stream",
102
+ "text": [
103
+ "\n"
104
+ ]
105
+ },
106
+ {
107
+ "name": "stdout",
108
+ "output_type": "stream",
109
+ "text": [
110
+ "2022-10-27 15:43:18,094 removing temp file C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpw4ze4yrc\n",
111
+ "2022-10-27 15:43:18,208 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-chemical-full-v1.0.pt\n",
112
+ "2022-10-27 15:43:22,887 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-Chemical, B-Chemical, I-Chemical, E-Chemical, <START>, <STOP>\n",
113
+ "2022-10-27 15:43:23,826 https://nlp.informatik.hu-berlin.de/resources/models/hunflair_allcorpus_models/huner-disease/hunflair-disease-full-v1.0.pt not found in cache, downloading to C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpy18ixxme\n"
114
+ ]
115
+ },
116
+ {
117
+ "name": "stderr",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "100%|█████████████████████████████████████████████████████████████| 1104886193/1104886193 [01:33<00:00, 11804678.71B/s]"
121
+ ]
122
+ },
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "2022-10-27 15:44:57,965 copying C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpy18ixxme to cache at C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-disease-full-v1.0.pt\n"
128
+ ]
129
+ },
130
+ {
131
+ "name": "stderr",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "\n"
135
+ ]
136
+ },
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "2022-10-27 15:44:59,866 removing temp file C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpy18ixxme\n",
142
+ "2022-10-27 15:45:00,019 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-disease-full-v1.0.pt\n",
143
+ "2022-10-27 15:45:06,491 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, B-Disease, E-Disease, I-Disease, S-Disease, <START>, <STOP>\n",
144
+ "2022-10-27 15:45:07,641 https://nlp.informatik.hu-berlin.de/resources/models/hunflair_allcorpus_models/huner-gene/hunflair-gene-full-v1.0.pt not found in cache, downloading to C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmp1qgb402b\n"
145
+ ]
146
+ },
147
+ {
148
+ "name": "stderr",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "100%|█████████████████████████████████████████████████████████████| 1104887211/1104887211 [01:22<00:00, 13387243.47B/s]"
152
+ ]
153
+ },
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "2022-10-27 15:46:30,726 copying C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmp1qgb402b to cache at C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-gene-full-v1.0.pt\n"
159
+ ]
160
+ },
161
+ {
162
+ "name": "stderr",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "\n"
166
+ ]
167
+ },
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "2022-10-27 15:46:32,696 removing temp file C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmp1qgb402b\n",
173
+ "2022-10-27 15:46:32,831 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-gene-full-v1.0.pt\n",
174
+ "2022-10-27 15:46:38,569 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-Gene, B-Gene, I-Gene, E-Gene, <START>, <STOP>\n",
175
+ "2022-10-27 15:46:39,870 https://nlp.informatik.hu-berlin.de/resources/models/hunflair_allcorpus_models/huner-species/hunflair-species-full-v1.1.pt not found in cache, downloading to C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpj1y4y_6v\n"
176
+ ]
177
+ },
178
+ {
179
+ "name": "stderr",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "100%|█████████████████████████████████████████████████████████████| 1104886931/1104886931 [01:16<00:00, 14508392.59B/s]"
183
+ ]
184
+ },
185
+ {
186
+ "name": "stdout",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "2022-10-27 15:47:56,571 copying C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpj1y4y_6v to cache at C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-species-full-v1.1.pt\n"
190
+ ]
191
+ },
192
+ {
193
+ "name": "stderr",
194
+ "output_type": "stream",
195
+ "text": [
196
+ "\n"
197
+ ]
198
+ },
199
+ {
200
+ "name": "stdout",
201
+ "output_type": "stream",
202
+ "text": [
203
+ "2022-10-27 15:47:58,616 removing temp file C:\\Users\\RANAJO~1.BHA\\AppData\\Local\\Temp\\tmpj1y4y_6v\n",
204
+ "2022-10-27 15:47:58,765 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-species-full-v1.1.pt\n",
205
+ "2022-10-27 15:48:04,720 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-Species, B-Species, I-Species, E-Species, <START>, <STOP>\n"
206
+ ]
207
+ }
208
+ ],
209
+ "source": [
210
+ "# load biomedical tagger\n",
211
+ "tagger = MultiTagger.load(\"hunflair\")\n"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 9,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "Span[0:2]: \"Behavioral abnormalities\" → Disease (0.6736)\n",
224
+ "Span[9:12]: \"Fragile X Syndrome\" → Disease (0.99)\n",
225
+ "Span[4:5]: \"Fmr1\" → Gene (0.838)\n",
226
+ "Span[6:7]: \"Mouse\" → Species (0.9979)\n"
227
+ ]
228
+ }
229
+ ],
230
+ "source": [
231
+ "for annotation_layer in sentence.annotation_layers.keys():\n",
232
+ " for entity in sentence.get_spans(annotation_layer):\n",
233
+ " print(entity)"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": 10,
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "name": "stdout",
243
+ "output_type": "stream",
244
+ "text": [
245
+ "2022-10-27 18:06:43,204 loading file C:\\Users\\ranajoy.bhattacharya\\.flair\\models\\hunflair-chemical-full-v1.0.pt\n",
246
+ "2022-10-27 18:06:49,684 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-Chemical, B-Chemical, I-Chemical, E-Chemical, <START>, <STOP>\n"
247
+ ]
248
+ }
249
+ ],
250
+ "source": [
251
+ "sentence = Sentence(\"Polycrystalline Li4+MxSi1-xO4 (M = B, Al) samples were prepared from reagent grade H3BO3 Al2O3, SiO2 and either LiOH*H2O (H samples) or Li2CO3 (C samples). Intimate stoichiometric mixtures were placed in an Al2O3 crucible or an Au foil boat and fired.\")\n",
252
+ "\n",
253
+ "# load biomedical tagger\n",
254
+ "tagger = MultiTagger.load(\"hunflair-chemical\")\n",
255
+ "\n",
256
+ "# tag sentence\n",
257
+ "tagger.predict(sentence)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 13,
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "Span[1:2]: \"Li4\" → Chemical (0.754)\n",
270
+ "Span[3:4]: \"MxSi1-xO4\" → Chemical (0.8234)\n",
271
+ "Span[9:10]: \"Al\" → Chemical (0.9942)\n",
272
+ "Span[17:18]: \"H3BO3\" → Chemical (0.7564)\n",
273
+ "Span[18:19]: \"Al2O3\" → Chemical (0.5558)\n",
274
+ "Span[20:21]: \"SiO2\" → Chemical (0.973)\n",
275
+ "Span[23:24]: \"LiOH\" → Chemical (0.9721)\n",
276
+ "Span[25:26]: \"H2O\" → Chemical (0.8947)\n",
277
+ "Span[31:32]: \"Li2CO3\" → Chemical (0.9224)\n",
278
+ "Span[44:45]: \"Al2O3\" → Chemical (0.9847)\n",
279
+ "Span[48:49]: \"Au\" → Chemical (0.9787)\n"
280
+ ]
281
+ }
282
+ ],
283
+ "source": [
284
+ "for annotation_layer in sentence.annotation_layers.keys():\n",
285
+ " for entity in sentence.get_spans(annotation_layer):\n",
286
+ " print(entity)"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": []
295
+ }
296
+ ],
297
+ "metadata": {
298
+ "kernelspec": {
299
+ "display_name": "Python 3",
300
+ "language": "python",
301
+ "name": "python3"
302
+ },
303
+ "language_info": {
304
+ "codemirror_mode": {
305
+ "name": "ipython",
306
+ "version": 3
307
+ },
308
+ "file_extension": ".py",
309
+ "mimetype": "text/x-python",
310
+ "name": "python",
311
+ "nbconvert_exporter": "python",
312
+ "pygments_lexer": "ipython3",
313
+ "version": "3.7.6"
314
+ }
315
+ },
316
+ "nbformat": 4,
317
+ "nbformat_minor": 4
318
+ }