Djacon commited on
Commit
0ad1e5d
1 Parent(s): 123b321

bypass nltk problem

Browse files
.gitignore CHANGED
@@ -1,2 +1 @@
1
- __pycache__/
2
- nltk-data
 
1
+ __pycache__/
 
extract.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import nltk
3
- nltk.download('punkt', download_dir=os.path.join(os.getcwd(), 'nltk-data'))
4
  print(nltk.data.path)
5
 
6
  from nltk.cluster.util import cosine_distance
 
1
  import os
2
  import nltk
3
+ nltk.data.path.append(os.path.join(os.getcwd(), 'nltk-data'))
4
  print(nltk.data.path)
5
 
6
  from nltk.cluster.util import cosine_distance
nltk-data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
3
+ size 13905355
nltk-data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB). View file
 
nltk-data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
2
+
3
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
4
+ been contributed by various people using NLTK for sentence boundary detection.
5
+
6
+ For information about how to use these models, please confer the tokenization HOWTO:
7
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
8
+ and chapter 3.8 of the NLTK book:
9
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
10
+
11
+ There are pretrained tokenizers for the following languages:
12
+
13
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
14
+ =======================================================================================================================================================================
15
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
16
+ Literarni Noviny
17
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
18
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
19
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
20
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
21
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
22
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
23
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
24
+ (American)
25
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
27
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
28
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
29
+ Text Bank (Suomen Kielen newspapers
30
+ Tekstipankki)
31
+ Finnish Center for IT Science
32
+ (CSC)
33
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
34
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
35
+ (European)
36
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
37
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
38
+ (Switzerland) CD-ROM
39
+ (Uses "ss"
40
+ instead of "ß")
41
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
43
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
44
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
45
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
46
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
47
+ (Bokmål and Information Technologies,
48
+ Nynorsk) Bergen
49
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
50
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
51
+ (http://www.nkjp.pl/)
52
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
53
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
54
+ (Brazilian) (Linguateca)
55
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
56
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
57
+ Slovene Academy for Arts
58
+ and Sciences
59
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
60
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
61
+ (European)
62
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
63
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
64
+ (and some other texts)
65
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
66
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
67
+ (Türkçe Derlem Projesi)
68
+ University of Ankara
69
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
70
+
71
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
72
+ Unicode using the codecs module.
73
+
74
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
75
+ Computational Linguistics 32: 485-525.
76
+
77
+ ---- Training Code ----
78
+
79
+ # import punkt
80
+ import nltk.tokenize.punkt
81
+
82
+ # Make a new Tokenizer
83
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
84
+
85
+ # Read in training corpus (one example: Slovene)
86
+ import codecs
87
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
88
+
89
+ # Train tokenizer
90
+ tokenizer.train(text)
91
+
92
+ # Dump pickled tokenizer
93
+ import pickle
94
+ out = open("slovene.pickle","wb")
95
+ pickle.dump(tokenizer, out)
96
+ out.close()
97
+
98
+ ---------
nltk-data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
3
+ size 406697
nltk-data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
3
+ size 33020
nltk-data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
2
+
3
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
4
+ been contributed by various people using NLTK for sentence boundary detection.
5
+
6
+ For information about how to use these models, please confer the tokenization HOWTO:
7
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
8
+ and chapter 3.8 of the NLTK book:
9
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
10
+
11
+ There are pretrained tokenizers for the following languages:
12
+
13
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
14
+ =======================================================================================================================================================================
15
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
16
+ Literarni Noviny
17
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
18
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
19
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
20
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
21
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
22
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
23
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
24
+ (American)
25
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
27
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
28
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
29
+ Text Bank (Suomen Kielen newspapers
30
+ Tekstipankki)
31
+ Finnish Center for IT Science
32
+ (CSC)
33
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
34
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
35
+ (European)
36
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
37
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
38
+ (Switzerland) CD-ROM
39
+ (Uses "ss"
40
+ instead of "ß")
41
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
43
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
44
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
45
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
46
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
47
+ (Bokmål and Information Technologies,
48
+ Nynorsk) Bergen
49
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
50
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
51
+ (http://www.nkjp.pl/)
52
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
53
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
54
+ (Brazilian) (Linguateca)
55
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
56
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
57
+ Slovene Academy for Arts
58
+ and Sciences
59
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
60
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
61
+ (European)
62
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
63
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
64
+ (and some other texts)
65
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
66
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
67
+ (Türkçe Derlem Projesi)
68
+ University of Ankara
69
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
70
+
71
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
72
+ Unicode using the codecs module.
73
+
74
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
75
+ Computational Linguistics 32: 485-525.
76
+
77
+ ---- Training Code ----
78
+
79
+ # import punkt
80
+ import nltk.tokenize.punkt
81
+
82
+ # Make a new Tokenizer
83
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
84
+
85
+ # Read in training corpus (one example: Slovene)
86
+ import codecs
87
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
88
+
89
+ # Train tokenizer
90
+ tokenizer.train(text)
91
+
92
+ # Dump pickled tokenizer
93
+ import pickle
94
+ out = open("slovene.pickle","wb")
95
+ pickle.dump(tokenizer, out)
96
+ out.close()
97
+
98
+ ---------
nltk-data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
3
+ size 433305
nltk-data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
3
+ size 33027