Anis Taluqdar committed
Commit 8a11254
1 parent: 1339b3a

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. __pycache__/chat.cpython-310.pyc +0 -0
  2. __pycache__/model.cpython-310.pyc +0 -0
  3. __pycache__/nltk_utils.cpython-310.pyc +0 -0
  4. app.py +19 -0
  5. chat.py +84 -0
  6. data.pth +3 -0
  7. intents.json +102 -0
  8. model.py +19 -0
  9. nltk_data/tokenizers/punkt.zip +3 -0
  10. nltk_data/tokenizers/punkt/.DS_Store +0 -0
  11. nltk_data/tokenizers/punkt/PY3/README +98 -0
  12. nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
  13. nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
  14. nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
  15. nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
  16. nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
  17. nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
  18. nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
  19. nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
  20. nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
  21. nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
  22. nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
  23. nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
  24. nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
  25. nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
  26. nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
  27. nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
  28. nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
  29. nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
  30. nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
  31. nltk_data/tokenizers/punkt/README +98 -0
  32. nltk_data/tokenizers/punkt/czech.pickle +3 -0
  33. nltk_data/tokenizers/punkt/danish.pickle +3 -0
  34. nltk_data/tokenizers/punkt/dutch.pickle +3 -0
  35. nltk_data/tokenizers/punkt/english.pickle +3 -0
  36. nltk_data/tokenizers/punkt/estonian.pickle +3 -0
  37. nltk_data/tokenizers/punkt/finnish.pickle +3 -0
  38. nltk_data/tokenizers/punkt/french.pickle +3 -0
  39. nltk_data/tokenizers/punkt/german.pickle +3 -0
  40. nltk_data/tokenizers/punkt/greek.pickle +3 -0
  41. nltk_data/tokenizers/punkt/italian.pickle +3 -0
  42. nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
  43. nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
  44. nltk_data/tokenizers/punkt/polish.pickle +3 -0
  45. nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
  46. nltk_data/tokenizers/punkt/russian.pickle +3 -0
  47. nltk_data/tokenizers/punkt/slovene.pickle +3 -0
  48. nltk_data/tokenizers/punkt/spanish.pickle +3 -0
  49. nltk_data/tokenizers/punkt/swedish.pickle +3 -0
  50. nltk_data/tokenizers/punkt/turkish.pickle +3 -0
__pycache__/chat.cpython-310.pyc ADDED
Binary file (1.47 kB)
 
__pycache__/model.cpython-310.pyc ADDED
Binary file (864 Bytes)
 
__pycache__/nltk_utils.cpython-310.pyc ADDED
Binary file (939 Bytes)
 
app.py ADDED
@@ -0,0 +1,19 @@
+from flask import Flask, render_template, request, jsonify
+from chat import get_response
+
+app = Flask(__name__)
+
+@app.get("/")
+def index_get():
+    return render_template("base.html")
+
+@app.post("/predict")
+def predict():
+    text = request.get_json().get("message")
+    response = get_response(text)
+    message = {"answer": response}
+    return jsonify(message)
+
+
+if __name__ == "__main__":
+    app.run(debug=True)
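Once the app is running (python app.py starts Flask's development server, by default on http://127.0.0.1:5000), the /predict route can be smoke-tested from a separate Python shell. A minimal sketch, assuming the requests package is installed and the default port is used:

import requests  # assumption: `requests` is available; not part of this commit

# POST the JSON shape app.py expects and print the bot's reply.
resp = requests.post(
    "http://127.0.0.1:5000/predict",
    json={"message": "What are your business hours?"},
)
print(resp.json()["answer"])  # e.g. one of the "business_hours" responses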
chat.py ADDED
@@ -0,0 +1,84 @@
+import random
+import json
+
+import torch
+
+from model import NeuralNet
+from nltk_utils import bag_of_words, tokenize
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+with open('intents.json', 'r') as json_data:
+    intents = json.load(json_data)
+
+FILE = 'data.pth'
+data = torch.load(FILE)
+
+input_size = data["input_size"]
+hidden_size = data["hidden_size"]
+output_size = data["output_size"]
+all_words = data["all_words"]
+tags = data["tags"]
+model_state = data["model_state"]
+
+model = NeuralNet(input_size, hidden_size, output_size).to(device)
+model.load_state_dict(model_state)
+model.eval()
+
+bot_name = "Anis"
+# print("Let's chat! (type 'quit' to exit)")
+# while True:
+#     sentence = input("You: ")
+#     if sentence == "quit":
+#         break
+
+#     sentence = tokenize(sentence)
+#     X = bag_of_words(sentence, all_words)
+#     X = X.reshape(1, X.shape[0])
+#     X = torch.from_numpy(X).to(device)
+
+#     output = model(X)
+#     _, predicted = torch.max(output, dim=1)
+
+#     tag = tags[predicted.item()]
+
+#     probs = torch.softmax(output, dim=1)
+#     prob = probs[0][predicted.item()]
+#     if prob.item() > 0.75:
+#         for intent in intents['intents']:
+#             if tag == intent['tag']:
+#                 print(f"{bot_name}: {random.choice(intent['responses'])}")
+#     else:
+#         print(f"{bot_name}: I do not understand...")
+
+def get_response(msg):
+    sentence = tokenize(msg)
+    X = bag_of_words(sentence, all_words)
+    X = X.reshape(1, X.shape[0])
+    X = torch.from_numpy(X).to(device)
+
+    output = model(X)
+    _, predicted = torch.max(output, dim=1)
+
+    tag = tags[predicted.item()]
+
+    probs = torch.softmax(output, dim=1)
+    prob = probs[0][predicted.item()]
+    if prob.item() > 0.75:
+        for intent in intents['intents']:
+            if tag == intent["tag"]:
+                return random.choice(intent['responses'])
+
+    return "I do not understand..."
+
+
+if __name__ == "__main__":
+    print("Let's chat! (type 'quit' to exit)")
+    while True:
+        # sentence = "do you use credit cards?"
+        sentence = input("You: ")
+        if sentence == "quit":
+            break
+
+        resp = get_response(sentence)
+        print(resp)
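chat.py imports tokenize and bag_of_words from nltk_utils, a module that falls outside this 50-file view. A plausible sketch of what it contains, following the common NLTK-plus-NumPy pattern this kind of chatbot uses; the PorterStemmer choice and exact details are assumptions, not taken from this commit:

import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer  # assumption: stemmer choice not shown in this view

stemmer = PorterStemmer()

def tokenize(sentence):
    # Split a sentence into word/punctuation tokens (uses the punkt data shipped in this repo).
    return nltk.word_tokenize(sentence)

def stem(word):
    # Lowercase and stem so "Organize" and "organizing" map to the same vocabulary entry.
    return stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, words):
    # Return a float32 vector with 1.0 wherever a vocabulary word appears
    # (stemmed) in the sentence -- the NumPy array chat.py reshapes and
    # converts with torch.from_numpy.
    sentence_words = [stem(word) for word in tokenized_sentence]
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag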
data.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e72a22849441369c25180e1ca209ce65e2921c2aea6d01a9c6c06cf50e6004bd
+size 5290
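data.pth is stored through Git LFS, so only the pointer appears in the diff. The 5.3 kB checkpoint behind it must hold the dictionary that chat.py unpacks. A hedged sketch of how a training script (not included in this view) would typically write it:

import torch

# Hypothetical variables -- the real values come from a training run not shown here.
data = {
    "model_state": model.state_dict(),  # weights of the trained NeuralNet
    "input_size": input_size,           # len(all_words), the bag-of-words width
    "hidden_size": hidden_size,
    "output_size": output_size,         # number of intent tags
    "all_words": all_words,             # stemmed vocabulary
    "tags": tags,                       # intent tag names from intents.json
}
torch.save(data, "data.pth")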
intents.json ADDED
@@ -0,0 +1,102 @@
+{
+    "intents": [
+        {
+            "tag": "greeting",
+            "patterns": [
+                "Hi",
+                "Hello",
+                "Hey",
+                "Hi there"
+            ],
+            "responses": [
+                "Hello! How can I help you today?",
+                "Hi there! What can I do for you?"
+            ]
+        },
+        {
+            "tag": "product_inquiry",
+            "patterns": [
+                "Can you tell me about your products?",
+                "What products do you sell?",
+                "Tell me about your products"
+            ],
+            "responses": [
+                "We offer a variety of products. Please visit our website for more details.",
+                "You can find all our products listed on our website."
+            ]
+        },
+        {
+            "tag": "order_status",
+            "patterns": [
+                "Where is my order?",
+                "Can you check my order status?",
+                "I want to know my order status"
+            ],
+            "responses": [
+                "Please provide your order number, and I will check the status for you.",
+                "Let me know your order number to check the status."
+            ]
+        },
+        {
+            "tag": "return_policy",
+            "patterns": [
+                "What is your return policy?",
+                "How do I return a product?",
+                "Can I return a product?"
+            ],
+            "responses": [
+                "You can return products within 30 days of purchase. Visit our website for more details.",
+                "Our return policy allows returns within 30 days. Please check our website for the process."
+            ]
+        },
+        {
+            "tag": "shipping_info",
+            "patterns": [
+                "How long does shipping take?",
+                "What are your shipping options?",
+                "Tell me about your shipping"
+            ],
+            "responses": [
+                "Shipping usually takes 3-5 business days. We also offer express shipping.",
+                "Standard shipping takes 3-5 days. Express shipping is also available."
+            ]
+        },
+        {
+            "tag": "contact_support",
+            "patterns": [
+                "How do I contact support?",
+                "I need help from customer support",
+                "Can you give me your support contact?"
+            ],
+            "responses": [
+                "You can contact our support team at support@mail.com or call +123456789.",
+                "Reach out to our support team via email at support@mail.com or phone at +123456789."
+            ]
+        },
+        {
+            "tag": "business_hours",
+            "patterns": [
+                "What are your business hours?",
+                "When are you open?",
+                "Tell me your working hours"
+            ],
+            "responses": [
+                "Our business hours are Monday to Friday, 9 AM to 5 PM.",
+                "We are open from Monday to Friday, 9 AM to 5 PM."
+            ]
+        },
+        {
+            "tag": "goodbye",
+            "patterns": [
+                "Bye",
+                "Goodbye",
+                "Thanks for your help"
+            ],
+            "responses": [
+                "Goodbye! Have a great day!",
+                "You're welcome! Bye!"
+            ]
+        }
+    ]
+}
+
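Each intent pairs a tag with example patterns and canned responses; get_response in chat.py looks the predicted tag up in this file. An illustrative sanity check that the file exposes the keys chat.py depends on:

import json

with open("intents.json") as f:
    intents = json.load(f)

# chat.py iterates intents["intents"] and reads "tag" and "responses";
# training additionally needs "patterns".
for intent in intents["intents"]:
    assert {"tag", "patterns", "responses"} <= intent.keys()
    print(f'{intent["tag"]}: {len(intent["patterns"])} patterns')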
model.py ADDED
@@ -0,0 +1,19 @@
+import torch
+import torch.nn as nn
+
+class NeuralNet(nn.Module):
+    def __init__(self, input_size, hidden_size, num_classes):
+        super(NeuralNet, self).__init__()
+        self.l1 = nn.Linear(input_size, hidden_size)
+        self.l2 = nn.Linear(hidden_size, hidden_size)
+        self.l3 = nn.Linear(hidden_size, num_classes)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.l1(x)
+        out = self.relu(out)
+        out = self.l2(out)
+        out = self.relu(out)
+        out = self.l3(out)
+
+        return out
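NeuralNet is a plain two-hidden-layer feed-forward classifier that returns raw logits; chat.py applies the softmax afterwards. A quick shape check with placeholder sizes (the real dimensions live in data.pth):

import torch
from model import NeuralNet

# Placeholder dimensions -- chat.py reads the real values from data.pth.
net = NeuralNet(input_size=54, hidden_size=8, num_classes=8)
x = torch.rand(1, 54)   # one bag-of-words vector, shape (batch, input_size)
logits = net(x)         # raw class scores, softmaxed later in chat.py
print(logits.shape)     # torch.Size([1, 8])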
nltk_data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+size 13905355
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB)
 
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+
+For information about how to use these models, please confer the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+There are pretrained tokenizers for the following languages:
+
+File               Language    Source                            Contents                      Size of training corpus(in tokens)   Model contributed by
+=========================================================================================================================================================
+czech.pickle       Czech       Multilingual Corpus 1 (ECI)       Lidove Noviny                 ~345,000                             Jan Strunk / Tibor Kiss
+                                                                 Literarni Noviny
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle      Danish      Avisdata CD-Rom Ver. 1.1. 1995    Berlingske Tidende            ~550,000                             Jan Strunk / Tibor Kiss
+                               (Berlingske Avisdata, Copenhagen) Weekend Avisen
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle       Dutch       Multilingual Corpus 1 (ECI)       De Limburger                  ~340,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle     English     Penn Treebank (LDC)               Wall Street Journal           ~469,000                             Jan Strunk / Tibor Kiss
+                   (American)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle    Estonian    University of Tartu, Estonia      Eesti Ekspress                ~359,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle     Finnish     Finnish Parole Corpus, Finnish    Books and major national      ~364,000                             Jan Strunk / Tibor Kiss
+                               Text Bank (Suomen Kielen          newspapers
+                               Tekstipankki)
+                               Finnish Center for IT Science
+                               (CSC)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle      French      Multilingual Corpus 1 (ECI)       Le Monde                      ~370,000                             Jan Strunk / Tibor Kiss
+                   (European)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle      German      Neue Zürcher Zeitung AG           Neue Zürcher Zeitung          ~847,000                             Jan Strunk / Tibor Kiss
+                               (Switzerland)                     CD-ROM
+                                                                 (Uses "ss"
+                                                                 instead of "ß")
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle       Greek       Efstathios Stamatatos             To Vima (TO BHMA)             ~227,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle     Italian     Multilingual Corpus 1 (ECI)       La Stampa, Il Mattino         ~312,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle   Norwegian   Centre for Humanities             Bergens Tidende               ~479,000                             Jan Strunk / Tibor Kiss
+                   (Bokmål and Information Technologies,
+                   Nynorsk)    Bergen
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle      Polish      Polish National Corpus            Literature, newspapers, etc.  ~1,000,000                           Krzysztof Langner
+                               (http://www.nkjp.pl/)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle  Portuguese  CETENFolha Corpus                 Folha de São Paulo            ~321,000                             Jan Strunk / Tibor Kiss
+                   (Brazilian) (Linguateca)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle     Slovene     TRACTOR                           Delo                          ~354,000                             Jan Strunk / Tibor Kiss
+                               Slovene Academy for Arts
+                               and Sciences
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle     Spanish     Multilingual Corpus 1 (ECI)       Sur                           ~353,000                             Jan Strunk / Tibor Kiss
+                   (European)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle     Swedish     Multilingual Corpus 1 (ECI)       Dagens Nyheter                ~339,000                             Jan Strunk / Tibor Kiss
+                                                                 (and some other texts)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle     Turkish     METU Turkish Corpus               Milliyet                      ~333,000                             Jan Strunk / Tibor Kiss
+                               (Türkçe Derlem Projesi)
+                               University of Ankara
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+
+---- Training Code ----
+
+# import punkt
+import nltk.tokenize.punkt
+
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+# Train tokenizer
+tokenizer.train(text)
+
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+
+---------
nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+size 1017038
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+
+For information about how to use these models, please confer the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+There are pretrained tokenizers for the following languages:
+
+File               Language    Source                            Contents                      Size of training corpus(in tokens)   Model contributed by
+=========================================================================================================================================================
+czech.pickle       Czech       Multilingual Corpus 1 (ECI)       Lidove Noviny                 ~345,000                             Jan Strunk / Tibor Kiss
+                                                                 Literarni Noviny
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle      Danish      Avisdata CD-Rom Ver. 1.1. 1995    Berlingske Tidende            ~550,000                             Jan Strunk / Tibor Kiss
+                               (Berlingske Avisdata, Copenhagen) Weekend Avisen
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle       Dutch       Multilingual Corpus 1 (ECI)       De Limburger                  ~340,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle     English     Penn Treebank (LDC)               Wall Street Journal           ~469,000                             Jan Strunk / Tibor Kiss
+                   (American)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle    Estonian    University of Tartu, Estonia      Eesti Ekspress                ~359,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle     Finnish     Finnish Parole Corpus, Finnish    Books and major national      ~364,000                             Jan Strunk / Tibor Kiss
+                               Text Bank (Suomen Kielen          newspapers
+                               Tekstipankki)
+                               Finnish Center for IT Science
+                               (CSC)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle      French      Multilingual Corpus 1 (ECI)       Le Monde                      ~370,000                             Jan Strunk / Tibor Kiss
+                   (European)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle      German      Neue Zürcher Zeitung AG           Neue Zürcher Zeitung          ~847,000                             Jan Strunk / Tibor Kiss
+                               (Switzerland)                     CD-ROM
+                                                                 (Uses "ss"
+                                                                 instead of "ß")
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle       Greek       Efstathios Stamatatos             To Vima (TO BHMA)             ~227,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle     Italian     Multilingual Corpus 1 (ECI)       La Stampa, Il Mattino         ~312,000                             Jan Strunk / Tibor Kiss
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle   Norwegian   Centre for Humanities             Bergens Tidende               ~479,000                             Jan Strunk / Tibor Kiss
+                   (Bokmål and Information Technologies,
+                   Nynorsk)    Bergen
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle      Polish      Polish National Corpus            Literature, newspapers, etc.  ~1,000,000                           Krzysztof Langner
+                               (http://www.nkjp.pl/)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle  Portuguese  CETENFolha Corpus                 Folha de São Paulo            ~321,000                             Jan Strunk / Tibor Kiss
+                   (Brazilian) (Linguateca)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle     Slovene     TRACTOR                           Delo                          ~354,000                             Jan Strunk / Tibor Kiss
+                               Slovene Academy for Arts
+                               and Sciences
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle     Spanish     Multilingual Corpus 1 (ECI)       Sur                           ~353,000                             Jan Strunk / Tibor Kiss
+                   (European)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle     Swedish     Multilingual Corpus 1 (ECI)       Dagens Nyheter                ~339,000                             Jan Strunk / Tibor Kiss
+                                                                 (and some other texts)
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle     Turkish     METU Turkish Corpus               Milliyet                      ~333,000                             Jan Strunk / Tibor Kiss
+                               (Türkçe Derlem Projesi)
+                               University of Ankara
+---------------------------------------------------------------------------------------------------------------------------------------------------------
+
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+
+---- Training Code ----
+
+# import punkt
+import nltk.tokenize.punkt
+
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+# Train tokenizer
+tokenizer.train(text)
+
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+
+---------
nltk_data/tokenizers/punkt/czech.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
+size 1265552
nltk_data/tokenizers/punkt/danish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
+size 1264725
nltk_data/tokenizers/punkt/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
+size 742624
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+size 433305
nltk_data/tokenizers/punkt/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
+size 1596714
nltk_data/tokenizers/punkt/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
+size 1951656
nltk_data/tokenizers/punkt/french.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
+size 583482
nltk_data/tokenizers/punkt/german.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
+size 1526714
nltk_data/tokenizers/punkt/greek.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
+size 1953106
nltk_data/tokenizers/punkt/italian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
+size 658331
nltk_data/tokenizers/punkt/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207
nltk_data/tokenizers/punkt/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
+size 1259779
nltk_data/tokenizers/punkt/polish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
+size 2042451
nltk_data/tokenizers/punkt/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
+size 649051
nltk_data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+size 33027
nltk_data/tokenizers/punkt/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
+size 832867
nltk_data/tokenizers/punkt/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
+size 597831
nltk_data/tokenizers/punkt/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40d50ebdad6caa87715f2e300b1217ec92c42de205a543cc4a56903bd2c9acfa
+size 1034496
nltk_data/tokenizers/punkt/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3ae47d76501d027698809d12e75292c9c392910488543342802f95db9765ccc
+size 1225013