Virajtharinda commited on
Commit
f418a33
·
verified ·
1 Parent(s): bca570b

Upload 2 files

Browse files
Files changed (2) hide show
  1. SinhalaMLM.ipynb +710 -0
  2. Tokenizer.ipynb +720 -0
SinhalaMLM.ipynb ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "3564591f-25e4-4be6-9104-acec74b5a120",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Collecting datasets\n",
14
+ " Using cached datasets-2.16.1-py3-none-any.whl (507 kB)\n",
15
+ "Collecting pandas\n",
16
+ " Downloading pandas-2.2.0-cp310-cp310-win_amd64.whl (11.6 MB)\n",
17
+ " ---------------------------------------- 0.0/11.6 MB ? eta -:--:--\n",
18
+ " --------------------------------------- 0.0/11.6 MB 640.0 kB/s eta 0:00:19\n",
19
+ " --------------------------------------- 0.0/11.6 MB 640.0 kB/s eta 0:00:19\n",
20
+ " --------------------------------------- 0.0/11.6 MB 195.7 kB/s eta 0:00:59\n",
21
+ " --------------------------------------- 0.1/11.6 MB 381.3 kB/s eta 0:00:31\n",
22
+ " --------------------------------------- 0.1/11.6 MB 435.7 kB/s eta 0:00:27\n",
23
+ " --------------------------------------- 0.1/11.6 MB 399.4 kB/s eta 0:00:29\n",
24
+ " -------------------------------------- 0.2/11.6 MB 591.4 kB/s eta 0:00:20\n",
25
+ " -------------------------------------- 0.2/11.6 MB 602.4 kB/s eta 0:00:19\n",
26
+ " -------------------------------------- 0.2/11.6 MB 602.4 kB/s eta 0:00:19\n",
27
+ " - ------------------------------------- 0.3/11.6 MB 696.7 kB/s eta 0:00:17\n",
28
+ " - ------------------------------------- 0.4/11.6 MB 673.7 kB/s eta 0:00:17\n",
29
+ " - ------------------------------------- 0.4/11.6 MB 636.7 kB/s eta 0:00:18\n",
30
+ " - ------------------------------------- 0.5/11.6 MB 702.9 kB/s eta 0:00:16\n",
31
+ " -- ------------------------------------ 0.6/11.6 MB 893.0 kB/s eta 0:00:13\n",
32
+ " -- ------------------------------------ 0.6/11.6 MB 863.8 kB/s eta 0:00:13\n",
33
+ " -- ------------------------------------ 0.8/11.6 MB 955.9 kB/s eta 0:00:12\n",
34
+ " --- ------------------------------------ 0.9/11.6 MB 1.1 MB/s eta 0:00:10\n",
35
+ " ---- ----------------------------------- 1.2/11.6 MB 1.3 MB/s eta 0:00:09\n",
36
+ " ---- ----------------------------------- 1.2/11.6 MB 1.3 MB/s eta 0:00:08\n",
37
+ " ----- ---------------------------------- 1.5/11.6 MB 1.5 MB/s eta 0:00:07\n",
38
+ " ----- ---------------------------------- 1.6/11.6 MB 1.6 MB/s eta 0:00:07\n",
39
+ " ------ --------------------------------- 1.8/11.6 MB 1.6 MB/s eta 0:00:06\n",
40
+ " ------ --------------------------------- 2.0/11.6 MB 1.7 MB/s eta 0:00:06\n",
41
+ " ------- -------------------------------- 2.2/11.6 MB 1.9 MB/s eta 0:00:06\n",
42
+ " -------- ------------------------------- 2.3/11.6 MB 1.9 MB/s eta 0:00:05\n",
43
+ " -------- ------------------------------- 2.4/11.6 MB 1.9 MB/s eta 0:00:05\n",
44
+ " --------- ------------------------------ 2.6/11.6 MB 1.9 MB/s eta 0:00:05\n",
45
+ " --------- ------------------------------ 2.7/11.6 MB 1.9 MB/s eta 0:00:05\n",
46
+ " --------- ------------------------------ 2.8/11.6 MB 2.0 MB/s eta 0:00:05\n",
47
+ " ---------- ----------------------------- 3.0/11.6 MB 2.0 MB/s eta 0:00:05\n",
48
+ " ---------- ----------------------------- 3.1/11.6 MB 2.0 MB/s eta 0:00:05\n",
49
+ " ---------- ----------------------------- 3.2/11.6 MB 2.0 MB/s eta 0:00:05\n",
50
+ " ----------- ---------------------------- 3.3/11.6 MB 2.0 MB/s eta 0:00:05\n",
51
+ " ----------- ---------------------------- 3.3/11.6 MB 2.0 MB/s eta 0:00:05\n",
52
+ " ----------- ---------------------------- 3.4/11.6 MB 2.0 MB/s eta 0:00:05\n",
53
+ " ------------ --------------------------- 3.5/11.6 MB 1.9 MB/s eta 0:00:05\n",
54
+ " ------------ --------------------------- 3.6/11.6 MB 2.0 MB/s eta 0:00:05\n",
55
+ " ------------- -------------------------- 3.9/11.6 MB 2.0 MB/s eta 0:00:04\n",
56
+ " ------------- -------------------------- 4.0/11.6 MB 2.1 MB/s eta 0:00:04\n",
57
+ " -------------- ------------------------- 4.2/11.6 MB 2.1 MB/s eta 0:00:04\n",
58
+ " -------------- ------------------------- 4.3/11.6 MB 2.1 MB/s eta 0:00:04\n",
59
+ " --------------- ------------------------ 4.6/11.6 MB 2.2 MB/s eta 0:00:04\n",
60
+ " ---------------- ----------------------- 4.8/11.6 MB 2.2 MB/s eta 0:00:04\n",
61
+ " ---------------- ----------------------- 4.9/11.6 MB 2.2 MB/s eta 0:00:03\n",
62
+ " ------------------ --------------------- 5.3/11.6 MB 2.4 MB/s eta 0:00:03\n",
63
+ " ------------------- -------------------- 5.7/11.6 MB 2.5 MB/s eta 0:00:03\n",
64
+ " -------------------- ------------------- 5.8/11.6 MB 2.5 MB/s eta 0:00:03\n",
65
+ " -------------------- ------------------- 5.9/11.6 MB 2.5 MB/s eta 0:00:03\n",
66
+ " -------------------- ------------------- 6.1/11.6 MB 2.5 MB/s eta 0:00:03\n",
67
+ " --------------------- ------------------ 6.3/11.6 MB 2.5 MB/s eta 0:00:03\n",
68
+ " --------------------- ------------------ 6.3/11.6 MB 2.5 MB/s eta 0:00:03\n",
69
+ " ---------------------- ----------------- 6.5/11.6 MB 2.5 MB/s eta 0:00:03\n",
70
+ " ----------------------- ---------------- 6.7/11.6 MB 2.5 MB/s eta 0:00:02\n",
71
+ " ------------------------ --------------- 7.0/11.6 MB 2.6 MB/s eta 0:00:02\n",
72
+ " ------------------------ --------------- 7.1/11.6 MB 2.6 MB/s eta 0:00:02\n",
73
+ " ------------------------- -------------- 7.4/11.6 MB 2.6 MB/s eta 0:00:02\n",
74
+ " -------------------------- ------------- 7.6/11.6 MB 2.7 MB/s eta 0:00:02\n",
75
+ " --------------------------- ------------ 7.9/11.6 MB 2.7 MB/s eta 0:00:02\n",
76
+ " ---------------------------- ----------- 8.1/11.6 MB 2.8 MB/s eta 0:00:02\n",
77
+ " ---------------------------- ----------- 8.3/11.6 MB 2.8 MB/s eta 0:00:02\n",
78
+ " ----------------------------- ---------- 8.6/11.6 MB 2.8 MB/s eta 0:00:02\n",
79
+ " ------------------------------ --------- 8.8/11.6 MB 2.8 MB/s eta 0:00:01\n",
80
+ " ------------------------------- -------- 9.1/11.6 MB 2.9 MB/s eta 0:00:01\n",
81
+ " -------------------------------- ------- 9.3/11.6 MB 2.9 MB/s eta 0:00:01\n",
82
+ " -------------------------------- ------- 9.4/11.6 MB 2.9 MB/s eta 0:00:01\n",
83
+ " -------------------------------- ------- 9.5/11.6 MB 2.9 MB/s eta 0:00:01\n",
84
+ " --------------------------------- ------ 9.6/11.6 MB 2.9 MB/s eta 0:00:01\n",
85
+ " --------------------------------- ------ 9.8/11.6 MB 2.9 MB/s eta 0:00:01\n",
86
+ " ---------------------------------- ----- 10.1/11.6 MB 3.0 MB/s eta 0:00:01\n",
87
+ " ----------------------------------- ---- 10.3/11.6 MB 3.1 MB/s eta 0:00:01\n",
88
+ " ------------------------------------ --- 10.5/11.6 MB 3.3 MB/s eta 0:00:01\n",
89
+ " ------------------------------------ --- 10.7/11.6 MB 3.5 MB/s eta 0:00:01\n",
90
+ " ------------------------------------- -- 10.9/11.6 MB 3.5 MB/s eta 0:00:01\n",
91
+ " -------------------------------------- - 11.0/11.6 MB 3.6 MB/s eta 0:00:01\n",
92
+ " --------------------------------------- 11.3/11.6 MB 3.6 MB/s eta 0:00:01\n",
93
+ " --------------------------------------- 11.5/11.6 MB 3.7 MB/s eta 0:00:01\n",
94
+ " --------------------------------------- 11.6/11.6 MB 3.6 MB/s eta 0:00:01\n",
95
+ " ---------------------------------------- 11.6/11.6 MB 3.6 MB/s eta 0:00:00\n",
96
+ "Collecting filelock\n",
97
+ " Using cached filelock-3.13.1-py3-none-any.whl (11 kB)\n",
98
+ "Requirement already satisfied: requests>=2.19.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (2.31.0)\n",
99
+ "Collecting multiprocess\n",
100
+ " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
101
+ " ---------------------------------------- 0.0/134.8 kB ? eta -:--:--\n",
102
+ " ---------------------------------- --- 122.9/134.8 kB 3.5 MB/s eta 0:00:01\n",
103
+ " -------------------------------------- 134.8/134.8 kB 2.0 MB/s eta 0:00:00\n",
104
+ "Collecting huggingface-hub>=0.19.4\n",
105
+ " Using cached huggingface_hub-0.20.3-py3-none-any.whl (330 kB)\n",
106
+ "Collecting pyarrow>=8.0.0\n",
107
+ " Downloading pyarrow-15.0.0-cp310-cp310-win_amd64.whl (24.8 MB)\n",
108
+ " ---------------------------------------- 0.0/24.8 MB ? eta -:--:--\n",
109
+ " ---------------------------------------- 0.1/24.8 MB 3.3 MB/s eta 0:00:08\n",
110
+ " ---------------------------------------- 0.2/24.8 MB 3.5 MB/s eta 0:00:08\n",
111
+ " --------------------------------------- 0.5/24.8 MB 4.2 MB/s eta 0:00:06\n",
112
+ " - -------------------------------------- 0.6/24.8 MB 4.1 MB/s eta 0:00:06\n",
113
+ " - -------------------------------------- 0.8/24.8 MB 4.2 MB/s eta 0:00:06\n",
114
+ " - -------------------------------------- 1.0/24.8 MB 4.2 MB/s eta 0:00:06\n",
115
+ " - -------------------------------------- 1.2/24.8 MB 4.4 MB/s eta 0:00:06\n",
116
+ " -- ------------------------------------- 1.4/24.8 MB 4.4 MB/s eta 0:00:06\n",
117
+ " -- ------------------------------------- 1.6/24.8 MB 4.5 MB/s eta 0:00:06\n",
118
+ " -- ------------------------------------- 1.8/24.8 MB 4.6 MB/s eta 0:00:05\n",
119
+ " --- ------------------------------------ 2.0/24.8 MB 4.5 MB/s eta 0:00:06\n",
120
+ " --- ------------------------------------ 2.2/24.8 MB 4.6 MB/s eta 0:00:05\n",
121
+ " --- ------------------------------------ 2.3/24.8 MB 4.5 MB/s eta 0:00:06\n",
122
+ " ---- ----------------------------------- 2.6/24.8 MB 4.5 MB/s eta 0:00:05\n",
123
+ " ---- ----------------------------------- 2.7/24.8 MB 4.5 MB/s eta 0:00:05\n",
124
+ " ---- ----------------------------------- 2.8/24.8 MB 4.4 MB/s eta 0:00:06\n",
125
+ " ---- ----------------------------------- 3.0/24.8 MB 4.5 MB/s eta 0:00:05\n",
126
+ " ---- ----------------------------------- 3.1/24.8 MB 4.4 MB/s eta 0:00:05\n",
127
+ " ----- ---------------------------------- 3.3/24.8 MB 4.4 MB/s eta 0:00:05\n",
128
+ " ----- ---------------------------------- 3.4/24.8 MB 4.3 MB/s eta 0:00:05\n",
129
+ " ----- ---------------------------------- 3.7/24.8 MB 4.3 MB/s eta 0:00:05\n",
130
+ " ------ --------------------------------- 3.9/24.8 MB 4.5 MB/s eta 0:00:05\n",
131
+ " ------ --------------------------------- 4.2/24.8 MB 4.5 MB/s eta 0:00:05\n",
132
+ " ------- -------------------------------- 4.4/24.8 MB 4.7 MB/s eta 0:00:05\n",
133
+ " ------- -------------------------------- 4.7/24.8 MB 4.7 MB/s eta 0:00:05\n",
134
+ " ------- -------------------------------- 4.8/24.8 MB 4.6 MB/s eta 0:00:05\n",
135
+ " ------- -------------------------------- 4.9/24.8 MB 4.5 MB/s eta 0:00:05\n",
136
+ " -------- ------------------------------- 5.0/24.8 MB 4.5 MB/s eta 0:00:05\n",
137
+ " -------- ------------------------------- 5.1/24.8 MB 4.4 MB/s eta 0:00:05\n",
138
+ " -------- ------------------------------- 5.2/24.8 MB 4.3 MB/s eta 0:00:05\n",
139
+ " -------- ------------------------------- 5.3/24.8 MB 4.3 MB/s eta 0:00:05\n",
140
+ " -------- ------------------------------- 5.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
141
+ " --------- ------------------------------ 5.7/24.8 MB 4.3 MB/s eta 0:00:05\n",
142
+ " --------- ------------------------------ 5.9/24.8 MB 4.4 MB/s eta 0:00:05\n",
143
+ " --------- ------------------------------ 6.1/24.8 MB 4.3 MB/s eta 0:00:05\n",
144
+ " ---------- ----------------------------- 6.3/24.8 MB 4.4 MB/s eta 0:00:05\n",
145
+ " ---------- ----------------------------- 6.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
146
+ " ---------- ----------------------------- 6.6/24.8 MB 4.3 MB/s eta 0:00:05\n",
147
+ " ---------- ----------------------------- 6.8/24.8 MB 4.3 MB/s eta 0:00:05\n",
148
+ " ----------- ---------------------------- 7.0/24.8 MB 4.3 MB/s eta 0:00:05\n",
149
+ " ----------- ---------------------------- 7.1/24.8 MB 4.3 MB/s eta 0:00:05\n",
150
+ " ----------- ---------------------------- 7.2/24.8 MB 4.3 MB/s eta 0:00:05\n",
151
+ " ----------- ---------------------------- 7.3/24.8 MB 4.2 MB/s eta 0:00:05\n",
152
+ " ------------ --------------------------- 7.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
153
+ " ------------ --------------------------- 7.7/24.8 MB 4.2 MB/s eta 0:00:05\n",
154
+ " ------------ --------------------------- 7.9/24.8 MB 4.3 MB/s eta 0:00:04\n",
155
+ " ------------- -------------------------- 8.1/24.8 MB 4.3 MB/s eta 0:00:04\n",
156
+ " ------------- -------------------------- 8.2/24.8 MB 4.2 MB/s eta 0:00:04\n",
157
+ " ------------- -------------------------- 8.5/24.8 MB 4.3 MB/s eta 0:00:04\n",
158
+ " -------------- ------------------------- 8.7/24.8 MB 4.3 MB/s eta 0:00:04\n",
159
+ " -------------- ------------------------- 8.9/24.8 MB 4.3 MB/s eta 0:00:04\n",
160
+ " -------------- ------------------------- 9.1/24.8 MB 4.3 MB/s eta 0:00:04\n",
161
+ " -------------- ------------------------- 9.2/24.8 MB 4.3 MB/s eta 0:00:04\n",
162
+ " --------------- ------------------------ 9.5/24.8 MB 4.3 MB/s eta 0:00:04\n",
163
+ " --------------- ------------------------ 9.6/24.8 MB 4.3 MB/s eta 0:00:04\n",
164
+ " --------------- ------------------------ 9.9/24.8 MB 4.4 MB/s eta 0:00:04\n",
165
+ " ---------------- ----------------------- 10.0/24.8 MB 4.4 MB/s eta 0:00:04\n",
166
+ " ---------------- ----------------------- 10.3/24.8 MB 4.4 MB/s eta 0:00:04\n",
167
+ " ---------------- ----------------------- 10.4/24.8 MB 4.4 MB/s eta 0:00:04\n",
168
+ " ----------------- ---------------------- 10.6/24.8 MB 4.4 MB/s eta 0:00:04\n",
169
+ " ----------------- ---------------------- 10.7/24.8 MB 4.4 MB/s eta 0:00:04\n",
170
+ " ----------------- ---------------------- 10.9/24.8 MB 4.4 MB/s eta 0:00:04\n",
171
+ " ----------------- ---------------------- 11.1/24.8 MB 4.4 MB/s eta 0:00:04\n",
172
+ " ------------------ --------------------- 11.3/24.8 MB 4.4 MB/s eta 0:00:04\n",
173
+ " ------------------ --------------------- 11.6/24.8 MB 4.4 MB/s eta 0:00:03\n",
174
+ " ------------------ --------------------- 11.6/24.8 MB 4.4 MB/s eta 0:00:03\n",
175
+ " ------------------ --------------------- 11.7/24.8 MB 4.3 MB/s eta 0:00:04\n",
176
+ " ------------------- -------------------- 12.0/24.8 MB 4.4 MB/s eta 0:00:03\n",
177
+ " ------------------- -------------------- 12.2/24.8 MB 4.4 MB/s eta 0:00:03\n",
178
+ " -------------------- ------------------- 12.5/24.8 MB 4.4 MB/s eta 0:00:03\n",
179
+ " -------------------- ------------------- 12.7/24.8 MB 4.5 MB/s eta 0:00:03\n",
180
+ " -------------------- ------------------- 12.9/24.8 MB 4.5 MB/s eta 0:00:03\n",
181
+ " --------------------- ------------------ 13.1/24.8 MB 4.5 MB/s eta 0:00:03\n",
182
+ " --------------------- ------------------ 13.3/24.8 MB 4.5 MB/s eta 0:00:03\n",
183
+ " --------------------- ------------------ 13.5/24.8 MB 4.5 MB/s eta 0:00:03\n",
184
+ " ---------------------- ----------------- 13.7/24.8 MB 4.6 MB/s eta 0:00:03\n",
185
+ " ---------------------- ----------------- 13.8/24.8 MB 4.5 MB/s eta 0:00:03\n",
186
+ " ---------------------- ----------------- 14.0/24.8 MB 4.5 MB/s eta 0:00:03\n",
187
+ " ---------------------- ----------------- 14.2/24.8 MB 4.4 MB/s eta 0:00:03\n",
188
+ " ----------------------- ---------------- 14.3/24.8 MB 4.5 MB/s eta 0:00:03\n",
189
+ " ----------------------- ---------------- 14.5/24.8 MB 4.4 MB/s eta 0:00:03\n",
190
+ " ----------------------- ---------------- 14.7/24.8 MB 4.4 MB/s eta 0:00:03\n",
191
+ " ------------------------ --------------- 15.0/24.8 MB 4.4 MB/s eta 0:00:03\n",
192
+ " ------------------------ --------------- 15.2/24.8 MB 4.5 MB/s eta 0:00:03\n",
193
+ " ------------------------ --------------- 15.4/24.8 MB 4.6 MB/s eta 0:00:03\n",
194
+ " ------------------------- -------------- 15.5/24.8 MB 4.5 MB/s eta 0:00:03\n",
195
+ " ------------------------- -------------- 15.7/24.8 MB 4.5 MB/s eta 0:00:03\n",
196
+ " ------------------------- -------------- 15.9/24.8 MB 4.5 MB/s eta 0:00:02\n",
197
+ " -------------------------- ------------- 16.1/24.8 MB 4.5 MB/s eta 0:00:02\n",
198
+ " -------------------------- ------------- 16.3/24.8 MB 4.5 MB/s eta 0:00:02\n",
199
+ " -------------------------- ------------- 16.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
200
+ " -------------------------- ------------- 16.6/24.8 MB 4.4 MB/s eta 0:00:02\n",
201
+ " --------------------------- ------------ 16.8/24.8 MB 4.5 MB/s eta 0:00:02\n",
202
+ " --------------------------- ------------ 17.0/24.8 MB 4.5 MB/s eta 0:00:02\n",
203
+ " --------------------------- ------------ 17.1/24.8 MB 4.5 MB/s eta 0:00:02\n",
204
+ " --------------------------- ------------ 17.2/24.8 MB 4.4 MB/s eta 0:00:02\n",
205
+ " ---------------------------- ----------- 17.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
206
+ " ---------------------------- ----------- 17.6/24.8 MB 4.5 MB/s eta 0:00:02\n",
207
+ " ---------------------------- ----------- 17.8/24.8 MB 4.5 MB/s eta 0:00:02\n",
208
+ " ----------------------------- ---------- 18.0/24.8 MB 4.5 MB/s eta 0:00:02\n",
209
+ " ----------------------------- ---------- 18.2/24.8 MB 4.5 MB/s eta 0:00:02\n",
210
+ " ----------------------------- ---------- 18.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
211
+ " ----------------------------- ---------- 18.6/24.8 MB 4.4 MB/s eta 0:00:02\n",
212
+ " ------------------------------ --------- 18.7/24.8 MB 4.4 MB/s eta 0:00:02\n",
213
+ " ------------------------------ --------- 18.9/24.8 MB 4.4 MB/s eta 0:00:02\n",
214
+ " ------------------------------ --------- 19.0/24.8 MB 4.3 MB/s eta 0:00:02\n",
215
+ " ------------------------------ --------- 19.2/24.8 MB 4.3 MB/s eta 0:00:02\n",
216
+ " ------------------------------- -------- 19.4/24.8 MB 4.3 MB/s eta 0:00:02\n",
217
+ " ------------------------------- -------- 19.6/24.8 MB 4.3 MB/s eta 0:00:02\n",
218
+ " ------------------------------- -------- 19.8/24.8 MB 4.3 MB/s eta 0:00:02\n",
219
+ " -------------------------------- ------- 20.0/24.8 MB 4.3 MB/s eta 0:00:02\n",
220
+ " -------------------------------- ------- 20.2/24.8 MB 4.3 MB/s eta 0:00:02\n",
221
+ " -------------------------------- ------- 20.4/24.8 MB 4.2 MB/s eta 0:00:02\n",
222
+ " --------------------------------- ------ 20.5/24.8 MB 4.2 MB/s eta 0:00:02\n",
223
+ " --------------------------------- ------ 20.6/24.8 MB 4.2 MB/s eta 0:00:01\n",
224
+ " --------------------------------- ------ 20.7/24.8 MB 4.1 MB/s eta 0:00:01\n",
225
+ " --------------------------------- ------ 20.8/24.8 MB 4.1 MB/s eta 0:00:01\n",
226
+ " --------------------------------- ------ 21.0/24.8 MB 4.1 MB/s eta 0:00:01\n",
227
+ " ---------------------------------- ----- 21.1/24.8 MB 4.1 MB/s eta 0:00:01\n",
228
+ " ---------------------------------- ----- 21.3/24.8 MB 4.1 MB/s eta 0:00:01\n",
229
+ " ---------------------------------- ----- 21.4/24.8 MB 4.1 MB/s eta 0:00:01\n",
230
+ " ---------------------------------- ----- 21.4/24.8 MB 4.1 MB/s eta 0:00:01\n",
231
+ " ---------------------------------- ----- 21.6/24.8 MB 4.0 MB/s eta 0:00:01\n",
232
+ " ----------------------------------- ---- 21.8/24.8 MB 4.0 MB/s eta 0:00:01\n",
233
+ " ----------------------------------- ---- 22.0/24.8 MB 4.0 MB/s eta 0:00:01\n",
234
+ " ----------------------------------- ---- 22.1/24.8 MB 4.0 MB/s eta 0:00:01\n",
235
+ " ----------------------------------- ---- 22.3/24.8 MB 4.0 MB/s eta 0:00:01\n",
236
+ " ------------------------------------ --- 22.5/24.8 MB 3.9 MB/s eta 0:00:01\n",
237
+ " ------------------------------------ --- 22.6/24.8 MB 3.9 MB/s eta 0:00:01\n",
238
+ " ------------------------------------ --- 22.7/24.8 MB 3.9 MB/s eta 0:00:01\n",
239
+ " ------------------------------------ --- 22.8/24.8 MB 3.8 MB/s eta 0:00:01\n",
240
+ " ------------------------------------ --- 22.9/24.8 MB 3.8 MB/s eta 0:00:01\n",
241
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
242
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
243
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
244
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
245
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
246
+ " ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
247
+ " ------------------------------------- -- 23.1/24.8 MB 3.5 MB/s eta 0:00:01\n",
248
+ " ------------------------------------- -- 23.2/24.8 MB 3.5 MB/s eta 0:00:01\n",
249
+ " ------------------------------------- -- 23.4/24.8 MB 3.5 MB/s eta 0:00:01\n",
250
+ " ------------------------------------- -- 23.5/24.8 MB 3.4 MB/s eta 0:00:01\n",
251
+ " ------------------------------------- -- 23.5/24.8 MB 3.4 MB/s eta 0:00:01\n",
252
+ " -------------------------------------- - 23.6/24.8 MB 3.4 MB/s eta 0:00:01\n",
253
+ " -------------------------------------- - 23.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
254
+ " -------------------------------------- - 24.0/24.8 MB 3.4 MB/s eta 0:00:01\n",
255
+ " --------------------------------------- 24.2/24.8 MB 3.4 MB/s eta 0:00:01\n",
256
+ " --------------------------------------- 24.4/24.8 MB 3.4 MB/s eta 0:00:01\n",
257
+ " --------------------------------------- 24.6/24.8 MB 3.4 MB/s eta 0:00:01\n",
258
+ " --------------------------------------- 24.7/24.8 MB 3.4 MB/s eta 0:00:01\n",
259
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
260
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
261
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
262
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
263
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
264
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
265
+ " --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
266
+ " ---------------------------------------- 24.8/24.8 MB 3.1 MB/s eta 0:00:00\n",
267
+ "Collecting fsspec[http]<=2023.10.0,>=2023.1.0\n",
268
+ " Using cached fsspec-2023.10.0-py3-none-any.whl (166 kB)\n",
269
+ "Collecting aiohttp\n",
270
+ " Downloading aiohttp-3.9.1-cp310-cp310-win_amd64.whl (364 kB)\n",
271
+ " ---------------------------------------- 0.0/364.6 kB ? eta -:--:--\n",
272
+ " ----------- -------------------------- 112.6/364.6 kB 3.2 MB/s eta 0:00:01\n",
273
+ " --------------------- ---------------- 204.8/364.6 kB 3.1 MB/s eta 0:00:01\n",
274
+ " ----------------------------------- -- 337.9/364.6 kB 3.0 MB/s eta 0:00:01\n",
275
+ " -------------------------------------- 364.6/364.6 kB 2.8 MB/s eta 0:00:00\n",
276
+ "Collecting numpy>=1.17\n",
277
+ " Downloading numpy-1.26.3-cp310-cp310-win_amd64.whl (15.8 MB)\n",
278
+ " ---------------------------------------- 0.0/15.8 MB ? eta -:--:--\n",
279
+ " --------------------------------------- 0.2/15.8 MB 4.6 MB/s eta 0:00:04\n",
280
+ " --------------------------------------- 0.3/15.8 MB 4.1 MB/s eta 0:00:04\n",
281
+ " - -------------------------------------- 0.5/15.8 MB 3.9 MB/s eta 0:00:04\n",
282
+ " - -------------------------------------- 0.7/15.8 MB 3.8 MB/s eta 0:00:04\n",
283
+ " -- ------------------------------------- 0.8/15.8 MB 3.6 MB/s eta 0:00:05\n",
284
+ " -- ------------------------------------- 1.0/15.8 MB 3.9 MB/s eta 0:00:04\n",
285
+ " -- ------------------------------------- 1.1/15.8 MB 3.8 MB/s eta 0:00:04\n",
286
+ " --- ------------------------------------ 1.3/15.8 MB 3.7 MB/s eta 0:00:04\n",
287
+ " --- ------------------------------------ 1.4/15.8 MB 3.8 MB/s eta 0:00:04\n",
288
+ " --- ------------------------------------ 1.6/15.8 MB 3.8 MB/s eta 0:00:04\n",
289
+ " ---- ----------------------------------- 1.7/15.8 MB 3.8 MB/s eta 0:00:04\n",
290
+ " ---- ----------------------------------- 1.9/15.8 MB 3.7 MB/s eta 0:00:04\n",
291
+ " ----- ---------------------------------- 2.0/15.8 MB 3.7 MB/s eta 0:00:04\n",
292
+ " ----- ---------------------------------- 2.1/15.8 MB 3.6 MB/s eta 0:00:04\n",
293
+ " ----- ---------------------------------- 2.3/15.8 MB 3.6 MB/s eta 0:00:04\n",
294
+ " ------ --------------------------------- 2.5/15.8 MB 3.7 MB/s eta 0:00:04\n",
295
+ " ------ --------------------------------- 2.6/15.8 MB 3.7 MB/s eta 0:00:04\n",
296
+ " ------- -------------------------------- 2.8/15.8 MB 3.7 MB/s eta 0:00:04\n",
297
+ " ------- -------------------------------- 3.0/15.8 MB 3.8 MB/s eta 0:00:04\n",
298
+ " -------- ------------------------------- 3.2/15.8 MB 3.9 MB/s eta 0:00:04\n",
299
+ " -------- ------------------------------- 3.4/15.8 MB 3.9 MB/s eta 0:00:04\n",
300
+ " --------- ------------------------------ 3.6/15.8 MB 3.9 MB/s eta 0:00:04\n",
301
+ " --------- ------------------------------ 3.8/15.8 MB 4.0 MB/s eta 0:00:04\n",
302
+ " ---------- ----------------------------- 4.0/15.8 MB 4.0 MB/s eta 0:00:03\n",
303
+ " ---------- ----------------------------- 4.2/15.8 MB 4.0 MB/s eta 0:00:03\n",
304
+ " ----------- ---------------------------- 4.4/15.8 MB 4.1 MB/s eta 0:00:03\n",
305
+ " ----------- ---------------------------- 4.6/15.8 MB 4.1 MB/s eta 0:00:03\n",
306
+ " ------------ --------------------------- 4.8/15.8 MB 4.2 MB/s eta 0:00:03\n",
307
+ " ------------- -------------------------- 5.1/15.8 MB 4.3 MB/s eta 0:00:03\n",
308
+ " ------------- -------------------------- 5.2/15.8 MB 4.2 MB/s eta 0:00:03\n",
309
+ " ------------- -------------------------- 5.4/15.8 MB 4.1 MB/s eta 0:00:03\n",
310
+ " -------------- ------------------------- 5.6/15.8 MB 4.2 MB/s eta 0:00:03\n",
311
+ " -------------- ------------------------- 5.8/15.8 MB 4.2 MB/s eta 0:00:03\n",
312
+ " --------------- ------------------------ 6.1/15.8 MB 4.3 MB/s eta 0:00:03\n",
313
+ " --------------- ------------------------ 6.3/15.8 MB 4.3 MB/s eta 0:00:03\n",
314
+ " ---------------- ----------------------- 6.5/15.8 MB 4.3 MB/s eta 0:00:03\n",
315
+ " ---------------- ----------------------- 6.6/15.8 MB 4.3 MB/s eta 0:00:03\n",
316
+ " ----------------- ---------------------- 6.7/15.8 MB 4.3 MB/s eta 0:00:03\n",
317
+ " ----------------- ---------------------- 6.7/15.8 MB 4.2 MB/s eta 0:00:03\n",
318
+ " ----------------- ---------------------- 7.0/15.8 MB 4.2 MB/s eta 0:00:03\n",
319
+ " ------------------ --------------------- 7.1/15.8 MB 4.2 MB/s eta 0:00:03\n",
320
+ " ------------------ --------------------- 7.2/15.8 MB 4.2 MB/s eta 0:00:03\n",
321
+ " ------------------ --------------------- 7.5/15.8 MB 4.2 MB/s eta 0:00:02\n",
322
+ " ------------------- -------------------- 7.7/15.8 MB 4.2 MB/s eta 0:00:02\n",
323
+ " -------------------- ------------------- 7.9/15.8 MB 4.2 MB/s eta 0:00:02\n",
324
+ " -------------------- ------------------- 8.0/15.8 MB 4.2 MB/s eta 0:00:02\n",
325
+ " -------------------- ------------------- 8.2/15.8 MB 4.2 MB/s eta 0:00:02\n",
326
+ " --------------------- ------------------ 8.3/15.8 MB 4.2 MB/s eta 0:00:02\n",
327
+ " --------------------- ------------------ 8.4/15.8 MB 4.2 MB/s eta 0:00:02\n",
328
+ " --------------------- ------------------ 8.6/15.8 MB 4.2 MB/s eta 0:00:02\n",
329
+ " ---------------------- ----------------- 8.7/15.8 MB 4.1 MB/s eta 0:00:02\n",
330
+ " ---------------------- ----------------- 8.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
331
+ " ----------------------- ---------------- 9.1/15.8 MB 4.2 MB/s eta 0:00:02\n",
332
+ " ----------------------- ---------------- 9.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
333
+ " ----------------------- ---------------- 9.5/15.8 MB 4.1 MB/s eta 0:00:02\n",
334
+ " ------------------------ --------------- 9.6/15.8 MB 4.1 MB/s eta 0:00:02\n",
335
+ " ------------------------ --------------- 9.8/15.8 MB 4.1 MB/s eta 0:00:02\n",
336
+ " ------------------------- -------------- 9.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
337
+ " ------------------------- -------------- 10.1/15.8 MB 4.2 MB/s eta 0:00:02\n",
338
+ " -------------------------- ------------- 10.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
339
+ " -------------------------- ------------- 10.4/15.8 MB 4.1 MB/s eta 0:00:02\n",
340
+ " -------------------------- ------------- 10.5/15.8 MB 4.1 MB/s eta 0:00:02\n",
341
+ " -------------------------- ------------- 10.7/15.8 MB 4.1 MB/s eta 0:00:02\n",
342
+ " --------------------------- ------------ 10.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
343
+ " --------------------------- ------------ 10.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
344
+ " ---------------------------- ----------- 11.1/15.8 MB 4.1 MB/s eta 0:00:02\n",
345
+ " ---------------------------- ----------- 11.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
346
+ " ---------------------------- ----------- 11.4/15.8 MB 4.1 MB/s eta 0:00:02\n",
347
+ " ----------------------------- ---------- 11.6/15.8 MB 4.1 MB/s eta 0:00:02\n",
348
+ " ----------------------------- ---------- 11.8/15.8 MB 4.1 MB/s eta 0:00:01\n",
349
+ " ------------------------------ --------- 11.9/15.8 MB 4.1 MB/s eta 0:00:01\n",
350
+ " ------------------------------ --------- 12.1/15.8 MB 4.1 MB/s eta 0:00:01\n",
351
+ " ------------------------------ --------- 12.2/15.8 MB 4.1 MB/s eta 0:00:01\n",
352
+ " ------------------------------- -------- 12.3/15.8 MB 4.1 MB/s eta 0:00:01\n",
353
+ " ------------------------------- -------- 12.3/15.8 MB 4.1 MB/s eta 0:00:01\n",
354
+ " ------------------------------- -------- 12.4/15.8 MB 4.1 MB/s eta 0:00:01\n",
355
+ " ------------------------------- -------- 12.5/15.8 MB 4.0 MB/s eta 0:00:01\n",
356
+ " ------------------------------- -------- 12.6/15.8 MB 4.0 MB/s eta 0:00:01\n",
357
+ " ------------------------------- -------- 12.6/15.8 MB 3.9 MB/s eta 0:00:01\n",
358
+ " -------------------------------- ------- 12.7/15.8 MB 3.9 MB/s eta 0:00:01\n",
359
+ " -------------------------------- ------- 12.9/15.8 MB 3.9 MB/s eta 0:00:01\n",
360
+ " --------------------------------- ------ 13.0/15.8 MB 3.9 MB/s eta 0:00:01\n",
361
+ " --------------------------------- ------ 13.1/15.8 MB 3.9 MB/s eta 0:00:01\n",
362
+ " --------------------------------- ------ 13.3/15.8 MB 3.9 MB/s eta 0:00:01\n",
363
+ " ---------------------------------- ----- 13.5/15.8 MB 3.9 MB/s eta 0:00:01\n",
364
+ " ---------------------------------- ----- 13.6/15.8 MB 3.9 MB/s eta 0:00:01\n",
365
+ " ---------------------------------- ----- 13.8/15.8 MB 3.8 MB/s eta 0:00:01\n",
366
+ " ---------------------------------- ----- 13.8/15.8 MB 3.8 MB/s eta 0:00:01\n",
367
+ " ----------------------------------- ---- 14.0/15.8 MB 3.8 MB/s eta 0:00:01\n",
368
+ " ----------------------------------- ---- 14.2/15.8 MB 3.8 MB/s eta 0:00:01\n",
369
+ " ------------------------------------ --- 14.3/15.8 MB 3.8 MB/s eta 0:00:01\n",
370
+ " ------------------------------------ --- 14.5/15.8 MB 3.7 MB/s eta 0:00:01\n",
371
+ " ------------------------------------ --- 14.6/15.8 MB 3.7 MB/s eta 0:00:01\n",
372
+ " ------------------------------------- -- 14.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
373
+ " ------------------------------------- -- 15.0/15.8 MB 3.7 MB/s eta 0:00:01\n",
374
+ " -------------------------------------- - 15.1/15.8 MB 3.7 MB/s eta 0:00:01\n",
375
+ " -------------------------------------- - 15.3/15.8 MB 3.7 MB/s eta 0:00:01\n",
376
+ " --------------------------------------- 15.4/15.8 MB 3.7 MB/s eta 0:00:01\n",
377
+ " --------------------------------------- 15.6/15.8 MB 3.7 MB/s eta 0:00:01\n",
378
+ " --------------------------------------- 15.7/15.8 MB 3.7 MB/s eta 0:00:01\n",
379
+ " --------------------------------------- 15.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
380
+ " --------------------------------------- 15.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
381
+ " ---------------------------------------- 15.8/15.8 MB 3.5 MB/s eta 0:00:00\n",
382
+ "Collecting tqdm>=4.62.1\n",
383
+ " Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)\n",
384
+ "Collecting xxhash\n",
385
+ " Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl (29 kB)\n",
386
+ "Collecting pyarrow-hotfix\n",
387
+ " Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
388
+ "Collecting dill<0.3.8,>=0.3.0\n",
389
+ " Using cached dill-0.3.7-py3-none-any.whl (115 kB)\n",
390
+ "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (6.0.1)\n",
391
+ "Requirement already satisfied: packaging in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (23.2)\n",
392
+ "Collecting frozenlist>=1.1.1\n",
393
+ " Downloading frozenlist-1.4.1-cp310-cp310-win_amd64.whl (50 kB)\n",
394
+ " ---------------------------------------- 0.0/50.4 kB ? eta -:--:--\n",
395
+ " ---------------------------------------- 50.4/50.4 kB 2.7 MB/s eta 0:00:00\n",
396
+ "Collecting aiosignal>=1.1.2\n",
397
+ " Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
398
+ "Collecting multidict<7.0,>=4.5\n",
399
+ " Downloading multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)\n",
400
+ "Requirement already satisfied: attrs>=17.3.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from aiohttp->datasets) (23.2.0)\n",
401
+ "Collecting yarl<2.0,>=1.0\n",
402
+ " Downloading yarl-1.9.4-cp310-cp310-win_amd64.whl (76 kB)\n",
403
+ " ---------------------------------------- 0.0/76.4 kB ? eta -:--:--\n",
404
+ " ---------------------------------------- 76.4/76.4 kB 2.1 MB/s eta 0:00:00\n",
405
+ "Collecting async-timeout<5.0,>=4.0\n",
406
+ " Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
407
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface-hub>=0.19.4->datasets) (4.9.0)\n",
408
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
409
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (3.6)\n",
410
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (2.1.0)\n",
411
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (2023.11.17)\n",
412
+ "Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.62.1->datasets) (0.4.6)\n",
413
+ "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->datasets) (2.8.2)\n",
414
+ "Collecting pytz>=2020.1\n",
415
+ " Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)\n",
416
+ "Collecting tzdata>=2022.7\n",
417
+ " Using cached tzdata-2023.4-py2.py3-none-any.whl (346 kB)\n",
418
+ "Requirement already satisfied: six>=1.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
419
+ "Installing collected packages: pytz, xxhash, tzdata, tqdm, pyarrow-hotfix, numpy, multidict, fsspec, frozenlist, filelock, dill, async-timeout, yarl, pyarrow, pandas, multiprocess, huggingface-hub, aiosignal, aiohttp, datasets\n",
420
+ "Successfully installed aiohttp-3.9.1 aiosignal-1.3.1 async-timeout-4.0.3 datasets-2.16.1 dill-0.3.7 filelock-3.13.1 frozenlist-1.4.1 fsspec-2023.10.0 huggingface-hub-0.20.3 multidict-6.0.4 multiprocess-0.70.15 numpy-1.26.3 pandas-2.2.0 pyarrow-15.0.0 pyarrow-hotfix-0.6 pytz-2023.3.post1 tqdm-4.66.1 tzdata-2023.4 xxhash-3.4.1 yarl-1.9.4\n"
421
+ ]
422
+ },
423
+ {
424
+ "name": "stderr",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "\n",
428
+ "[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
429
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "!pip install datasets"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 6,
440
+ "id": "bd4b0343-d24f-4ca0-9fcb-94307eaaf732",
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stdout",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "Collecting ipywidgetsNote: you may need to restart the kernel to use updated packages.\n",
448
+ "\n"
449
+ ]
450
+ },
451
+ {
452
+ "name": "stderr",
453
+ "output_type": "stream",
454
+ "text": [
455
+ "\n",
456
+ "[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
457
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
458
+ ]
459
+ },
460
+ {
461
+ "name": "stdout",
462
+ "output_type": "stream",
463
+ "text": [
464
+ " Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB)\n",
465
+ " ---------------------------------------- 0.0/139.4 kB ? eta -:--:--\n",
466
+ " -- ------------------------------------- 10.2/139.4 kB ? eta -:--:--\n",
467
+ " -------- ---------------------------- 30.7/139.4 kB 445.2 kB/s eta 0:00:01\n",
468
+ " -------- ---------------------------- 30.7/139.4 kB 445.2 kB/s eta 0:00:01\n",
469
+ " ---------------- -------------------- 61.4/139.4 kB 299.4 kB/s eta 0:00:01\n",
470
+ " ----------------------------- ------ 112.6/139.4 kB 469.7 kB/s eta 0:00:01\n",
471
+ " ----------------------------- ------ 112.6/139.4 kB 469.7 kB/s eta 0:00:01\n",
472
+ " ------------------------------------ 139.4/139.4 kB 414.0 kB/s eta 0:00:00\n",
473
+ "Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (5.14.1)\n",
474
+ "Requirement already satisfied: comm>=0.1.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (0.2.1)\n",
475
+ "Collecting jupyterlab-widgets~=3.0.9\n",
476
+ " Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl (214 kB)\n",
477
+ " ---------------------------------------- 0.0/214.9 kB ? eta -:--:--\n",
478
+ " ----------- --------------------------- 61.4/214.9 kB 1.7 MB/s eta 0:00:01\n",
479
+ " ------------------- ------------------ 112.6/214.9 kB 2.2 MB/s eta 0:00:01\n",
480
+ " ------------------- ------------------ 112.6/214.9 kB 2.2 MB/s eta 0:00:01\n",
481
+ " ----------------------------- ------ 174.1/214.9 kB 958.1 kB/s eta 0:00:01\n",
482
+ " -------------------------------------- 214.9/214.9 kB 1.0 MB/s eta 0:00:00\n",
483
+ "Requirement already satisfied: ipython>=6.1.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (8.20.0)\n",
484
+ "Collecting widgetsnbextension~=4.0.9\n",
485
+ " Downloading widgetsnbextension-4.0.9-py3-none-any.whl (2.3 MB)\n",
486
+ " ---------------------------------------- 0.0/2.3 MB ? eta -:--:--\n",
487
+ " - -------------------------------------- 0.1/2.3 MB 3.2 MB/s eta 0:00:01\n",
488
+ " ---- ----------------------------------- 0.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
489
+ " ---- ----------------------------------- 0.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
490
+ " ----- ---------------------------------- 0.3/2.3 MB 1.9 MB/s eta 0:00:02\n",
491
+ " -------- ------------------------------- 0.5/2.3 MB 2.0 MB/s eta 0:00:01\n",
492
+ " ------------ --------------------------- 0.7/2.3 MB 2.7 MB/s eta 0:00:01\n",
493
+ " --------------- ------------------------ 0.9/2.3 MB 2.8 MB/s eta 0:00:01\n",
494
+ " ------------------- -------------------- 1.1/2.3 MB 3.0 MB/s eta 0:00:01\n",
495
+ " ---------------------- ----------------- 1.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
496
+ " -------------------------- ------------- 1.5/2.3 MB 3.3 MB/s eta 0:00:01\n",
497
+ " ------------------------------ --------- 1.7/2.3 MB 3.5 MB/s eta 0:00:01\n",
498
+ " -------------------------------- ------- 1.8/2.3 MB 3.3 MB/s eta 0:00:01\n",
499
+ " ----------------------------------- ---- 2.1/2.3 MB 3.4 MB/s eta 0:00:01\n",
500
+ " --------------------------------------- 2.2/2.3 MB 3.5 MB/s eta 0:00:01\n",
501
+ " ---------------------------------------- 2.3/2.3 MB 3.4 MB/s eta 0:00:00\n",
502
+ "Requirement already satisfied: pygments>=2.4.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (2.17.2)\n",
503
+ "Requirement already satisfied: stack-data in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
504
+ "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.43)\n",
505
+ "Requirement already satisfied: matplotlib-inline in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.1.6)\n",
506
+ "Requirement already satisfied: jedi>=0.16 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)\n",
507
+ "Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
508
+ "Requirement already satisfied: decorator in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n",
509
+ "Requirement already satisfied: exceptiongroup in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (1.2.0)\n",
510
+ "Requirement already satisfied: parso<0.9.0,>=0.8.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n",
511
+ "Requirement already satisfied: wcwidth in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
512
+ "Requirement already satisfied: pure-eval in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n",
513
+ "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)\n",
514
+ "Requirement already satisfied: executing>=1.2.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.1)\n",
515
+ "Requirement already satisfied: six>=1.12.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)\n",
516
+ "Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets\n",
517
+ "Successfully installed ipywidgets-8.1.1 jupyterlab-widgets-3.0.9 widgetsnbextension-4.0.9\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "pip install ipywidgets"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "execution_count": 8,
528
+ "id": "9995b754-58c9-4c1d-8e8f-9e55741186c5",
529
+ "metadata": {},
530
+ "outputs": [],
531
+ "source": [
532
+ "import datasets"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 5,
538
+ "id": "87a513e0-f880-4219-9ffb-9a8c40707a69",
539
+ "metadata": {},
540
+ "outputs": [
541
+ {
542
+ "data": {
543
+ "text/plain": [
544
+ "100641"
545
+ ]
546
+ },
547
+ "execution_count": 5,
548
+ "metadata": {},
549
+ "output_type": "execute_result"
550
+ }
551
+ ],
552
+ "source": [
553
+ "all_ds = datasets.list_datasets()\n",
554
+ "len(all_ds)"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": 9,
560
+ "id": "f20dcb4d-bd17-4bf4-839f-3013b4f40c84",
561
+ "metadata": {},
562
+ "outputs": [
563
+ {
564
+ "data": {
565
+ "text/plain": [
566
+ "['acronym_identification',\n",
567
+ " 'ade_corpus_v2',\n",
568
+ " 'adversarial_qa',\n",
569
+ " 'aeslc',\n",
570
+ " 'afrikaans_ner_corpus']"
571
+ ]
572
+ },
573
+ "execution_count": 9,
574
+ "metadata": {},
575
+ "output_type": "execute_result"
576
+ }
577
+ ],
578
+ "source": [
579
+ "all_ds[:5]"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 11,
585
+ "id": "63e676b3-0ce3-462a-b9e3-cc37c2f33703",
586
+ "metadata": {},
587
+ "outputs": [
588
+ {
589
+ "name": "stderr",
590
+ "output_type": "stream",
591
+ "text": [
592
+ "C:\\Users\\Dell\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for oscar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/oscar\n",
593
+ "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
594
+ "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
595
+ " warnings.warn(\n",
596
+ "Downloading builder script: 100%|█████████████████████████████████████████████████| 14.8k/14.8k [00:00<00:00, 14.8MB/s]\n",
597
+ "Downloading readme: 100%|████████████████████████████████████████████████████████████| 303k/303k [00:00<00:00, 412kB/s]\n",
598
+ "Downloading data: 100%|█████████████████████████████████████████████████████████████████████| 81.0/81.0 [00:00<?, ?B/s]\n",
599
+ "Downloading data: 100%|█████████████████████████████████████████████████████████████| 176M/176M [00:54<00:00, 3.21MB/s]\n",
600
+ "Generating train split: 100%|█████████████████████████████████████████| 120684/120684 [00:15<00:00, 7824.51 examples/s]\n"
601
+ ]
602
+ }
603
+ ],
604
+ "source": [
605
+ "dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_si')"
606
+ ]
607
+ },
608
+ {
609
+ "cell_type": "code",
610
+ "execution_count": 12,
611
+ "id": "b6593b70-5caa-4da2-92bb-594e67bd964c",
612
+ "metadata": {},
613
+ "outputs": [
614
+ {
615
+ "data": {
616
+ "text/plain": [
617
+ "DatasetDict({\n",
618
+ " train: Dataset({\n",
619
+ " features: ['id', 'text'],\n",
620
+ " num_rows: 120684\n",
621
+ " })\n",
622
+ "})"
623
+ ]
624
+ },
625
+ "execution_count": 12,
626
+ "metadata": {},
627
+ "output_type": "execute_result"
628
+ }
629
+ ],
630
+ "source": [
631
+ "dataset"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 13,
637
+ "id": "d80f383a-3928-48cb-9218-50b2e6a5577b",
638
+ "metadata": {},
639
+ "outputs": [
640
+ {
641
+ "data": {
642
+ "text/plain": [
643
+ "{'id': 0,\n",
644
+ " 'text': 'ජනාධිපතිවරයා දුෂණයට ඔතරම් විරුද්ධ නම් ශ්\\u200dරී ලංකා ටෙලිකොම් – මොබිටෙල් සඳහා ජනාධිපති විමර්ශන කොමිසමක් පත්කළ යුතු බව එම ආයතනයේ සේවකයින් සඳහන් කරයි.\\nඒ මේ වන විට එම ආයතනය බරපතල සැක සහිත ගනුදෙනු වලින් පිරීගත් අන්ත දුෂිත මෙන්ම ඉතිහාසයේ කිසිදා නොවූවිරූ පාඩුවක් ලබන තැනට පත්ව ඇති නිසාය.\\nවාර්තා වන තොරතුරු අතුරින් ශ්\\u200dරී ලංකා ටෙලිකොම් – මොබිටෙල් හී සිදුව තිබෙන එවැනි අතිශය සැකසහිත ගනුදෙනුවක් වන්නේ Mobitel – stage 8 (upgrade) ටෙන්ඩරය Huawei සමාගමට සහ ZTE සමාගමට ලබාදී ඇති ආකාරයයි.\\nHuawei සමාගමට ලබාදී ඇති ටෙන්ඩරයේ වටිනාකම ඇමරිකානු ඩොලර් මිලියන 79 ක් වන අතර ZTE සමාගමට ලබාදී ඇති ටෙන්ඩරයේ වටිනාකම ඇමරිකානු ඩොලර් මිලියන 6 කි.\\nමෙහි ඇති ගැටළුව වන්නේ ඇමරිකානු ඩොලර් මිලියන 85ක් තරම් ඇති විශාල වටිනාකමකින් යුතු අදාළ ගනුදෙනුව සිදුකිරීමේදී කිසිඳු ටෙන්ඩර් පටිපාටියක් අනුගමනය කර නොතිබීමය. මෙවැනි අධික වැයක් දරන්නට සිදුවන ගනුදෙනුවකදී ඒ සඳහා කැබිනට් අනුමැතිය මෙන්ම විධිමත් ප්\\u200dරසම්පාදන ක්\\u200dරියාවලියක් තුළ සිදුකිරීම නෛතික අවශ්\\u200dයතාවක් වුවත් ඒ ගැන කිසිඳු තැකීමක් ටෙලිකොම්- මොබිටෙල් පරිපාලනය සිදුකර නොමැත.\\nසේවක පිරිස් සඳහන් කරන ආකාරයට මෙම ගනුදෙනුව මුළුමනින්ම පසුගිය වසරේ සිදුකොට ඇත්තේ ටෙලිකොම් සභාපති කුමාරසිංහ සිරිසේනගේ මැදිහත්වීම මතය.\\nඑසේම මෙම ගනුදෙනුව තවදුරටත් සැක උපදවන කාරණයක් වන්නේ පසුගිය සිංහල – හින්දු අවුරුදු සමයේ එනම් අප්\\u200dරේල් 16 සිට 22 දක්වා පී.ජී කුමාරසිංහගේ සමස්ථ පවුලටම චීන සංචාරයක් සඳහා Huawei සමාගම අනුග්\\u200dරහය දක්වා ඇති බවද සේවක පිරිස් එල්ල කරන චෝදනාවය.\\nඑපමණක් නොව මේ වන විට ශ්\\u200dරී ලංකා ටෙලිකොම් හී IT අංශය නංවාලීම සඳහා කියමින් Virtusa සමාගමට ලබාදීමේ සුදානමක් පවතින බවද වාර්තා වෙයි. ඒ පිළිබඳ සේවක පිරිස් අනාවරණය කරමින් සඳහන් කරන්නේ IT අංශයේ සමස්ථ ක්\\u200dරියාවලිය වසරකට රුපියල් බිලියන 1ක් ගෙවා අදාළ සමගම හරහා සිදුකර ගැනීමේ අපේක්ෂාවෙන් මෙම කටයුත්ත දියත්කර ඇති බවය.\\nඑහිදීද කිසිඳු ටෙන්ඩර් කැඳවීමක් හෝ විධිමත් ප්\\u200dරසම්පාදන ක්\\u200dරියාවලියක් සිදුව නොමැති බවද ඔවුන් සඳහන් කරයි. 
එපමණක් නොව ශ්\\u200dරී ලංකා ටෙලිකොම් හී හදවත බඳු IT අංශය පෞද්ගලික සමාගමක් හරහා කරවා ගැනීමට යාම තුළ සමස්ත ආයතන පද්ධතියම අවධානමේ වැටීමක් සිදුවන බවද ඔවුන් පෙන්වා දෙයි.\\nමේ පිළිබඳ තොරතුරු අනාවරණය කරන පාර්ශව සඳහන් කර සිටින්නේ මේ කිසිඳු සැකකටයුතු ගනුදෙනුවක් සම්බන්ධව මාධ්\\u200dය හෝ වගකිවයුතු පාර්ශව කතා නොකරන බවත් ශ්\\u200dරී ලංකා ටෙලිකොම් සභාපති පී.ජී.කුමාරසිංහ , ජනාධිපති මෛත්\\u200dරීපාල සිරිසේනගේ සහෝදරයාවීම ඊට හේතුවී ඇති බව තමන් විශ්වාස කරන බවය.\\nඑසේම වත්මන් සභාපතිවරයා යටතේ ශ්\\u200dරී ලංකා ටෙලිකොම් හී කඩාවැටීම පෙන්වා දෙමින් තොරතුරු මුලාශ්\\u200dර සඳහන් කරන්නේ මේ වන විට රුපියල් බිලියන 35 කට අධික ණයක් සහිතව ආයතනය මුල්\\u200dය අර්බුදයකට ගමන් කර ඇති බවත් ආයතනය පවත්වාගෙන යන්නේ මහජන බැංකුවෙන් ලබාගත් බැංකු අයිරාවකින් බවය.\\nඑහෙත් වත්මන් ආණ්ඩුව බලයට පත්වන විට ශ්\\u200dරී ටෙලිකොම් හී ණය ප්\\u200dරමාණය රුපියල් බිලියන 6.9 ක් පමණක් වූ බවත් ඒ ණයද සමාගම ගෙන ඇත්තේ ආයතනය වර්ධනය කිරීමට අවශ්\\u200dය ව්\\u200dයාපෘති සඳහා බවද වාර්තා වෙයි.\\nඒ අනුව ආයතනයේ ණය 5 ගුණයකින් පමණ වැඩිකිරීම හැර ආයතනයේ යහපත වෙනුවෙන් වත්මන් සභාපතිවරයා සිදුකළ කිසිඳු සේවයක් නොමැති බවද සඳහන් වෙයි. එසේම ආයතනය රුපියල් මිලියන 500ක පමණ බරපතල පාඩුවක් ලබමින් සිටින බව ඒ හේතුවෙන් සේවකයින්ගේ බෝනස් ගෙවීමට පවා නොහැකි තත්ත්වයක් උද්ගතව පවතින බවත් සඳහන් කරන තොරතුරු මුලාශ්\\u200dර වැඩිදුරටත් සඳහන් කර සිටින්නේ මේ ආකාරයට තවත් කාලයක් ගියහොත් ශ්\\u200dරී ලංකා ටෙලිකොම් බංකොලොත්වීමේ අවධානමක්ද පවතින බවය.\\nමේ අතර ශ්\\u200dරී ලංකා ටෙලිකොම් හී විධායක නිලධාරී කිත්ති පෙරේරා මේ වන විටත් පුරප්පාඩුව පවතින ප්\\u200dරධාන විධායක නිලධාරී තනතුර සඳහා යොදා ගැනීමේ උත්සාහයකද සභාපතිවරයා නිරතව සිටින බව වාර්තා වෙයි.\\nටෙලිකොම් සභාපති කුමාරසිංහ මේ සම්බන්ධව “කිත්තිව CEO කරමු. මට ඒකට ඔයාලගේ උදව් ඕන” යනුවෙන් අධ්\\u200dයක්ෂ මණ්ඩලයටද පවසා ඇති බවත් වාර්තා වෙයි. 
අයතා ඉපයීම් සම්බන්ධව දුෂණ චෝදනා ලාභියකු වන කිත්ති පෙරේරා ප්\\u200dරධාන විධායක නිලධාරියා ලෙස පත්වුවහොත් ශ්\\u200dරී ලංකා ටෙලිකොම් පවතින තත්ත්වයද ඊට අහිමි වන බව මෙන්ම මේ වන විටත් ප්\\u200dරධාන විධායක නිලධාරී තනතුර සඳහා ඇබෑර්තු නිවේදනයේ දැක්වෙන සුදුසුකම් ද කිත්ති පෙරේරා සපුරා නොමැති බවද වැඩිදුරටත් වාර්තා වෙයි.\\nඑසේම කිත්ති පෙරේරා ඇතුළු රාජපක්ෂ හිතවාදී නඩයක්ම මේ වන විට සභාපතිවරයා සමඟ මෙම දුෂිත ගනුදෙනු රැසක නිරතව සිටින බවත් එය ජනාධිපතිවරයාගේ දේශපාලන කීර්ති නාමයට බරපතල හානිකර තත්ත්වයක් බවද ඔවුන් සඳහන් කරයි.\\nඒ අනුව තොරතුරු මුලාශ්\\u200dර සඳහන් කර සිටින්නේ මේවා ගැන කෝපනොවී එල්ලවන චෝදනා ඇත්තද නැත්තද යන්න ගැන විමර්ශනය කිරීමේ යුතුකමක් ජනාධිපතිවරයාට ඇති බවය. එසේම එවැනි විධිමත් විමර්ශනයක් සිදුකළ විට ශ්\\u200dරී ලංකා ටෙලිකොම්- මොබිටෙල් හී සිදුව තිබෙන දුෂණ සම්බන්ධවද ජනාධිපති විමර්ශන කොමිසමක් අවශ්\\u200dය බවට කරන යෝජනාවේ සාධාරණත්වයක් ඇති බව පෙනීයනු ඇති බවද ඔවුන් සඳහන් කරයි.\\n← තේරෙයි.., කල් යයි..! වැලිකඩ සමූහ ඝාතනයට සම්බන්ධ පොලිසියේ රංගජීව සහ බන්ධනාගාරයේ එමිල් රන්ජන්ට බඩු බනිස්..! ගෝඩ් ෆාදර්ට කවදා ද?\\nශ්\\u200dරී ලංකා ප්\\u200dරජාතන්ත්\\u200dරවාදී සමාජවාදී ජනරජයේ නිතියානුකුල අග්\\u200dරාමාත්\\u200dය ලෙස ඔබ පිළිගන්නේ? (මත විමසුම) October 28, 2018\\nලංකා ඊ නිව්ස් කර්තෘ සඳරුවන් සේනාධීර අත් අඩංගුවට ගන්න නැතිනම් පිටුවහල් කරන්න – ජනාධිපති ගමරාළ බ්\\u200dරිතාන්\\u200dයයෙන් ඉල්ලයි June 10, 2018\\nරටේ බිහි වූ දූෂිතම සහ භයංකරම රාජ්\\u200dය නිලධාරියා වූ ගෝඨා මැරී ඉපදිලා වගේ කතා කරමින් රට යලි විපත් මගකට ඇද දාන්න හදනවා – ෂැන්ග්\\u200dරිලා දේශණයට මුදල් ඇමැති මංගලගෙන් පිළිතුරක් May 20, 2018'}"
645
+ ]
646
+ },
647
+ "execution_count": 13,
648
+ "metadata": {},
649
+ "output_type": "execute_result"
650
+ }
651
+ ],
652
+ "source": [
653
+ "dataset['train'][0]"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": 18,
659
+ "id": "b6c82283-ce4c-491e-81c1-ba0e512f0704",
660
+ "metadata": {},
661
+ "outputs": [
662
+ {
663
+ "name": "stderr",
664
+ "output_type": "stream",
665
+ "text": [
666
+ "100%|████████████████████████████████████████████████████████████████████████| 120684/120684 [00:13<00:00, 8667.09it/s]\n"
667
+ ]
668
+ }
669
+ ],
670
+ "source": [
671
+ "from tqdm.auto import tqdm\n",
672
+ "\n",
673
+ "text_data = []\n",
674
+ "file_count = 0\n",
675
+ "\n",
676
+ "for sample in tqdm(dataset['train']):\n",
677
+ " sample = sample['text'].replace('\\n', '')\n",
678
+ " text_data.append(sample)\n",
679
+ " if len(text_data) == 10_000:\n",
680
+ " with open(f'sinhala_{file_count}.txt','w' , encoding='utf-8') as fp:\n",
681
+ " fp.write('\\n'.join(text_data))\n",
682
+ " text_data = []\n",
683
+ " file_count += 1\n",
684
+ "with open(f'sinhala_{file_count}.txt','w' , encoding='utf-8') as fp:\n",
685
+ " fp.write('\\n'.join(text_data))"
686
+ ]
687
+ }
688
+ ],
689
+ "metadata": {
690
+ "kernelspec": {
691
+ "display_name": "Python 3 (ipykernel)",
692
+ "language": "python",
693
+ "name": "python3"
694
+ },
695
+ "language_info": {
696
+ "codemirror_mode": {
697
+ "name": "ipython",
698
+ "version": 3
699
+ },
700
+ "file_extension": ".py",
701
+ "mimetype": "text/x-python",
702
+ "name": "python",
703
+ "nbconvert_exporter": "python",
704
+ "pygments_lexer": "ipython3",
705
+ "version": "3.10.11"
706
+ }
707
+ },
708
+ "nbformat": 4,
709
+ "nbformat_minor": 5
710
+ }
Tokenizer.ipynb ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "585f9800-984f-40fe-9b06-35cd40229d90",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from pathlib import Path"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 3,
16
+ "id": "503f9c3c-e348-478f-b743-cff3ce5f4465",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "data": {
21
+ "text/plain": [
22
+ "['sinhala_0.txt',\n",
23
+ " 'sinhala_1.txt',\n",
24
+ " 'sinhala_10.txt',\n",
25
+ " 'sinhala_11.txt',\n",
26
+ " 'sinhala_12.txt']"
27
+ ]
28
+ },
29
+ "execution_count": 3,
30
+ "metadata": {},
31
+ "output_type": "execute_result"
32
+ }
33
+ ],
34
+ "source": [
35
+ "paths = [str(x) for x in Path('./').glob('*.txt')]\n",
36
+ "\n",
37
+ "paths[:5]"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 4,
43
+ "id": "5e9baa0f-6c33-45b8-8487-202627067436",
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "Requirement already satisfied: tokenizers in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (0.15.1)\n",
51
+ "Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tokenizers) (0.20.3)\n",
52
+ "Requirement already satisfied: requests in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2.31.0)\n",
53
+ "Requirement already satisfied: filelock in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
54
+ "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.66.1)\n",
55
+ "Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.10.0)\n",
56
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
57
+ "Requirement already satisfied: packaging>=20.9 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (23.2)\n",
58
+ "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
59
+ "Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub<1.0,>=0.16.4->tokenizers) (0.4.6)\n",
60
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.6)\n",
61
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
62
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.3.2)\n",
63
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2.1.0)\n"
64
+ ]
65
+ },
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "\n",
71
+ "[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
72
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "!pip install tokenizers"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 5,
83
+ "id": "225c9c9a-0182-4bfe-92fc-2234e3515560",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "from tokenizers import ByteLevelBPETokenizer"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 11,
93
+ "id": "501098a0-5df1-448b-99e6-52143cb6751f",
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "tokenizer = ByteLevelBPETokenizer()"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 13,
103
+ "id": "24ef02fe-5703-4b87-a92a-e6e936f7fd96",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,\n",
108
+ " special_tokens=['<s>', '<pad>', '</s>', '<mask>'\n",
109
+ " ])"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 8,
115
+ "id": "da5bfba4-6c97-4256-b669-f75026b93e09",
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "ename": "FileExistsError",
120
+ "evalue": "[WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'",
121
+ "output_type": "error",
122
+ "traceback": [
123
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
124
+ "\u001b[1;31mFileExistsError\u001b[0m Traceback (most recent call last)",
125
+ "Cell \u001b[1;32mIn[8], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msinhalaMLM\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
126
+ "\u001b[1;31mFileExistsError\u001b[0m: [WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'"
127
+ ]
128
+ }
129
+ ],
130
+ "source": [
131
+ "import os\n",
132
+ "os.mkdir('sinhalaMLM')"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": null,
138
+ "id": "e25beeaa-e969-4c33-98e2-65523d827d95",
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "tokenizer.save_model('sinhalaMLM')"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "222a9edf-15ed-44a8-aaba-2afee76b3cbf",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "!pip install transformers"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "id": "864266b5-77d5-451e-9c01-096588ff62e4",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 15,
168
+ "id": "67d2fc2e-9cb0-4306-9769-0c34a2111c37",
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "from transformers import RobertaTokenizerFast"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 16,
178
+ "id": "474f1e8c-e499-4205-96cc-44f4a4c9e4e3",
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "tokenizer = RobertaTokenizerFast.from_pretrained('sinhalaMLM')"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 17,
188
+ "id": "3ea07602-9cd6-4c12-a860-9302e4db7607",
189
+ "metadata": {},
190
+ "outputs": [
191
+ {
192
+ "data": {
193
+ "text/plain": [
194
+ "{'input_ids': [0, 4689, 267, 300, 275, 469, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
195
+ ]
196
+ },
197
+ "execution_count": 17,
198
+ "metadata": {},
199
+ "output_type": "execute_result"
200
+ }
201
+ ],
202
+ "source": [
203
+ "tokenizer('ළමයා ගෙදර')"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "0a9a1acf-fd7d-4d4b-9222-f2f25a39efff",
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "labels == input_ids\n",
214
+ "\n",
215
+ "input_ids -> MLM"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 18,
221
+ "id": "90cec429-ee77-4c34-814b-3fec3f5b035e",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "import torch\n",
226
+ "def mlm(tensor):\n",
227
+ " rand = torch.rand(tensor.shape) \n",
228
+ " mask_arr = (rand < 0.15) * (tensor > 2)\n",
229
+ " for i in range( tensor.shape[0]):\n",
230
+ " selection = torch.flatten(mask_arr[i].nonzero()).tolist() #[[2,5,8]]\n",
231
+ " tensor[i, selection] = 3\n",
232
+ " return tensor"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 19,
238
+ "id": "2c22dc9d-cf4f-4cf0-a7b4-364464c525d7",
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "data": {
243
+ "text/plain": [
244
+ "['sinhala_0.txt',\n",
245
+ " 'sinhala_1.txt',\n",
246
+ " 'sinhala_10.txt',\n",
247
+ " 'sinhala_11.txt',\n",
248
+ " 'sinhala_12.txt']"
249
+ ]
250
+ },
251
+ "execution_count": 19,
252
+ "metadata": {},
253
+ "output_type": "execute_result"
254
+ }
255
+ ],
256
+ "source": [
257
+ "from pathlib import Path\n",
258
+ "\n",
259
+ "paths = [str(x) for x in Path('./').glob('*.txt')]\n",
260
+ "paths[:5]"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 20,
266
+ "id": "d19907e4-e54d-4596-a8a8-099e6965bcef",
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "data": {
271
+ "application/vnd.jupyter.widget-view+json": {
272
+ "model_id": "9bb6ce3b55a644b4805cca4d442ada93",
273
+ "version_major": 2,
274
+ "version_minor": 0
275
+ },
276
+ "text/plain": [
277
+ " 0%| | 0/13 [00:00<?, ?it/s]"
278
+ ]
279
+ },
280
+ "metadata": {},
281
+ "output_type": "display_data"
282
+ }
283
+ ],
284
+ "source": [
285
+ "from tqdm.auto import tqdm\n",
286
+ "\n",
287
+ "input_ids = []\n",
288
+ "mask = []\n",
289
+ "labels = []\n",
290
+ "\n",
291
+ "for path in tqdm(paths[:50]):\n",
292
+ " with open(path, 'r', encoding='utf-8') as f:\n",
293
+ " lines = f.read().split('\\n')\n",
294
+ " sample = tokenizer(lines, max_length=512, padding='max_length',\n",
295
+ "truncation=True, return_tensors='pt')\n",
296
+ " labels.append(sample.input_ids)\n",
297
+ " mask.append(sample.attention_mask)\n",
298
+ " input_ids.append(mlm(sample.input_ids.detach().clone()))"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 96,
304
+ "id": "ef40f10e-ca0b-416a-8481-6ff6e343372e",
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "#input_ids = torch.cat(input_ids)\n",
309
+ "#mask = torch.cat(mask)\n",
310
+ "#labels = torch.cat(labels)"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 89,
316
+ "id": "3b173e1a-96d0-40e6-9883-4f91772c47a1",
317
+ "metadata": {},
318
+ "outputs": [
319
+ {
320
+ "data": {
321
+ "text/plain": [
322
+ "tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293,\n",
323
+ " 271, 1920, 20312, 263, 281, 264, 269, 271, 282, 263])"
324
+ ]
325
+ },
326
+ "execution_count": 89,
327
+ "metadata": {},
328
+ "output_type": "execute_result"
329
+ }
330
+ ],
331
+ "source": [
332
+ "input_ids[0][:20]"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 90,
338
+ "id": "3915fa7d-ae56-4d86-b10d-4431a4739c0e",
339
+ "metadata": {},
340
+ "outputs": [
341
+ {
342
+ "data": {
343
+ "text/plain": [
344
+ "tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293])"
345
+ ]
346
+ },
347
+ "execution_count": 90,
348
+ "metadata": {},
349
+ "output_type": "execute_result"
350
+ }
351
+ ],
352
+ "source": [
353
+ "labels[0][:10]"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 91,
359
+ "id": "9202a316-001b-4602-9b8f-9ea93aa5083d",
360
+ "metadata": {},
361
+ "outputs": [],
362
+ "source": [
363
+ "encodings = {\n",
364
+ " 'input_ids': input_ids,\n",
365
+ " 'attention_mask': mask,\n",
366
+ " 'labels': labels\n",
367
+ "}"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 116,
373
+ "id": "1c5a01c5-14e2-4bdb-8bfa-12becbe02de6",
374
+ "metadata": {},
375
+ "outputs": [],
376
+ "source": [
377
+ "#from torch.utils.data import Dataset\n",
378
+ "\n",
379
+ "class Dataset(torch.utils.data.Dataset):\n",
380
+ " def __init__(self, encodings):\n",
381
+ " self.encodings = encodings\n",
382
+ " def __len__(self):\n",
383
+ " return self.encodings['input_ids'].shape[0]\n",
384
+ " def __getitem__(self, i):\n",
385
+ " return {key: tensor[i] for key, tensor in self.encodings.items()}"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 111,
391
+ "id": "e113560d-b382-49d1-afc0-9ad03da9b212",
392
+ "metadata": {},
393
+ "outputs": [
394
+ {
395
+ "data": {
396
+ "text/plain": [
397
+ "120684"
398
+ ]
399
+ },
400
+ "execution_count": 111,
401
+ "metadata": {},
402
+ "output_type": "execute_result"
403
+ }
404
+ ],
405
+ "source": [
406
+ "input_ids.shape[0]"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 112,
412
+ "id": "ebc01aa1-6a6e-4e20-b88a-9d91deca0c0a",
413
+ "metadata": {},
414
+ "outputs": [
415
+ {
416
+ "data": {
417
+ "text/plain": [
418
+ "dict_items([('input_ids', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
419
+ " [ 0, 277, 560, ..., 1, 1, 1],\n",
420
+ " [ 0, 278, 264, ..., 296, 269, 2],\n",
421
+ " ...,\n",
422
+ " [ 0, 272, 276, ..., 1, 1, 1],\n",
423
+ " [ 0, 292, 296, ..., 620, 271, 2],\n",
424
+ " [ 0, 307, 295, ..., 269, 281, 2]])), ('attention_mask', tensor([[1, 1, 1, ..., 1, 1, 1],\n",
425
+ " [1, 1, 1, ..., 0, 0, 0],\n",
426
+ " [1, 1, 1, ..., 1, 1, 1],\n",
427
+ " ...,\n",
428
+ " [1, 1, 1, ..., 0, 0, 0],\n",
429
+ " [1, 1, 1, ..., 1, 1, 1],\n",
430
+ " [1, 1, 1, ..., 1, 1, 1]])), ('labels', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
431
+ " [ 0, 277, 560, ..., 1, 1, 1],\n",
432
+ " [ 0, 278, 264, ..., 296, 269, 2],\n",
433
+ " ...,\n",
434
+ " [ 0, 272, 276, ..., 1, 1, 1],\n",
435
+ " [ 0, 292, 296, ..., 620, 271, 2],\n",
436
+ " [ 0, 307, 295, ..., 269, 281, 2]]))])"
437
+ ]
438
+ },
439
+ "execution_count": 112,
440
+ "metadata": {},
441
+ "output_type": "execute_result"
442
+ }
443
+ ],
444
+ "source": [
445
+ "encodings.items()"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 117,
451
+ "id": "9a329bc6-2286-4d9a-89f4-041ba2f78951",
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "dataset = Dataset(encodings)"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 118,
461
+ "id": "1ad3e3f5-bb45-4d51-8ccb-62ab73a1788f",
462
+ "metadata": {},
463
+ "outputs": [],
464
+ "source": [
465
+ "dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 120,
471
+ "id": "900571cd-5061-41f8-b361-e7889ba534cf",
472
+ "metadata": {},
473
+ "outputs": [],
474
+ "source": [
475
+ "from transformers import RobertaConfig"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": 121,
481
+ "id": "d35e11be-3ee6-4e7c-b49c-6900434553cf",
482
+ "metadata": {},
483
+ "outputs": [
484
+ {
485
+ "data": {
486
+ "text/plain": [
487
+ "30522"
488
+ ]
489
+ },
490
+ "execution_count": 121,
491
+ "metadata": {},
492
+ "output_type": "execute_result"
493
+ }
494
+ ],
495
+ "source": [
496
+ "tokenizer.vocab_size\n"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": 137,
502
+ "id": "69d081bc-a4a5-4970-a361-05b426d5afa2",
503
+ "metadata": {},
504
+ "outputs": [],
505
+ "source": [
506
+ "config = RobertaConfig(\n",
507
+ " vocab_size = tokenizer.vocab_size,\n",
508
+ " max_position_embeddings=514,\n",
509
+ " hidden_size=768,\n",
510
+ " num_attention_heads=12,\n",
511
+ " num_hidden_layers=6,\n",
512
+ " type_vocab_size=1\n",
513
+ ")"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": 138,
519
+ "id": "4386905e-2647-4082-aa50-e884cded82b4",
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "from transformers import RobertaForMaskedLM"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 139,
529
+ "id": "8a6fec9d-31da-4657-906d-a5b356a32328",
530
+ "metadata": {},
531
+ "outputs": [],
532
+ "source": [
533
+ "model = RobertaForMaskedLM(config)"
534
+ ]
535
+ },
536
+ {
537
+ "cell_type": "code",
538
+ "execution_count": 140,
539
+ "id": "3ae60e59-6351-4cee-a780-8d753ba55c0a",
540
+ "metadata": {},
541
+ "outputs": [],
542
+ "source": [
543
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device ('cpu')"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": 141,
549
+ "id": "3558fcc7-64c5-46f3-be9a-3199aae9f59b",
550
+ "metadata": {},
551
+ "outputs": [
552
+ {
553
+ "data": {
554
+ "text/plain": [
555
+ "RobertaForMaskedLM(\n",
556
+ " (roberta): RobertaModel(\n",
557
+ " (embeddings): RobertaEmbeddings(\n",
558
+ " (word_embeddings): Embedding(30522, 768, padding_idx=1)\n",
559
+ " (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
560
+ " (token_type_embeddings): Embedding(1, 768)\n",
561
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
562
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
563
+ " )\n",
564
+ " (encoder): RobertaEncoder(\n",
565
+ " (layer): ModuleList(\n",
566
+ " (0-5): 6 x RobertaLayer(\n",
567
+ " (attention): RobertaAttention(\n",
568
+ " (self): RobertaSelfAttention(\n",
569
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
570
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
571
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
572
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
573
+ " )\n",
574
+ " (output): RobertaSelfOutput(\n",
575
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
576
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
577
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
578
+ " )\n",
579
+ " )\n",
580
+ " (intermediate): RobertaIntermediate(\n",
581
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
582
+ " (intermediate_act_fn): GELUActivation()\n",
583
+ " )\n",
584
+ " (output): RobertaOutput(\n",
585
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
586
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
587
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
588
+ " )\n",
589
+ " )\n",
590
+ " )\n",
591
+ " )\n",
592
+ " )\n",
593
+ " (lm_head): RobertaLMHead(\n",
594
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
595
+ " (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
596
+ " (decoder): Linear(in_features=768, out_features=30522, bias=True)\n",
597
+ " )\n",
598
+ ")"
599
+ ]
600
+ },
601
+ "execution_count": 141,
602
+ "metadata": {},
603
+ "output_type": "execute_result"
604
+ }
605
+ ],
606
+ "source": [
607
+ "model.to(device)"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": 142,
613
+ "id": "c9bfc830-d445-4203-97b9-a67f23f4b7f1",
614
+ "metadata": {},
615
+ "outputs": [],
616
+ "source": [
617
+ "from transformers import AdamW"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": 150,
623
+ "id": "651b53ff-dbdb-4665-9ee8-4c15e96e4ba9",
624
+ "metadata": {},
625
+ "outputs": [],
626
+ "source": [
627
+ "optim = AdamW(model.parameters(), lr=1e-4)"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": 151,
633
+ "id": "81d298ed-4bff-443b-97b7-8dd7cff5f2aa",
634
+ "metadata": {},
635
+ "outputs": [],
636
+ "source": [
637
+ "from tqdm.auto import tqdm"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": 152,
643
+ "id": "7ec88289-b38d-4317-8797-b2b2192ff407",
644
+ "metadata": {},
645
+ "outputs": [],
646
+ "source": [
647
+ "epochs = 1"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": null,
653
+ "id": "6fe6ccda-fc31-4785-b9f0-ec70e8bfb09a",
654
+ "metadata": {},
655
+ "outputs": [
656
+ {
657
+ "data": {
658
+ "application/vnd.jupyter.widget-view+json": {
659
+ "model_id": "42dd8d1f843544e28d56e5d8b574c0a6",
660
+ "version_major": 2,
661
+ "version_minor": 0
662
+ },
663
+ "text/plain": [
664
+ " 0%| | 0/120684 [00:00<?, ?it/s]"
665
+ ]
666
+ },
667
+ "metadata": {},
668
+ "output_type": "display_data"
669
+ }
670
+ ],
671
+ "source": [
672
+ "loop = tqdm(dataloader, leave=True)\n",
673
+ "for batch in loop:\n",
674
+ " optim.zero_grad()\n",
675
+ " input_ids = batch['input_ids'].to(device)\n",
676
+ " mask = batch['attention_mask'].to(device)\n",
677
+ " labels = batch['labels'].to(device)\n",
678
+ " outputs = model(input_ids, attention_mask=mask, \n",
679
+ " labels=labels)\n",
680
+ " loss = outputs.loss\n",
681
+ " loss.backward()\n",
682
+ " optim.step()\n",
683
+ "\n",
684
+ " loop.set_description(f'Epoch: {epochs}')\n",
685
+ " loop.set_postfix(loss=loss.item())"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": null,
691
+ "id": "e9c49ba7-a98d-47d3-8c7f-8807c591ca03",
692
+ "metadata": {},
693
+ "outputs": [],
694
+ "source": [
695
+ "model.save_pretrained('./sinhalaMLM') # and don't forget to save sinhalaMLM"
696
+ ]
697
+ }
698
+ ],
699
+ "metadata": {
700
+ "kernelspec": {
701
+ "display_name": "Python 3 (ipykernel)",
702
+ "language": "python",
703
+ "name": "python3"
704
+ },
705
+ "language_info": {
706
+ "codemirror_mode": {
707
+ "name": "ipython",
708
+ "version": 3
709
+ },
710
+ "file_extension": ".py",
711
+ "mimetype": "text/x-python",
712
+ "name": "python",
713
+ "nbconvert_exporter": "python",
714
+ "pygments_lexer": "ipython3",
715
+ "version": "3.10.11"
716
+ }
717
+ },
718
+ "nbformat": 4,
719
+ "nbformat_minor": 5
720
+ }