Spaces:
No application file
No application file
Virajtharinda
commited on
Upload 2 files
Browse files- SinhalaMLM.ipynb +710 -0
- Tokenizer.ipynb +720 -0
SinhalaMLM.ipynb
ADDED
@@ -0,0 +1,710 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "3564591f-25e4-4be6-9104-acec74b5a120",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stdout",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"Collecting datasets\n",
|
14 |
+
" Using cached datasets-2.16.1-py3-none-any.whl (507 kB)\n",
|
15 |
+
"Collecting pandas\n",
|
16 |
+
" Downloading pandas-2.2.0-cp310-cp310-win_amd64.whl (11.6 MB)\n",
|
17 |
+
" ---------------------------------------- 0.0/11.6 MB ? eta -:--:--\n",
|
18 |
+
" --------------------------------------- 0.0/11.6 MB 640.0 kB/s eta 0:00:19\n",
|
19 |
+
" --------------------------------------- 0.0/11.6 MB 640.0 kB/s eta 0:00:19\n",
|
20 |
+
" --------------------------------------- 0.0/11.6 MB 195.7 kB/s eta 0:00:59\n",
|
21 |
+
" --------------------------------------- 0.1/11.6 MB 381.3 kB/s eta 0:00:31\n",
|
22 |
+
" --------------------------------------- 0.1/11.6 MB 435.7 kB/s eta 0:00:27\n",
|
23 |
+
" --------------------------------------- 0.1/11.6 MB 399.4 kB/s eta 0:00:29\n",
|
24 |
+
" -------------------------------------- 0.2/11.6 MB 591.4 kB/s eta 0:00:20\n",
|
25 |
+
" -------------------------------------- 0.2/11.6 MB 602.4 kB/s eta 0:00:19\n",
|
26 |
+
" -------------------------------------- 0.2/11.6 MB 602.4 kB/s eta 0:00:19\n",
|
27 |
+
" - ------------------------------------- 0.3/11.6 MB 696.7 kB/s eta 0:00:17\n",
|
28 |
+
" - ------------------------------------- 0.4/11.6 MB 673.7 kB/s eta 0:00:17\n",
|
29 |
+
" - ------------------------------------- 0.4/11.6 MB 636.7 kB/s eta 0:00:18\n",
|
30 |
+
" - ------------------------------------- 0.5/11.6 MB 702.9 kB/s eta 0:00:16\n",
|
31 |
+
" -- ------------------------------------ 0.6/11.6 MB 893.0 kB/s eta 0:00:13\n",
|
32 |
+
" -- ------------------------------------ 0.6/11.6 MB 863.8 kB/s eta 0:00:13\n",
|
33 |
+
" -- ------------------------------------ 0.8/11.6 MB 955.9 kB/s eta 0:00:12\n",
|
34 |
+
" --- ------------------------------------ 0.9/11.6 MB 1.1 MB/s eta 0:00:10\n",
|
35 |
+
" ---- ----------------------------------- 1.2/11.6 MB 1.3 MB/s eta 0:00:09\n",
|
36 |
+
" ---- ----------------------------------- 1.2/11.6 MB 1.3 MB/s eta 0:00:08\n",
|
37 |
+
" ----- ---------------------------------- 1.5/11.6 MB 1.5 MB/s eta 0:00:07\n",
|
38 |
+
" ----- ---------------------------------- 1.6/11.6 MB 1.6 MB/s eta 0:00:07\n",
|
39 |
+
" ------ --------------------------------- 1.8/11.6 MB 1.6 MB/s eta 0:00:06\n",
|
40 |
+
" ------ --------------------------------- 2.0/11.6 MB 1.7 MB/s eta 0:00:06\n",
|
41 |
+
" ------- -------------------------------- 2.2/11.6 MB 1.9 MB/s eta 0:00:06\n",
|
42 |
+
" -------- ------------------------------- 2.3/11.6 MB 1.9 MB/s eta 0:00:05\n",
|
43 |
+
" -------- ------------------------------- 2.4/11.6 MB 1.9 MB/s eta 0:00:05\n",
|
44 |
+
" --------- ------------------------------ 2.6/11.6 MB 1.9 MB/s eta 0:00:05\n",
|
45 |
+
" --------- ------------------------------ 2.7/11.6 MB 1.9 MB/s eta 0:00:05\n",
|
46 |
+
" --------- ------------------------------ 2.8/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
47 |
+
" ---------- ----------------------------- 3.0/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
48 |
+
" ---------- ----------------------------- 3.1/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
49 |
+
" ---------- ----------------------------- 3.2/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
50 |
+
" ----------- ---------------------------- 3.3/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
51 |
+
" ----------- ---------------------------- 3.3/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
52 |
+
" ----------- ---------------------------- 3.4/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
53 |
+
" ------------ --------------------------- 3.5/11.6 MB 1.9 MB/s eta 0:00:05\n",
|
54 |
+
" ------------ --------------------------- 3.6/11.6 MB 2.0 MB/s eta 0:00:05\n",
|
55 |
+
" ------------- -------------------------- 3.9/11.6 MB 2.0 MB/s eta 0:00:04\n",
|
56 |
+
" ------------- -------------------------- 4.0/11.6 MB 2.1 MB/s eta 0:00:04\n",
|
57 |
+
" -------------- ------------------------- 4.2/11.6 MB 2.1 MB/s eta 0:00:04\n",
|
58 |
+
" -------------- ------------------------- 4.3/11.6 MB 2.1 MB/s eta 0:00:04\n",
|
59 |
+
" --------------- ------------------------ 4.6/11.6 MB 2.2 MB/s eta 0:00:04\n",
|
60 |
+
" ---------------- ----------------------- 4.8/11.6 MB 2.2 MB/s eta 0:00:04\n",
|
61 |
+
" ---------------- ----------------------- 4.9/11.6 MB 2.2 MB/s eta 0:00:03\n",
|
62 |
+
" ------------------ --------------------- 5.3/11.6 MB 2.4 MB/s eta 0:00:03\n",
|
63 |
+
" ------------------- -------------------- 5.7/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
64 |
+
" -------------------- ------------------- 5.8/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
65 |
+
" -------------------- ------------------- 5.9/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
66 |
+
" -------------------- ------------------- 6.1/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
67 |
+
" --------------------- ------------------ 6.3/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
68 |
+
" --------------------- ------------------ 6.3/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
69 |
+
" ---------------------- ----------------- 6.5/11.6 MB 2.5 MB/s eta 0:00:03\n",
|
70 |
+
" ----------------------- ---------------- 6.7/11.6 MB 2.5 MB/s eta 0:00:02\n",
|
71 |
+
" ------------------------ --------------- 7.0/11.6 MB 2.6 MB/s eta 0:00:02\n",
|
72 |
+
" ------------------------ --------------- 7.1/11.6 MB 2.6 MB/s eta 0:00:02\n",
|
73 |
+
" ------------------------- -------------- 7.4/11.6 MB 2.6 MB/s eta 0:00:02\n",
|
74 |
+
" -------------------------- ------------- 7.6/11.6 MB 2.7 MB/s eta 0:00:02\n",
|
75 |
+
" --------------------------- ------------ 7.9/11.6 MB 2.7 MB/s eta 0:00:02\n",
|
76 |
+
" ---------------------------- ----------- 8.1/11.6 MB 2.8 MB/s eta 0:00:02\n",
|
77 |
+
" ---------------------------- ----------- 8.3/11.6 MB 2.8 MB/s eta 0:00:02\n",
|
78 |
+
" ----------------------------- ---------- 8.6/11.6 MB 2.8 MB/s eta 0:00:02\n",
|
79 |
+
" ------------------------------ --------- 8.8/11.6 MB 2.8 MB/s eta 0:00:01\n",
|
80 |
+
" ------------------------------- -------- 9.1/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
81 |
+
" -------------------------------- ------- 9.3/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
82 |
+
" -------------------------------- ------- 9.4/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
83 |
+
" -------------------------------- ------- 9.5/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
84 |
+
" --------------------------------- ------ 9.6/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
85 |
+
" --------------------------------- ------ 9.8/11.6 MB 2.9 MB/s eta 0:00:01\n",
|
86 |
+
" ---------------------------------- ----- 10.1/11.6 MB 3.0 MB/s eta 0:00:01\n",
|
87 |
+
" ----------------------------------- ---- 10.3/11.6 MB 3.1 MB/s eta 0:00:01\n",
|
88 |
+
" ------------------------------------ --- 10.5/11.6 MB 3.3 MB/s eta 0:00:01\n",
|
89 |
+
" ------------------------------------ --- 10.7/11.6 MB 3.5 MB/s eta 0:00:01\n",
|
90 |
+
" ------------------------------------- -- 10.9/11.6 MB 3.5 MB/s eta 0:00:01\n",
|
91 |
+
" -------------------------------------- - 11.0/11.6 MB 3.6 MB/s eta 0:00:01\n",
|
92 |
+
" --------------------------------------- 11.3/11.6 MB 3.6 MB/s eta 0:00:01\n",
|
93 |
+
" --------------------------------------- 11.5/11.6 MB 3.7 MB/s eta 0:00:01\n",
|
94 |
+
" --------------------------------------- 11.6/11.6 MB 3.6 MB/s eta 0:00:01\n",
|
95 |
+
" ---------------------------------------- 11.6/11.6 MB 3.6 MB/s eta 0:00:00\n",
|
96 |
+
"Collecting filelock\n",
|
97 |
+
" Using cached filelock-3.13.1-py3-none-any.whl (11 kB)\n",
|
98 |
+
"Requirement already satisfied: requests>=2.19.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (2.31.0)\n",
|
99 |
+
"Collecting multiprocess\n",
|
100 |
+
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
|
101 |
+
" ---------------------------------------- 0.0/134.8 kB ? eta -:--:--\n",
|
102 |
+
" ---------------------------------- --- 122.9/134.8 kB 3.5 MB/s eta 0:00:01\n",
|
103 |
+
" -------------------------------------- 134.8/134.8 kB 2.0 MB/s eta 0:00:00\n",
|
104 |
+
"Collecting huggingface-hub>=0.19.4\n",
|
105 |
+
" Using cached huggingface_hub-0.20.3-py3-none-any.whl (330 kB)\n",
|
106 |
+
"Collecting pyarrow>=8.0.0\n",
|
107 |
+
" Downloading pyarrow-15.0.0-cp310-cp310-win_amd64.whl (24.8 MB)\n",
|
108 |
+
" ---------------------------------------- 0.0/24.8 MB ? eta -:--:--\n",
|
109 |
+
" ---------------------------------------- 0.1/24.8 MB 3.3 MB/s eta 0:00:08\n",
|
110 |
+
" ---------------------------------------- 0.2/24.8 MB 3.5 MB/s eta 0:00:08\n",
|
111 |
+
" --------------------------------------- 0.5/24.8 MB 4.2 MB/s eta 0:00:06\n",
|
112 |
+
" - -------------------------------------- 0.6/24.8 MB 4.1 MB/s eta 0:00:06\n",
|
113 |
+
" - -------------------------------------- 0.8/24.8 MB 4.2 MB/s eta 0:00:06\n",
|
114 |
+
" - -------------------------------------- 1.0/24.8 MB 4.2 MB/s eta 0:00:06\n",
|
115 |
+
" - -------------------------------------- 1.2/24.8 MB 4.4 MB/s eta 0:00:06\n",
|
116 |
+
" -- ------------------------------------- 1.4/24.8 MB 4.4 MB/s eta 0:00:06\n",
|
117 |
+
" -- ------------------------------------- 1.6/24.8 MB 4.5 MB/s eta 0:00:06\n",
|
118 |
+
" -- ------------------------------------- 1.8/24.8 MB 4.6 MB/s eta 0:00:05\n",
|
119 |
+
" --- ------------------------------------ 2.0/24.8 MB 4.5 MB/s eta 0:00:06\n",
|
120 |
+
" --- ------------------------------------ 2.2/24.8 MB 4.6 MB/s eta 0:00:05\n",
|
121 |
+
" --- ------------------------------------ 2.3/24.8 MB 4.5 MB/s eta 0:00:06\n",
|
122 |
+
" ---- ----------------------------------- 2.6/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
123 |
+
" ---- ----------------------------------- 2.7/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
124 |
+
" ---- ----------------------------------- 2.8/24.8 MB 4.4 MB/s eta 0:00:06\n",
|
125 |
+
" ---- ----------------------------------- 3.0/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
126 |
+
" ---- ----------------------------------- 3.1/24.8 MB 4.4 MB/s eta 0:00:05\n",
|
127 |
+
" ----- ---------------------------------- 3.3/24.8 MB 4.4 MB/s eta 0:00:05\n",
|
128 |
+
" ----- ---------------------------------- 3.4/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
129 |
+
" ----- ---------------------------------- 3.7/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
130 |
+
" ------ --------------------------------- 3.9/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
131 |
+
" ------ --------------------------------- 4.2/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
132 |
+
" ------- -------------------------------- 4.4/24.8 MB 4.7 MB/s eta 0:00:05\n",
|
133 |
+
" ------- -------------------------------- 4.7/24.8 MB 4.7 MB/s eta 0:00:05\n",
|
134 |
+
" ------- -------------------------------- 4.8/24.8 MB 4.6 MB/s eta 0:00:05\n",
|
135 |
+
" ------- -------------------------------- 4.9/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
136 |
+
" -------- ------------------------------- 5.0/24.8 MB 4.5 MB/s eta 0:00:05\n",
|
137 |
+
" -------- ------------------------------- 5.1/24.8 MB 4.4 MB/s eta 0:00:05\n",
|
138 |
+
" -------- ------------------------------- 5.2/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
139 |
+
" -------- ------------------------------- 5.3/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
140 |
+
" -------- ------------------------------- 5.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
141 |
+
" --------- ------------------------------ 5.7/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
142 |
+
" --------- ------------------------------ 5.9/24.8 MB 4.4 MB/s eta 0:00:05\n",
|
143 |
+
" --------- ------------------------------ 6.1/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
144 |
+
" ---------- ----------------------------- 6.3/24.8 MB 4.4 MB/s eta 0:00:05\n",
|
145 |
+
" ---------- ----------------------------- 6.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
146 |
+
" ---------- ----------------------------- 6.6/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
147 |
+
" ---------- ----------------------------- 6.8/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
148 |
+
" ----------- ---------------------------- 7.0/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
149 |
+
" ----------- ---------------------------- 7.1/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
150 |
+
" ----------- ---------------------------- 7.2/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
151 |
+
" ----------- ---------------------------- 7.3/24.8 MB 4.2 MB/s eta 0:00:05\n",
|
152 |
+
" ------------ --------------------------- 7.5/24.8 MB 4.3 MB/s eta 0:00:05\n",
|
153 |
+
" ------------ --------------------------- 7.7/24.8 MB 4.2 MB/s eta 0:00:05\n",
|
154 |
+
" ------------ --------------------------- 7.9/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
155 |
+
" ------------- -------------------------- 8.1/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
156 |
+
" ------------- -------------------------- 8.2/24.8 MB 4.2 MB/s eta 0:00:04\n",
|
157 |
+
" ------------- -------------------------- 8.5/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
158 |
+
" -------------- ------------------------- 8.7/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
159 |
+
" -------------- ------------------------- 8.9/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
160 |
+
" -------------- ------------------------- 9.1/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
161 |
+
" -------------- ------------------------- 9.2/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
162 |
+
" --------------- ------------------------ 9.5/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
163 |
+
" --------------- ------------------------ 9.6/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
164 |
+
" --------------- ------------------------ 9.9/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
165 |
+
" ---------------- ----------------------- 10.0/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
166 |
+
" ---------------- ----------------------- 10.3/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
167 |
+
" ---------------- ----------------------- 10.4/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
168 |
+
" ----------------- ---------------------- 10.6/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
169 |
+
" ----------------- ---------------------- 10.7/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
170 |
+
" ----------------- ---------------------- 10.9/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
171 |
+
" ----------------- ---------------------- 11.1/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
172 |
+
" ------------------ --------------------- 11.3/24.8 MB 4.4 MB/s eta 0:00:04\n",
|
173 |
+
" ------------------ --------------------- 11.6/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
174 |
+
" ------------------ --------------------- 11.6/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
175 |
+
" ------------------ --------------------- 11.7/24.8 MB 4.3 MB/s eta 0:00:04\n",
|
176 |
+
" ------------------- -------------------- 12.0/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
177 |
+
" ------------------- -------------------- 12.2/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
178 |
+
" -------------------- ------------------- 12.5/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
179 |
+
" -------------------- ------------------- 12.7/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
180 |
+
" -------------------- ------------------- 12.9/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
181 |
+
" --------------------- ------------------ 13.1/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
182 |
+
" --------------------- ------------------ 13.3/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
183 |
+
" --------------------- ------------------ 13.5/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
184 |
+
" ---------------------- ----------------- 13.7/24.8 MB 4.6 MB/s eta 0:00:03\n",
|
185 |
+
" ---------------------- ----------------- 13.8/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
186 |
+
" ---------------------- ----------------- 14.0/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
187 |
+
" ---------------------- ----------------- 14.2/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
188 |
+
" ----------------------- ---------------- 14.3/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
189 |
+
" ----------------------- ---------------- 14.5/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
190 |
+
" ----------------------- ---------------- 14.7/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
191 |
+
" ------------------------ --------------- 15.0/24.8 MB 4.4 MB/s eta 0:00:03\n",
|
192 |
+
" ------------------------ --------------- 15.2/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
193 |
+
" ------------------------ --------------- 15.4/24.8 MB 4.6 MB/s eta 0:00:03\n",
|
194 |
+
" ------------------------- -------------- 15.5/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
195 |
+
" ------------------------- -------------- 15.7/24.8 MB 4.5 MB/s eta 0:00:03\n",
|
196 |
+
" ------------------------- -------------- 15.9/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
197 |
+
" -------------------------- ------------- 16.1/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
198 |
+
" -------------------------- ------------- 16.3/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
199 |
+
" -------------------------- ------------- 16.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
200 |
+
" -------------------------- ------------- 16.6/24.8 MB 4.4 MB/s eta 0:00:02\n",
|
201 |
+
" --------------------------- ------------ 16.8/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
202 |
+
" --------------------------- ------------ 17.0/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
203 |
+
" --------------------------- ------------ 17.1/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
204 |
+
" --------------------------- ------------ 17.2/24.8 MB 4.4 MB/s eta 0:00:02\n",
|
205 |
+
" ---------------------------- ----------- 17.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
206 |
+
" ---------------------------- ----------- 17.6/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
207 |
+
" ---------------------------- ----------- 17.8/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
208 |
+
" ----------------------------- ---------- 18.0/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
209 |
+
" ----------------------------- ---------- 18.2/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
210 |
+
" ----------------------------- ---------- 18.4/24.8 MB 4.5 MB/s eta 0:00:02\n",
|
211 |
+
" ----------------------------- ---------- 18.6/24.8 MB 4.4 MB/s eta 0:00:02\n",
|
212 |
+
" ------------------------------ --------- 18.7/24.8 MB 4.4 MB/s eta 0:00:02\n",
|
213 |
+
" ------------------------------ --------- 18.9/24.8 MB 4.4 MB/s eta 0:00:02\n",
|
214 |
+
" ------------------------------ --------- 19.0/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
215 |
+
" ------------------------------ --------- 19.2/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
216 |
+
" ------------------------------- -------- 19.4/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
217 |
+
" ------------------------------- -------- 19.6/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
218 |
+
" ------------------------------- -------- 19.8/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
219 |
+
" -------------------------------- ------- 20.0/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
220 |
+
" -------------------------------- ------- 20.2/24.8 MB 4.3 MB/s eta 0:00:02\n",
|
221 |
+
" -------------------------------- ------- 20.4/24.8 MB 4.2 MB/s eta 0:00:02\n",
|
222 |
+
" --------------------------------- ------ 20.5/24.8 MB 4.2 MB/s eta 0:00:02\n",
|
223 |
+
" --------------------------------- ------ 20.6/24.8 MB 4.2 MB/s eta 0:00:01\n",
|
224 |
+
" --------------------------------- ------ 20.7/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
225 |
+
" --------------------------------- ------ 20.8/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
226 |
+
" --------------------------------- ------ 21.0/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
227 |
+
" ---------------------------------- ----- 21.1/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
228 |
+
" ---------------------------------- ----- 21.3/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
229 |
+
" ---------------------------------- ----- 21.4/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
230 |
+
" ---------------------------------- ----- 21.4/24.8 MB 4.1 MB/s eta 0:00:01\n",
|
231 |
+
" ---------------------------------- ----- 21.6/24.8 MB 4.0 MB/s eta 0:00:01\n",
|
232 |
+
" ----------------------------------- ---- 21.8/24.8 MB 4.0 MB/s eta 0:00:01\n",
|
233 |
+
" ----------------------------------- ---- 22.0/24.8 MB 4.0 MB/s eta 0:00:01\n",
|
234 |
+
" ----------------------------------- ---- 22.1/24.8 MB 4.0 MB/s eta 0:00:01\n",
|
235 |
+
" ----------------------------------- ---- 22.3/24.8 MB 4.0 MB/s eta 0:00:01\n",
|
236 |
+
" ------------------------------------ --- 22.5/24.8 MB 3.9 MB/s eta 0:00:01\n",
|
237 |
+
" ------------------------------------ --- 22.6/24.8 MB 3.9 MB/s eta 0:00:01\n",
|
238 |
+
" ------------------------------------ --- 22.7/24.8 MB 3.9 MB/s eta 0:00:01\n",
|
239 |
+
" ------------------------------------ --- 22.8/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
240 |
+
" ------------------------------------ --- 22.9/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
241 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
242 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
243 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
244 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
245 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
246 |
+
" ------------------------------------- -- 23.0/24.8 MB 3.8 MB/s eta 0:00:01\n",
|
247 |
+
" ------------------------------------- -- 23.1/24.8 MB 3.5 MB/s eta 0:00:01\n",
|
248 |
+
" ------------------------------------- -- 23.2/24.8 MB 3.5 MB/s eta 0:00:01\n",
|
249 |
+
" ------------------------------------- -- 23.4/24.8 MB 3.5 MB/s eta 0:00:01\n",
|
250 |
+
" ------------------------------------- -- 23.5/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
251 |
+
" ------------------------------------- -- 23.5/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
252 |
+
" -------------------------------------- - 23.6/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
253 |
+
" -------------------------------------- - 23.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
254 |
+
" -------------------------------------- - 24.0/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
255 |
+
" --------------------------------------- 24.2/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
256 |
+
" --------------------------------------- 24.4/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
257 |
+
" --------------------------------------- 24.6/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
258 |
+
" --------------------------------------- 24.7/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
259 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
260 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
261 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
262 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
263 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
264 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
265 |
+
" --------------------------------------- 24.8/24.8 MB 3.4 MB/s eta 0:00:01\n",
|
266 |
+
" ---------------------------------------- 24.8/24.8 MB 3.1 MB/s eta 0:00:00\n",
|
267 |
+
"Collecting fsspec[http]<=2023.10.0,>=2023.1.0\n",
|
268 |
+
" Using cached fsspec-2023.10.0-py3-none-any.whl (166 kB)\n",
|
269 |
+
"Collecting aiohttp\n",
|
270 |
+
" Downloading aiohttp-3.9.1-cp310-cp310-win_amd64.whl (364 kB)\n",
|
271 |
+
" ---------------------------------------- 0.0/364.6 kB ? eta -:--:--\n",
|
272 |
+
" ----------- -------------------------- 112.6/364.6 kB 3.2 MB/s eta 0:00:01\n",
|
273 |
+
" --------------------- ---------------- 204.8/364.6 kB 3.1 MB/s eta 0:00:01\n",
|
274 |
+
" ----------------------------------- -- 337.9/364.6 kB 3.0 MB/s eta 0:00:01\n",
|
275 |
+
" -------------------------------------- 364.6/364.6 kB 2.8 MB/s eta 0:00:00\n",
|
276 |
+
"Collecting numpy>=1.17\n",
|
277 |
+
" Downloading numpy-1.26.3-cp310-cp310-win_amd64.whl (15.8 MB)\n",
|
278 |
+
" ---------------------------------------- 0.0/15.8 MB ? eta -:--:--\n",
|
279 |
+
" --------------------------------------- 0.2/15.8 MB 4.6 MB/s eta 0:00:04\n",
|
280 |
+
" --------------------------------------- 0.3/15.8 MB 4.1 MB/s eta 0:00:04\n",
|
281 |
+
" - -------------------------------------- 0.5/15.8 MB 3.9 MB/s eta 0:00:04\n",
|
282 |
+
" - -------------------------------------- 0.7/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
283 |
+
" -- ------------------------------------- 0.8/15.8 MB 3.6 MB/s eta 0:00:05\n",
|
284 |
+
" -- ------------------------------------- 1.0/15.8 MB 3.9 MB/s eta 0:00:04\n",
|
285 |
+
" -- ------------------------------------- 1.1/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
286 |
+
" --- ------------------------------------ 1.3/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
287 |
+
" --- ------------------------------------ 1.4/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
288 |
+
" --- ------------------------------------ 1.6/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
289 |
+
" ---- ----------------------------------- 1.7/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
290 |
+
" ---- ----------------------------------- 1.9/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
291 |
+
" ----- ---------------------------------- 2.0/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
292 |
+
" ----- ---------------------------------- 2.1/15.8 MB 3.6 MB/s eta 0:00:04\n",
|
293 |
+
" ----- ---------------------------------- 2.3/15.8 MB 3.6 MB/s eta 0:00:04\n",
|
294 |
+
" ------ --------------------------------- 2.5/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
295 |
+
" ------ --------------------------------- 2.6/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
296 |
+
" ------- -------------------------------- 2.8/15.8 MB 3.7 MB/s eta 0:00:04\n",
|
297 |
+
" ------- -------------------------------- 3.0/15.8 MB 3.8 MB/s eta 0:00:04\n",
|
298 |
+
" -------- ------------------------------- 3.2/15.8 MB 3.9 MB/s eta 0:00:04\n",
|
299 |
+
" -------- ------------------------------- 3.4/15.8 MB 3.9 MB/s eta 0:00:04\n",
|
300 |
+
" --------- ------------------------------ 3.6/15.8 MB 3.9 MB/s eta 0:00:04\n",
|
301 |
+
" --------- ------------------------------ 3.8/15.8 MB 4.0 MB/s eta 0:00:04\n",
|
302 |
+
" ---------- ----------------------------- 4.0/15.8 MB 4.0 MB/s eta 0:00:03\n",
|
303 |
+
" ---------- ----------------------------- 4.2/15.8 MB 4.0 MB/s eta 0:00:03\n",
|
304 |
+
" ----------- ---------------------------- 4.4/15.8 MB 4.1 MB/s eta 0:00:03\n",
|
305 |
+
" ----------- ---------------------------- 4.6/15.8 MB 4.1 MB/s eta 0:00:03\n",
|
306 |
+
" ------------ --------------------------- 4.8/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
307 |
+
" ------------- -------------------------- 5.1/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
308 |
+
" ------------- -------------------------- 5.2/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
309 |
+
" ------------- -------------------------- 5.4/15.8 MB 4.1 MB/s eta 0:00:03\n",
|
310 |
+
" -------------- ------------------------- 5.6/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
311 |
+
" -------------- ------------------------- 5.8/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
312 |
+
" --------------- ------------------------ 6.1/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
313 |
+
" --------------- ------------------------ 6.3/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
314 |
+
" ---------------- ----------------------- 6.5/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
315 |
+
" ---------------- ----------------------- 6.6/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
316 |
+
" ----------------- ---------------------- 6.7/15.8 MB 4.3 MB/s eta 0:00:03\n",
|
317 |
+
" ----------------- ---------------------- 6.7/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
318 |
+
" ----------------- ---------------------- 7.0/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
319 |
+
" ------------------ --------------------- 7.1/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
320 |
+
" ------------------ --------------------- 7.2/15.8 MB 4.2 MB/s eta 0:00:03\n",
|
321 |
+
" ------------------ --------------------- 7.5/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
322 |
+
" ------------------- -------------------- 7.7/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
323 |
+
" -------------------- ------------------- 7.9/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
324 |
+
" -------------------- ------------------- 8.0/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
325 |
+
" -------------------- ------------------- 8.2/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
326 |
+
" --------------------- ------------------ 8.3/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
327 |
+
" --------------------- ------------------ 8.4/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
328 |
+
" --------------------- ------------------ 8.6/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
329 |
+
" ---------------------- ----------------- 8.7/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
330 |
+
" ---------------------- ----------------- 8.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
331 |
+
" ----------------------- ---------------- 9.1/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
332 |
+
" ----------------------- ---------------- 9.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
333 |
+
" ----------------------- ---------------- 9.5/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
334 |
+
" ------------------------ --------------- 9.6/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
335 |
+
" ------------------------ --------------- 9.8/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
336 |
+
" ------------------------- -------------- 9.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
337 |
+
" ------------------------- -------------- 10.1/15.8 MB 4.2 MB/s eta 0:00:02\n",
|
338 |
+
" -------------------------- ------------- 10.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
339 |
+
" -------------------------- ------------- 10.4/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
340 |
+
" -------------------------- ------------- 10.5/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
341 |
+
" -------------------------- ------------- 10.7/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
342 |
+
" --------------------------- ------------ 10.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
343 |
+
" --------------------------- ------------ 10.9/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
344 |
+
" ---------------------------- ----------- 11.1/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
345 |
+
" ---------------------------- ----------- 11.3/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
346 |
+
" ---------------------------- ----------- 11.4/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
347 |
+
" ----------------------------- ---------- 11.6/15.8 MB 4.1 MB/s eta 0:00:02\n",
|
348 |
+
" ----------------------------- ---------- 11.8/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
349 |
+
" ------------------------------ --------- 11.9/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
350 |
+
" ------------------------------ --------- 12.1/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
351 |
+
" ------------------------------ --------- 12.2/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
352 |
+
" ------------------------------- -------- 12.3/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
353 |
+
" ------------------------------- -------- 12.3/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
354 |
+
" ------------------------------- -------- 12.4/15.8 MB 4.1 MB/s eta 0:00:01\n",
|
355 |
+
" ------------------------------- -------- 12.5/15.8 MB 4.0 MB/s eta 0:00:01\n",
|
356 |
+
" ------------------------------- -------- 12.6/15.8 MB 4.0 MB/s eta 0:00:01\n",
|
357 |
+
" ------------------------------- -------- 12.6/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
358 |
+
" -------------------------------- ------- 12.7/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
359 |
+
" -------------------------------- ------- 12.9/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
360 |
+
" --------------------------------- ------ 13.0/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
361 |
+
" --------------------------------- ------ 13.1/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
362 |
+
" --------------------------------- ------ 13.3/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
363 |
+
" ---------------------------------- ----- 13.5/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
364 |
+
" ---------------------------------- ----- 13.6/15.8 MB 3.9 MB/s eta 0:00:01\n",
|
365 |
+
" ---------------------------------- ----- 13.8/15.8 MB 3.8 MB/s eta 0:00:01\n",
|
366 |
+
" ---------------------------------- ----- 13.8/15.8 MB 3.8 MB/s eta 0:00:01\n",
|
367 |
+
" ----------------------------------- ---- 14.0/15.8 MB 3.8 MB/s eta 0:00:01\n",
|
368 |
+
" ----------------------------------- ---- 14.2/15.8 MB 3.8 MB/s eta 0:00:01\n",
|
369 |
+
" ------------------------------------ --- 14.3/15.8 MB 3.8 MB/s eta 0:00:01\n",
|
370 |
+
" ------------------------------------ --- 14.5/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
371 |
+
" ------------------------------------ --- 14.6/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
372 |
+
" ------------------------------------- -- 14.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
373 |
+
" ------------------------------------- -- 15.0/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
374 |
+
" -------------------------------------- - 15.1/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
375 |
+
" -------------------------------------- - 15.3/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
376 |
+
" --------------------------------------- 15.4/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
377 |
+
" --------------------------------------- 15.6/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
378 |
+
" --------------------------------------- 15.7/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
379 |
+
" --------------------------------------- 15.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
380 |
+
" --------------------------------------- 15.8/15.8 MB 3.7 MB/s eta 0:00:01\n",
|
381 |
+
" ---------------------------------------- 15.8/15.8 MB 3.5 MB/s eta 0:00:00\n",
|
382 |
+
"Collecting tqdm>=4.62.1\n",
|
383 |
+
" Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)\n",
|
384 |
+
"Collecting xxhash\n",
|
385 |
+
" Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl (29 kB)\n",
|
386 |
+
"Collecting pyarrow-hotfix\n",
|
387 |
+
" Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
|
388 |
+
"Collecting dill<0.3.8,>=0.3.0\n",
|
389 |
+
" Using cached dill-0.3.7-py3-none-any.whl (115 kB)\n",
|
390 |
+
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (6.0.1)\n",
|
391 |
+
"Requirement already satisfied: packaging in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from datasets) (23.2)\n",
|
392 |
+
"Collecting frozenlist>=1.1.1\n",
|
393 |
+
" Downloading frozenlist-1.4.1-cp310-cp310-win_amd64.whl (50 kB)\n",
|
394 |
+
" ---------------------------------------- 0.0/50.4 kB ? eta -:--:--\n",
|
395 |
+
" ---------------------------------------- 50.4/50.4 kB 2.7 MB/s eta 0:00:00\n",
|
396 |
+
"Collecting aiosignal>=1.1.2\n",
|
397 |
+
" Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
|
398 |
+
"Collecting multidict<7.0,>=4.5\n",
|
399 |
+
" Downloading multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)\n",
|
400 |
+
"Requirement already satisfied: attrs>=17.3.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from aiohttp->datasets) (23.2.0)\n",
|
401 |
+
"Collecting yarl<2.0,>=1.0\n",
|
402 |
+
" Downloading yarl-1.9.4-cp310-cp310-win_amd64.whl (76 kB)\n",
|
403 |
+
" ---------------------------------------- 0.0/76.4 kB ? eta -:--:--\n",
|
404 |
+
" ---------------------------------------- 76.4/76.4 kB 2.1 MB/s eta 0:00:00\n",
|
405 |
+
"Collecting async-timeout<5.0,>=4.0\n",
|
406 |
+
" Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
|
407 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface-hub>=0.19.4->datasets) (4.9.0)\n",
|
408 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
|
409 |
+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (3.6)\n",
|
410 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (2.1.0)\n",
|
411 |
+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests>=2.19.0->datasets) (2023.11.17)\n",
|
412 |
+
"Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.62.1->datasets) (0.4.6)\n",
|
413 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->datasets) (2.8.2)\n",
|
414 |
+
"Collecting pytz>=2020.1\n",
|
415 |
+
" Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)\n",
|
416 |
+
"Collecting tzdata>=2022.7\n",
|
417 |
+
" Using cached tzdata-2023.4-py2.py3-none-any.whl (346 kB)\n",
|
418 |
+
"Requirement already satisfied: six>=1.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
|
419 |
+
"Installing collected packages: pytz, xxhash, tzdata, tqdm, pyarrow-hotfix, numpy, multidict, fsspec, frozenlist, filelock, dill, async-timeout, yarl, pyarrow, pandas, multiprocess, huggingface-hub, aiosignal, aiohttp, datasets\n",
|
420 |
+
"Successfully installed aiohttp-3.9.1 aiosignal-1.3.1 async-timeout-4.0.3 datasets-2.16.1 dill-0.3.7 filelock-3.13.1 frozenlist-1.4.1 fsspec-2023.10.0 huggingface-hub-0.20.3 multidict-6.0.4 multiprocess-0.70.15 numpy-1.26.3 pandas-2.2.0 pyarrow-15.0.0 pyarrow-hotfix-0.6 pytz-2023.3.post1 tqdm-4.66.1 tzdata-2023.4 xxhash-3.4.1 yarl-1.9.4\n"
|
421 |
+
]
|
422 |
+
},
|
423 |
+
{
|
424 |
+
"name": "stderr",
|
425 |
+
"output_type": "stream",
|
426 |
+
"text": [
|
427 |
+
"\n",
|
428 |
+
"[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
|
429 |
+
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
430 |
+
]
|
431 |
+
}
|
432 |
+
],
|
433 |
+
"source": [
|
434 |
+
"!pip install datasets"
|
435 |
+
]
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"cell_type": "code",
|
439 |
+
"execution_count": 6,
|
440 |
+
"id": "bd4b0343-d24f-4ca0-9fcb-94307eaaf732",
|
441 |
+
"metadata": {},
|
442 |
+
"outputs": [
|
443 |
+
{
|
444 |
+
"name": "stdout",
|
445 |
+
"output_type": "stream",
|
446 |
+
"text": [
|
447 |
+
"Collecting ipywidgetsNote: you may need to restart the kernel to use updated packages.\n",
|
448 |
+
"\n"
|
449 |
+
]
|
450 |
+
},
|
451 |
+
{
|
452 |
+
"name": "stderr",
|
453 |
+
"output_type": "stream",
|
454 |
+
"text": [
|
455 |
+
"\n",
|
456 |
+
"[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
|
457 |
+
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
458 |
+
]
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"name": "stdout",
|
462 |
+
"output_type": "stream",
|
463 |
+
"text": [
|
464 |
+
" Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB)\n",
|
465 |
+
" ---------------------------------------- 0.0/139.4 kB ? eta -:--:--\n",
|
466 |
+
" -- ------------------------------------- 10.2/139.4 kB ? eta -:--:--\n",
|
467 |
+
" -------- ---------------------------- 30.7/139.4 kB 445.2 kB/s eta 0:00:01\n",
|
468 |
+
" -------- ---------------------------- 30.7/139.4 kB 445.2 kB/s eta 0:00:01\n",
|
469 |
+
" ---------------- -------------------- 61.4/139.4 kB 299.4 kB/s eta 0:00:01\n",
|
470 |
+
" ----------------------------- ------ 112.6/139.4 kB 469.7 kB/s eta 0:00:01\n",
|
471 |
+
" ----------------------------- ------ 112.6/139.4 kB 469.7 kB/s eta 0:00:01\n",
|
472 |
+
" ------------------------------------ 139.4/139.4 kB 414.0 kB/s eta 0:00:00\n",
|
473 |
+
"Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (5.14.1)\n",
|
474 |
+
"Requirement already satisfied: comm>=0.1.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (0.2.1)\n",
|
475 |
+
"Collecting jupyterlab-widgets~=3.0.9\n",
|
476 |
+
" Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl (214 kB)\n",
|
477 |
+
" ---------------------------------------- 0.0/214.9 kB ? eta -:--:--\n",
|
478 |
+
" ----------- --------------------------- 61.4/214.9 kB 1.7 MB/s eta 0:00:01\n",
|
479 |
+
" ------------------- ------------------ 112.6/214.9 kB 2.2 MB/s eta 0:00:01\n",
|
480 |
+
" ------------------- ------------------ 112.6/214.9 kB 2.2 MB/s eta 0:00:01\n",
|
481 |
+
" ----------------------------- ------ 174.1/214.9 kB 958.1 kB/s eta 0:00:01\n",
|
482 |
+
" -------------------------------------- 214.9/214.9 kB 1.0 MB/s eta 0:00:00\n",
|
483 |
+
"Requirement already satisfied: ipython>=6.1.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipywidgets) (8.20.0)\n",
|
484 |
+
"Collecting widgetsnbextension~=4.0.9\n",
|
485 |
+
" Downloading widgetsnbextension-4.0.9-py3-none-any.whl (2.3 MB)\n",
|
486 |
+
" ---------------------------------------- 0.0/2.3 MB ? eta -:--:--\n",
|
487 |
+
" - -------------------------------------- 0.1/2.3 MB 3.2 MB/s eta 0:00:01\n",
|
488 |
+
" ---- ----------------------------------- 0.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
|
489 |
+
" ---- ----------------------------------- 0.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
|
490 |
+
" ----- ---------------------------------- 0.3/2.3 MB 1.9 MB/s eta 0:00:02\n",
|
491 |
+
" -------- ------------------------------- 0.5/2.3 MB 2.0 MB/s eta 0:00:01\n",
|
492 |
+
" ------------ --------------------------- 0.7/2.3 MB 2.7 MB/s eta 0:00:01\n",
|
493 |
+
" --------------- ------------------------ 0.9/2.3 MB 2.8 MB/s eta 0:00:01\n",
|
494 |
+
" ------------------- -------------------- 1.1/2.3 MB 3.0 MB/s eta 0:00:01\n",
|
495 |
+
" ---------------------- ----------------- 1.3/2.3 MB 3.1 MB/s eta 0:00:01\n",
|
496 |
+
" -------------------------- ------------- 1.5/2.3 MB 3.3 MB/s eta 0:00:01\n",
|
497 |
+
" ------------------------------ --------- 1.7/2.3 MB 3.5 MB/s eta 0:00:01\n",
|
498 |
+
" -------------------------------- ------- 1.8/2.3 MB 3.3 MB/s eta 0:00:01\n",
|
499 |
+
" ----------------------------------- ---- 2.1/2.3 MB 3.4 MB/s eta 0:00:01\n",
|
500 |
+
" --------------------------------------- 2.2/2.3 MB 3.5 MB/s eta 0:00:01\n",
|
501 |
+
" ---------------------------------------- 2.3/2.3 MB 3.4 MB/s eta 0:00:00\n",
|
502 |
+
"Requirement already satisfied: pygments>=2.4.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (2.17.2)\n",
|
503 |
+
"Requirement already satisfied: stack-data in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
|
504 |
+
"Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.43)\n",
|
505 |
+
"Requirement already satisfied: matplotlib-inline in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.1.6)\n",
|
506 |
+
"Requirement already satisfied: jedi>=0.16 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)\n",
|
507 |
+
"Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
|
508 |
+
"Requirement already satisfied: decorator in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n",
|
509 |
+
"Requirement already satisfied: exceptiongroup in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (1.2.0)\n",
|
510 |
+
"Requirement already satisfied: parso<0.9.0,>=0.8.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n",
|
511 |
+
"Requirement already satisfied: wcwidth in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
|
512 |
+
"Requirement already satisfied: pure-eval in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n",
|
513 |
+
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)\n",
|
514 |
+
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.1)\n",
|
515 |
+
"Requirement already satisfied: six>=1.12.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)\n",
|
516 |
+
"Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets\n",
|
517 |
+
"Successfully installed ipywidgets-8.1.1 jupyterlab-widgets-3.0.9 widgetsnbextension-4.0.9\n"
|
518 |
+
]
|
519 |
+
}
|
520 |
+
],
|
521 |
+
"source": [
|
522 |
+
"pip install ipywidgets"
|
523 |
+
]
|
524 |
+
},
|
525 |
+
{
|
526 |
+
"cell_type": "code",
|
527 |
+
"execution_count": 8,
|
528 |
+
"id": "9995b754-58c9-4c1d-8e8f-9e55741186c5",
|
529 |
+
"metadata": {},
|
530 |
+
"outputs": [],
|
531 |
+
"source": [
|
532 |
+
"import datasets"
|
533 |
+
]
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"cell_type": "code",
|
537 |
+
"execution_count": 5,
|
538 |
+
"id": "87a513e0-f880-4219-9ffb-9a8c40707a69",
|
539 |
+
"metadata": {},
|
540 |
+
"outputs": [
|
541 |
+
{
|
542 |
+
"data": {
|
543 |
+
"text/plain": [
|
544 |
+
"100641"
|
545 |
+
]
|
546 |
+
},
|
547 |
+
"execution_count": 5,
|
548 |
+
"metadata": {},
|
549 |
+
"output_type": "execute_result"
|
550 |
+
}
|
551 |
+
],
|
552 |
+
"source": [
|
553 |
+
"all_ds = datasets.list_datasets()\n",
|
554 |
+
"len(all_ds)"
|
555 |
+
]
|
556 |
+
},
|
557 |
+
{
|
558 |
+
"cell_type": "code",
|
559 |
+
"execution_count": 9,
|
560 |
+
"id": "f20dcb4d-bd17-4bf4-839f-3013b4f40c84",
|
561 |
+
"metadata": {},
|
562 |
+
"outputs": [
|
563 |
+
{
|
564 |
+
"data": {
|
565 |
+
"text/plain": [
|
566 |
+
"['acronym_identification',\n",
|
567 |
+
" 'ade_corpus_v2',\n",
|
568 |
+
" 'adversarial_qa',\n",
|
569 |
+
" 'aeslc',\n",
|
570 |
+
" 'afrikaans_ner_corpus']"
|
571 |
+
]
|
572 |
+
},
|
573 |
+
"execution_count": 9,
|
574 |
+
"metadata": {},
|
575 |
+
"output_type": "execute_result"
|
576 |
+
}
|
577 |
+
],
|
578 |
+
"source": [
|
579 |
+
"all_ds[:5]"
|
580 |
+
]
|
581 |
+
},
|
582 |
+
{
|
583 |
+
"cell_type": "code",
|
584 |
+
"execution_count": 11,
|
585 |
+
"id": "63e676b3-0ce3-462a-b9e3-cc37c2f33703",
|
586 |
+
"metadata": {},
|
587 |
+
"outputs": [
|
588 |
+
{
|
589 |
+
"name": "stderr",
|
590 |
+
"output_type": "stream",
|
591 |
+
"text": [
|
592 |
+
"C:\\Users\\Dell\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for oscar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/oscar\n",
|
593 |
+
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
|
594 |
+
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
|
595 |
+
" warnings.warn(\n",
|
596 |
+
"Downloading builder script: 100%|█████████████████████████████████████████████████| 14.8k/14.8k [00:00<00:00, 14.8MB/s]\n",
|
597 |
+
"Downloading readme: 100%|████████████████████████████████████████████████████████████| 303k/303k [00:00<00:00, 412kB/s]\n",
|
598 |
+
"Downloading data: 100%|█████████████████████████████████████████████████████████████████████| 81.0/81.0 [00:00<?, ?B/s]\n",
|
599 |
+
"Downloading data: 100%|█████████████████████████████████████████████████████████████| 176M/176M [00:54<00:00, 3.21MB/s]\n",
|
600 |
+
"Generating train split: 100%|█████████████████████████████████████████| 120684/120684 [00:15<00:00, 7824.51 examples/s]\n"
|
601 |
+
]
|
602 |
+
}
|
603 |
+
],
|
604 |
+
"source": [
|
605 |
+
"dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_si')"
|
606 |
+
]
|
607 |
+
},
|
608 |
+
{
|
609 |
+
"cell_type": "code",
|
610 |
+
"execution_count": 12,
|
611 |
+
"id": "b6593b70-5caa-4da2-92bb-594e67bd964c",
|
612 |
+
"metadata": {},
|
613 |
+
"outputs": [
|
614 |
+
{
|
615 |
+
"data": {
|
616 |
+
"text/plain": [
|
617 |
+
"DatasetDict({\n",
|
618 |
+
" train: Dataset({\n",
|
619 |
+
" features: ['id', 'text'],\n",
|
620 |
+
" num_rows: 120684\n",
|
621 |
+
" })\n",
|
622 |
+
"})"
|
623 |
+
]
|
624 |
+
},
|
625 |
+
"execution_count": 12,
|
626 |
+
"metadata": {},
|
627 |
+
"output_type": "execute_result"
|
628 |
+
}
|
629 |
+
],
|
630 |
+
"source": [
|
631 |
+
"dataset"
|
632 |
+
]
|
633 |
+
},
|
634 |
+
{
|
635 |
+
"cell_type": "code",
|
636 |
+
"execution_count": 13,
|
637 |
+
"id": "d80f383a-3928-48cb-9218-50b2e6a5577b",
|
638 |
+
"metadata": {},
|
639 |
+
"outputs": [
|
640 |
+
{
|
641 |
+
"data": {
|
642 |
+
"text/plain": [
|
643 |
+
"{'id': 0,\n",
|
644 |
+
" 'text': 'ජනාධිපතිවරයා දුෂණයට ඔතරම් විරුද්ධ නම් ශ්\\u200dරී ලංකා ටෙලිකොම් – මොබිටෙල් සඳහා ජනාධිපති විමර්ශන කොමිසමක් පත්කළ යුතු බව එම ආයතනයේ සේවකයින් සඳහන් කරයි.\\nඒ මේ වන විට එම ආයතනය බරපතල සැක සහිත ගනුදෙනු වලින් පිරීගත් අන්ත දුෂිත මෙන්ම ඉතිහාසයේ කිසිදා නොවූවිරූ පාඩුවක් ලබන තැනට පත්ව ඇති නිසාය.\\nවාර්තා වන තොරතුරු අතුරින් ශ්\\u200dරී ලංකා ටෙලිකොම් – මොබිටෙල් හී සිදුව තිබෙන එවැනි අතිශය සැකසහිත ගනුදෙනුවක් වන්නේ Mobitel – stage 8 (upgrade) ටෙන්ඩරය Huawei සමාගමට සහ ZTE සමාගමට ලබාදී ඇති ආකාරයයි.\\nHuawei සමාගමට ලබාදී ඇති ටෙන්ඩරයේ වටිනාකම ඇමරිකානු ඩොලර් මිලියන 79 ක් වන අතර ZTE සමාගමට ලබාදී ඇති ටෙන්ඩරයේ වටිනාකම ඇමරිකානු ඩොලර් මිලියන 6 කි.\\nමෙහි ඇති ගැටළුව වන්නේ ඇමරිකානු ඩොලර් මිලියන 85ක් තරම් ඇති විශාල වටිනාකමකින් යුතු අදාළ ගනුදෙනුව සිදුකිරීමේදී කිසිඳු ටෙන්ඩර් පටිපාටියක් අනුගමනය කර නොතිබීමය. මෙවැනි අධික වැයක් දරන්නට සිදුවන ගනුදෙනුවකදී ඒ සඳහා කැබිනට් අනුමැතිය මෙන්ම විධිමත් ප්\\u200dරසම්පාදන ක්\\u200dරියාවලියක් තුළ සිදුකිරීම නෛතික අවශ්\\u200dයතාවක් වුවත් ඒ ගැන කිසිඳු තැකීමක් ටෙලිකොම්- මොබිටෙල් පරිපාලනය සිදුකර නොමැත.\\nසේවක පිරිස් සඳහන් කරන ආකාරයට මෙම ගනුදෙනුව මුළුමනින්ම පසුගිය වසරේ සිදුකොට ඇත්තේ ටෙලිකොම් සභාපති කුමාරසිංහ සිරිසේනගේ මැදිහත්වීම මතය.\\nඑසේම මෙම ගනුදෙනුව තවදුරටත් සැක උපදවන කාරණයක් වන්නේ පසුගිය සිංහල – හින්දු අවුරුදු සමයේ එනම් අප්\\u200dරේල් 16 සිට 22 දක්වා පී.ජී කුමාරසිංහගේ සමස්ථ පවුලටම චීන සංචාරයක් සඳහා Huawei සමාගම අනුග්\\u200dරහය දක්වා ඇති බවද සේවක පිරිස් එල්ල කරන චෝදනාවය.\\nඑපමණක් නොව මේ වන විට ශ්\\u200dරී ලංකා ටෙලිකොම් හී IT අංශය නංවාලීම සඳහා කියමින් Virtusa සමාගමට ලබාදීමේ සුදානමක් පවතින බවද වාර්තා වෙයි. ඒ පිළිබඳ සේවක පිරිස් අනාවරණය කරමින් සඳහන් කරන්නේ IT අංශයේ සමස්ථ ක්\\u200dරියාවලිය වසරකට රුපියල් බිලියන 1ක් ගෙවා අදාළ සමගම හරහා සිදුකර ගැනීමේ අපේක්ෂාවෙන් මෙම කටයුත්ත දියත්කර ඇති බවය.\\nඑහිදීද කිසිඳු ටෙන්ඩර් කැඳවීමක් හෝ විධිමත් ප්\\u200dරසම්පාදන ක්\\u200dරියාවලියක් සිදුව නොමැති බවද ඔවුන් සඳහන් කරයි. 
එපමණක් නොව ශ්\\u200dරී ලංකා ටෙලිකොම් හී හදවත බඳු IT අංශය පෞද්ගලික සමාගමක් හරහා කරවා ගැනීමට යාම තුළ සමස්ත ආයතන පද්ධතියම අවධානමේ වැටීමක් සිදුවන බවද ඔවුන් පෙන්වා දෙයි.\\nමේ පිළිබඳ තොරතුරු අනාවරණය කරන පාර්ශව සඳහන් කර සිටින්නේ මේ කිසිඳු සැකකටයුතු ගනුදෙනුවක් සම්බන්ධව මාධ්\\u200dය හෝ වගකිවයුතු පාර්ශව කතා නොකරන බවත් ශ්\\u200dරී ලංකා ටෙලිකොම් සභාපති පී.ජී.කුමාරසිංහ , ජනාධිපති මෛත්\\u200dරීපාල සිරිසේනගේ සහෝදරයාවීම ඊට හේතුවී ඇති බව තමන් විශ්වාස කරන බවය.\\nඑසේම වත්මන් සභාපතිවරයා යටතේ ශ්\\u200dරී ලංකා ටෙලිකොම් හී කඩාවැටීම පෙන්වා දෙමින් තොරතුරු මුලාශ්\\u200dර සඳහන් කරන්නේ මේ වන විට රුපියල් බිලියන 35 කට අධික ණයක් සහිතව ආයතනය මුල්\\u200dය අර්බුදයකට ගමන් කර ඇති බවත් ආයතනය පවත්වාගෙන යන්නේ මහජන බැංකුවෙන් ලබාගත් බැංකු අයිරාවකින් බවය.\\nඑහෙත් වත්මන් ආණ්ඩුව බලයට පත්වන විට ශ්\\u200dරී ටෙලිකොම් හී ණය ප්\\u200dරමාණය රුපියල් බිලියන 6.9 ක් පමණක් වූ බවත් ඒ ණයද සමාගම ගෙන ඇත්තේ ආයතනය වර්ධනය කිරීමට අවශ්\\u200dය ව්\\u200dයාපෘති සඳහා බවද වාර්තා වෙයි.\\nඒ අනුව ආයතනයේ ණය 5 ගුණයකින් පමණ වැඩිකිරීම හැර ආයතනයේ යහපත වෙනුවෙන් වත්මන් සභාපතිවරයා සිදුකළ කිසිඳු සේවයක් නොමැ��ි බවද සඳහන් වෙයි. එසේම ආයතනය රුපියල් මිලියන 500ක පමණ බරපතල පාඩුවක් ලබමින් සිටින බව ඒ හේතුවෙන් සේවකයින්ගේ බෝනස් ගෙවීමට පවා නොහැකි තත්ත්වයක් උද්ගතව පවතින බවත් සඳහන් කරන තොරතුරු මුලාශ්\\u200dර වැඩිදුරටත් සඳහන් කර සිටින්නේ මේ ආකාරයට තවත් කාලයක් ගියහොත් ශ්\\u200dරී ලංකා ටෙලිකොම් බංකොලොත්වීමේ අවධානමක්ද පවතින බවය.\\nමේ අතර ශ්\\u200dරී ලංකා ටෙලිකොම් හී විධායක නිලධාරී කිත්ති පෙරේරා මේ වන විටත් පුරප්පාඩුව පවතින ප්\\u200dරධාන විධායක නිලධාරී තනතුර සඳහා යොදා ගැනීමේ උත්සාහයකද සභාපතිවරයා නිරතව සිටින බව වාර්තා වෙයි.\\nටෙලිකොම් සභාපති කුමාරසිංහ මේ සම්බන්ධව “කිත්තිව CEO කරමු. මට ඒකට ඔයාලගේ උදව් ඕන” යනුවෙන් අධ්\\u200dයක්ෂ මණ්ඩලයටද පවසා ඇති බවත් වාර්තා වෙයි. 
අයතා ඉපයීම් සම්බන්ධව දුෂණ චෝදනා ලාභියකු වන කිත්ති පෙරේරා ප්\\u200dරධාන විධායක නිලධාරියා ලෙස පත්වුවහොත් ශ්\\u200dරී ලංකා ටෙලිකොම් පවතින තත්ත්වයද ඊට අහිමි වන බව මෙන්ම මේ වන විටත් ප්\\u200dරධාන විධායක නිලධාරී තනතුර සඳහා ඇබෑර්තු නිවේදනයේ දැක්වෙන සුදුසුකම් ද කිත්ති පෙරේරා සපුරා නොමැති බවද වැඩිදුරටත් වාර්තා වෙයි.\\nඑසේම කිත්ති පෙරේරා ඇතුළු රාජපක්ෂ හිතවාදී නඩයක්ම මේ වන විට සභාපතිවරයා සමඟ මෙම දුෂිත ගනුදෙනු රැසක නිරතව සිටින බවත් එය ජනාධිපතිවරයාගේ දේශපාලන කීර්ති නාමයට බරපතල හානිකර තත්ත්වයක් බවද ඔවුන් සඳහන් කරයි.\\nඒ අනුව තොරතුරු මුලාශ්\\u200dර සඳහන් කර සිටින්නේ මේවා ගැන කෝපනොවී එල්ලවන චෝදනා ඇත්තද නැත්තද යන්න ගැන විමර්ශනය කිරීමේ යුතුකමක් ජනාධිපතිවරයාට ඇති බවය. එසේම එවැනි විධිමත් විමර්ශනයක් සිදුකළ විට ශ්\\u200dරී ලංකා ටෙලිකොම්- මොබිටෙල් හී සිදුව තිබෙන දුෂණ සම්බන්ධවද ජනාධිපති විමර්ශන කොමිසමක් අවශ්\\u200dය බවට කරන යෝජනාවේ සාධාරණත්වයක් ඇති බව පෙනීයනු ඇති බවද ඔවුන් සඳහන් කරයි.\\n← තේරෙයි.., කල් යයි..! වැලිකඩ සමූහ ඝාතනයට සම්බන්ධ පොලිසියේ රංගජීව සහ බන්ධනාගාරයේ එමිල් රන්ජන්ට බඩු බනිස්..! ගෝඩ් ෆාදර්ට කවදා ද?\\nශ්\\u200dරී ලංකා ප්\\u200dරජාතන්ත්\\u200dරවාදී සමාජවාදී ජනරජයේ නිතියානුකුල අග්\\u200dරාමාත්\\u200dය ලෙස ඔබ පිළිගන්නේ? (මත විමසුම) October 28, 2018\\nලංකා ඊ නිව්ස් කර්තෘ සඳරුවන් සේනාධීර අත් අඩංගුවට ගන්න නැතිනම් පිටුවහල් කරන්න – ජනාධිපති ගමරාළ බ්\\u200dරිතාන්\\u200dයයෙන් ඉල්ලයි June 10, 2018\\nරටේ බිහි වූ දූෂිතම සහ භයංකරම රාජ්\\u200dය නිලධාරියා වූ ගෝඨා මැරී ඉපදිලා වගේ කතා කරමින් රට යලි විපත් මගකට ඇද දාන්න හදනවා – ෂැන්ග්\\u200dරිලා දේශණයට මුදල් ඇමැති මංගලගෙන් පිළිතුරක් May 20, 2018'}"
|
645 |
+
]
|
646 |
+
},
|
647 |
+
"execution_count": 13,
|
648 |
+
"metadata": {},
|
649 |
+
"output_type": "execute_result"
|
650 |
+
}
|
651 |
+
],
|
652 |
+
"source": [
|
653 |
+
"dataset['train'][0]"
|
654 |
+
]
|
655 |
+
},
|
656 |
+
{
|
657 |
+
"cell_type": "code",
|
658 |
+
"execution_count": 18,
|
659 |
+
"id": "b6c82283-ce4c-491e-81c1-ba0e512f0704",
|
660 |
+
"metadata": {},
|
661 |
+
"outputs": [
|
662 |
+
{
|
663 |
+
"name": "stderr",
|
664 |
+
"output_type": "stream",
|
665 |
+
"text": [
|
666 |
+
"100%|████████████████████████████████████████████████████████████████████████| 120684/120684 [00:13<00:00, 8667.09it/s]\n"
|
667 |
+
]
|
668 |
+
}
|
669 |
+
],
|
670 |
+
"source": [
|
671 |
+
"from tqdm.auto import tqdm\n",
|
672 |
+
"\n",
|
673 |
+
"text_data = []\n",
|
674 |
+
"file_count = 0\n",
|
675 |
+
"\n",
|
676 |
+
"for sample in tqdm(dataset['train']):\n",
|
677 |
+
" sample = sample['text'].replace('\\n', '')\n",
|
678 |
+
" text_data.append(sample)\n",
|
679 |
+
" if len(text_data) == 10_000:\n",
|
680 |
+
" with open(f'sinhala_{file_count}.txt','w' , encoding='utf-8') as fp:\n",
|
681 |
+
" fp.write('\\n'.join(text_data))\n",
|
682 |
+
" text_data = []\n",
|
683 |
+
" file_count += 1\n",
|
684 |
+
"with open(f'sinhala_{file_count}.txt','w' , encoding='utf-8') as fp:\n",
|
685 |
+
" fp.write('\\n'.join(text_data))"
|
686 |
+
]
|
687 |
+
}
|
688 |
+
],
|
689 |
+
"metadata": {
|
690 |
+
"kernelspec": {
|
691 |
+
"display_name": "Python 3 (ipykernel)",
|
692 |
+
"language": "python",
|
693 |
+
"name": "python3"
|
694 |
+
},
|
695 |
+
"language_info": {
|
696 |
+
"codemirror_mode": {
|
697 |
+
"name": "ipython",
|
698 |
+
"version": 3
|
699 |
+
},
|
700 |
+
"file_extension": ".py",
|
701 |
+
"mimetype": "text/x-python",
|
702 |
+
"name": "python",
|
703 |
+
"nbconvert_exporter": "python",
|
704 |
+
"pygments_lexer": "ipython3",
|
705 |
+
"version": "3.10.11"
|
706 |
+
}
|
707 |
+
},
|
708 |
+
"nbformat": 4,
|
709 |
+
"nbformat_minor": 5
|
710 |
+
}
|
Tokenizer.ipynb
ADDED
@@ -0,0 +1,720 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"id": "585f9800-984f-40fe-9b06-35cd40229d90",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"from pathlib import Path"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 3,
|
16 |
+
"id": "503f9c3c-e348-478f-b743-cff3ce5f4465",
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [
|
19 |
+
{
|
20 |
+
"data": {
|
21 |
+
"text/plain": [
|
22 |
+
"['sinhala_0.txt',\n",
|
23 |
+
" 'sinhala_1.txt',\n",
|
24 |
+
" 'sinhala_10.txt',\n",
|
25 |
+
" 'sinhala_11.txt',\n",
|
26 |
+
" 'sinhala_12.txt']"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
"execution_count": 3,
|
30 |
+
"metadata": {},
|
31 |
+
"output_type": "execute_result"
|
32 |
+
}
|
33 |
+
],
|
34 |
+
"source": [
|
35 |
+
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
|
36 |
+
"\n",
|
37 |
+
"paths[:5]"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "code",
|
42 |
+
"execution_count": 4,
|
43 |
+
"id": "5e9baa0f-6c33-45b8-8487-202627067436",
|
44 |
+
"metadata": {},
|
45 |
+
"outputs": [
|
46 |
+
{
|
47 |
+
"name": "stdout",
|
48 |
+
"output_type": "stream",
|
49 |
+
"text": [
|
50 |
+
"Requirement already satisfied: tokenizers in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (0.15.1)\n",
|
51 |
+
"Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tokenizers) (0.20.3)\n",
|
52 |
+
"Requirement already satisfied: requests in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2.31.0)\n",
|
53 |
+
"Requirement already satisfied: filelock in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
|
54 |
+
"Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.66.1)\n",
|
55 |
+
"Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.10.0)\n",
|
56 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
|
57 |
+
"Requirement already satisfied: packaging>=20.9 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (23.2)\n",
|
58 |
+
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
|
59 |
+
"Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub<1.0,>=0.16.4->tokenizers) (0.4.6)\n",
|
60 |
+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.6)\n",
|
61 |
+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
|
62 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.3.2)\n",
|
63 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2.1.0)\n"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "stderr",
|
68 |
+
"output_type": "stream",
|
69 |
+
"text": [
|
70 |
+
"\n",
|
71 |
+
"[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
|
72 |
+
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
73 |
+
]
|
74 |
+
}
|
75 |
+
],
|
76 |
+
"source": [
|
77 |
+
"%pip install tokenizers"
|
78 |
+
]
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"cell_type": "code",
|
82 |
+
"execution_count": 5,
|
83 |
+
"id": "225c9c9a-0182-4bfe-92fc-2234e3515560",
|
84 |
+
"metadata": {},
|
85 |
+
"outputs": [],
|
86 |
+
"source": [
|
87 |
+
"from tokenizers import ByteLevelBPETokenizer"
|
88 |
+
]
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"cell_type": "code",
|
92 |
+
"execution_count": 11,
|
93 |
+
"id": "501098a0-5df1-448b-99e6-52143cb6751f",
|
94 |
+
"metadata": {},
|
95 |
+
"outputs": [],
|
96 |
+
"source": [
|
97 |
+
"tokenizer = ByteLevelBPETokenizer()"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": 13,
|
103 |
+
"id": "24ef02fe-5703-4b87-a92a-e6e936f7fd96",
|
104 |
+
"metadata": {},
|
105 |
+
"outputs": [],
|
106 |
+
"source": [
|
107 |
+
"tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,\n",
|
108 |
+
" special_tokens=['<s>', '<pad>', '</s>', '<mask>'\n",
|
109 |
+
" ])"
|
110 |
+
]
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"cell_type": "code",
|
114 |
+
"execution_count": 8,
|
115 |
+
"id": "da5bfba4-6c97-4256-b669-f75026b93e09",
|
116 |
+
"metadata": {},
|
117 |
+
"outputs": [
|
118 |
+
{
|
119 |
+
"ename": "FileExistsError",
|
120 |
+
"evalue": "[WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'",
|
121 |
+
"output_type": "error",
|
122 |
+
"traceback": [
|
123 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
124 |
+
"\u001b[1;31mFileExistsError\u001b[0m Traceback (most recent call last)",
|
125 |
+
"Cell \u001b[1;32mIn[8], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msinhalaMLM\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
126 |
+
"\u001b[1;31mFileExistsError\u001b[0m: [WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'"
|
127 |
+
]
|
128 |
+
}
|
129 |
+
],
|
130 |
+
"source": [
|
131 |
+
"import os\n",
|
132 |
+
"os.makedirs('sinhalaMLM', exist_ok=True)"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": null,
|
138 |
+
"id": "e25beeaa-e969-4c33-98e2-65523d827d95",
|
139 |
+
"metadata": {},
|
140 |
+
"outputs": [],
|
141 |
+
"source": [
|
142 |
+
"tokenizer.save_model('sinhalaMLM')"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "code",
|
147 |
+
"execution_count": null,
|
148 |
+
"id": "222a9edf-15ed-44a8-aaba-2afee76b3cbf",
|
149 |
+
"metadata": {},
|
150 |
+
"outputs": [],
|
151 |
+
"source": [
|
152 |
+
"%pip install transformers"
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"cell_type": "code",
|
157 |
+
"execution_count": null,
|
158 |
+
"id": "864266b5-77d5-451e-9c01-096588ff62e4",
|
159 |
+
"metadata": {},
|
160 |
+
"outputs": [],
|
161 |
+
"source": [
|
162 |
+
"%pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": 15,
|
168 |
+
"id": "67d2fc2e-9cb0-4306-9769-0c34a2111c37",
|
169 |
+
"metadata": {},
|
170 |
+
"outputs": [],
|
171 |
+
"source": [
|
172 |
+
"from transformers import RobertaTokenizerFast"
|
173 |
+
]
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": 16,
|
178 |
+
"id": "474f1e8c-e499-4205-96cc-44f4a4c9e4e3",
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [],
|
181 |
+
"source": [
|
182 |
+
"tokenizer = RobertaTokenizerFast.from_pretrained('sinhalaMLM')"
|
183 |
+
]
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"cell_type": "code",
|
187 |
+
"execution_count": 17,
|
188 |
+
"id": "3ea07602-9cd6-4c12-a860-9302e4db7607",
|
189 |
+
"metadata": {},
|
190 |
+
"outputs": [
|
191 |
+
{
|
192 |
+
"data": {
|
193 |
+
"text/plain": [
|
194 |
+
"{'input_ids': [0, 4689, 267, 300, 275, 469, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
|
195 |
+
]
|
196 |
+
},
|
197 |
+
"execution_count": 17,
|
198 |
+
"metadata": {},
|
199 |
+
"output_type": "execute_result"
|
200 |
+
}
|
201 |
+
],
|
202 |
+
"source": [
|
203 |
+
"tokenizer('ළමයා ගෙදර')"
|
204 |
+
]
|
205 |
+
},
|
206 |
+
{
|
207 |
+
"cell_type": "code",
|
208 |
+
"execution_count": null,
|
209 |
+
"id": "0a9a1acf-fd7d-4d4b-9222-f2f25a39efff",
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
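213 |
+
"# labels == input_ids (the unmasked token ids are the prediction targets)\n",
|
214 |
+
"\n",
|
215 |
+
"# input_ids -> MLM (the model input is a masked copy of those ids)"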
|
216 |
+
]
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "code",
|
220 |
+
"execution_count": 18,
|
221 |
+
"id": "90cec429-ee77-4c34-814b-3fec3f5b035e",
|
222 |
+
"metadata": {},
|
223 |
+
"outputs": [],
|
224 |
+
"source": [
|
225 |
+
"import torch\n",
|
226 |
+
"def mlm(tensor):\n",
|
227 |
+
"    \"\"\"Overwrite roughly 15% of the non-special token ids in `tensor` with the mask id, in place, and return it.\"\"\"\n",
|
228 |
+
"    rand = torch.rand(tensor.shape)\n",
|
229 |
+
"    # ids 0-2 are never masked; presumably '<s>', '<pad>', '</s>' given the special_tokens order used for training\n",
|
230 |
+
"    mask_arr = (rand < 0.15) & (tensor > 2)\n",
|
231 |
+
"    tensor[mask_arr] = 3  # 3 should be the '<mask>' id; verify against tokenizer.mask_token_id\n",
|
232 |
+
"    return tensor"
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"cell_type": "code",
|
237 |
+
"execution_count": 19,
|
238 |
+
"id": "2c22dc9d-cf4f-4cf0-a7b4-364464c525d7",
|
239 |
+
"metadata": {},
|
240 |
+
"outputs": [
|
241 |
+
{
|
242 |
+
"data": {
|
243 |
+
"text/plain": [
|
244 |
+
"['sinhala_0.txt',\n",
|
245 |
+
" 'sinhala_1.txt',\n",
|
246 |
+
" 'sinhala_10.txt',\n",
|
247 |
+
" 'sinhala_11.txt',\n",
|
248 |
+
" 'sinhala_12.txt']"
|
249 |
+
]
|
250 |
+
},
|
251 |
+
"execution_count": 19,
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "execute_result"
|
254 |
+
}
|
255 |
+
],
|
256 |
+
"source": [
|
257 |
+
"from pathlib import Path\n",
|
258 |
+
"\n",
|
259 |
+
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
|
260 |
+
"paths[:5]"
|
261 |
+
]
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"cell_type": "code",
|
265 |
+
"execution_count": 20,
|
266 |
+
"id": "d19907e4-e54d-4596-a8a8-099e6965bcef",
|
267 |
+
"metadata": {},
|
268 |
+
"outputs": [
|
269 |
+
{
|
270 |
+
"data": {
|
271 |
+
"application/vnd.jupyter.widget-view+json": {
|
272 |
+
"model_id": "9bb6ce3b55a644b4805cca4d442ada93",
|
273 |
+
"version_major": 2,
|
274 |
+
"version_minor": 0
|
275 |
+
},
|
276 |
+
"text/plain": [
|
277 |
+
" 0%| | 0/13 [00:00<?, ?it/s]"
|
278 |
+
]
|
279 |
+
},
|
280 |
+
"metadata": {},
|
281 |
+
"output_type": "display_data"
|
282 |
+
}
|
283 |
+
],
|
284 |
+
"source": [
|
285 |
+
"from tqdm.auto import tqdm\n",
|
286 |
+
"\n",
|
287 |
+
"input_ids = []\n",
|
288 |
+
"mask = []\n",
|
289 |
+
"labels = []\n",
|
290 |
+
"\n",
|
291 |
+
"for path in tqdm(paths[:50]):\n",
|
292 |
+
"    with open(path, 'r', encoding='utf-8') as f:\n",
|
293 |
+
"        lines = f.read().split('\\n')\n",
|
294 |
+
"    sample = tokenizer(lines, max_length=512, padding='max_length',\n",
|
295 |
+
"                       truncation=True, return_tensors='pt')\n",
|
296 |
+
"    labels.append(sample.input_ids)\n",
|
297 |
+
"    mask.append(sample.attention_mask)\n",
|
298 |
+
"    input_ids.append(mlm(sample.input_ids.detach().clone()))"
|
299 |
+
]
|
300 |
+
},
|
301 |
+
{
|
302 |
+
"cell_type": "code",
|
303 |
+
"execution_count": 96,
|
304 |
+
"id": "ef40f10e-ca0b-416a-8481-6ff6e343372e",
|
305 |
+
"metadata": {},
|
306 |
+
"outputs": [],
|
307 |
+
"source": [
|
308 |
+
"input_ids = torch.cat(input_ids)\n",
|
309 |
+
"mask = torch.cat(mask)\n",
|
310 |
+
"labels = torch.cat(labels)"
|
311 |
+
]
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"cell_type": "code",
|
315 |
+
"execution_count": 89,
|
316 |
+
"id": "3b173e1a-96d0-40e6-9883-4f91772c47a1",
|
317 |
+
"metadata": {},
|
318 |
+
"outputs": [
|
319 |
+
{
|
320 |
+
"data": {
|
321 |
+
"text/plain": [
|
322 |
+
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293,\n",
|
323 |
+
" 271, 1920, 20312, 263, 281, 264, 269, 271, 282, 263])"
|
324 |
+
]
|
325 |
+
},
|
326 |
+
"execution_count": 89,
|
327 |
+
"metadata": {},
|
328 |
+
"output_type": "execute_result"
|
329 |
+
}
|
330 |
+
],
|
331 |
+
"source": [
|
332 |
+
"input_ids[0][:20]"
|
333 |
+
]
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"cell_type": "code",
|
337 |
+
"execution_count": 90,
|
338 |
+
"id": "3915fa7d-ae56-4d86-b10d-4431a4739c0e",
|
339 |
+
"metadata": {},
|
340 |
+
"outputs": [
|
341 |
+
{
|
342 |
+
"data": {
|
343 |
+
"text/plain": [
|
344 |
+
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293])"
|
345 |
+
]
|
346 |
+
},
|
347 |
+
"execution_count": 90,
|
348 |
+
"metadata": {},
|
349 |
+
"output_type": "execute_result"
|
350 |
+
}
|
351 |
+
],
|
352 |
+
"source": [
|
353 |
+
"labels[0][:10]"
|
354 |
+
]
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"cell_type": "code",
|
358 |
+
"execution_count": 91,
|
359 |
+
"id": "9202a316-001b-4602-9b8f-9ea93aa5083d",
|
360 |
+
"metadata": {},
|
361 |
+
"outputs": [],
|
362 |
+
"source": [
|
363 |
+
"encodings = {\n",
|
364 |
+
" 'input_ids': input_ids,\n",
|
365 |
+
" 'attention_mask': mask,\n",
|
366 |
+
" 'labels': labels\n",
|
367 |
+
"}"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": 116,
|
373 |
+
"id": "1c5a01c5-14e2-4bdb-8bfa-12becbe02de6",
|
374 |
+
"metadata": {},
|
375 |
+
"outputs": [],
|
376 |
+
"source": [
|
377 |
+
"class Dataset(torch.utils.data.Dataset):\n",
|
378 |
+
"    \"\"\"Serve one example per index from a dict of tensors keyed by model-input name.\"\"\"\n",
|
379 |
+
"    def __init__(self, encodings):\n",
|
380 |
+
"        self.encodings = encodings  # dict with an 'input_ids' tensor; every value is indexed by example\n",
|
381 |
+
"    def __len__(self):\n",
|
382 |
+
"        # number of examples = size of the first dimension of the input_ids tensor\n",
|
383 |
+
"        return self.encodings['input_ids'].shape[0]\n",
|
384 |
+
"    def __getitem__(self, i):\n",
|
385 |
+
"        return {key: tensor[i] for key, tensor in self.encodings.items()}"
|
386 |
+
]
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"cell_type": "code",
|
390 |
+
"execution_count": 111,
|
391 |
+
"id": "e113560d-b382-49d1-afc0-9ad03da9b212",
|
392 |
+
"metadata": {},
|
393 |
+
"outputs": [
|
394 |
+
{
|
395 |
+
"data": {
|
396 |
+
"text/plain": [
|
397 |
+
"120684"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
"execution_count": 111,
|
401 |
+
"metadata": {},
|
402 |
+
"output_type": "execute_result"
|
403 |
+
}
|
404 |
+
],
|
405 |
+
"source": [
|
406 |
+
"input_ids.shape[0]"
|
407 |
+
]
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"cell_type": "code",
|
411 |
+
"execution_count": 112,
|
412 |
+
"id": "ebc01aa1-6a6e-4e20-b88a-9d91deca0c0a",
|
413 |
+
"metadata": {},
|
414 |
+
"outputs": [
|
415 |
+
{
|
416 |
+
"data": {
|
417 |
+
"text/plain": [
|
418 |
+
"dict_items([('input_ids', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
|
419 |
+
" [ 0, 277, 560, ..., 1, 1, 1],\n",
|
420 |
+
" [ 0, 278, 264, ..., 296, 269, 2],\n",
|
421 |
+
" ...,\n",
|
422 |
+
" [ 0, 272, 276, ..., 1, 1, 1],\n",
|
423 |
+
" [ 0, 292, 296, ..., 620, 271, 2],\n",
|
424 |
+
" [ 0, 307, 295, ..., 269, 281, 2]])), ('attention_mask', tensor([[1, 1, 1, ..., 1, 1, 1],\n",
|
425 |
+
" [1, 1, 1, ..., 0, 0, 0],\n",
|
426 |
+
" [1, 1, 1, ..., 1, 1, 1],\n",
|
427 |
+
" ...,\n",
|
428 |
+
" [1, 1, 1, ..., 0, 0, 0],\n",
|
429 |
+
" [1, 1, 1, ..., 1, 1, 1],\n",
|
430 |
+
" [1, 1, 1, ..., 1, 1, 1]])), ('labels', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
|
431 |
+
" [ 0, 277, 560, ..., 1, 1, 1],\n",
|
432 |
+
" [ 0, 278, 264, ..., 296, 269, 2],\n",
|
433 |
+
" ...,\n",
|
434 |
+
" [ 0, 272, 276, ..., 1, 1, 1],\n",
|
435 |
+
" [ 0, 292, 296, ..., 620, 271, 2],\n",
|
436 |
+
" [ 0, 307, 295, ..., 269, 281, 2]]))])"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
"execution_count": 112,
|
440 |
+
"metadata": {},
|
441 |
+
"output_type": "execute_result"
|
442 |
+
}
|
443 |
+
],
|
444 |
+
"source": [
|
445 |
+
"encodings.items()"
|
446 |
+
]
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"cell_type": "code",
|
450 |
+
"execution_count": 117,
|
451 |
+
"id": "9a329bc6-2286-4d9a-89f4-041ba2f78951",
|
452 |
+
"metadata": {},
|
453 |
+
"outputs": [],
|
454 |
+
"source": [
|
455 |
+
"dataset = Dataset(encodings)"
|
456 |
+
]
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"cell_type": "code",
|
460 |
+
"execution_count": 118,
|
461 |
+
"id": "1ad3e3f5-bb45-4d51-8ccb-62ab73a1788f",
|
462 |
+
"metadata": {},
|
463 |
+
"outputs": [],
|
464 |
+
"source": [
|
465 |
+
"dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)"
|
466 |
+
]
|
467 |
+
},
|
468 |
+
{
|
469 |
+
"cell_type": "code",
|
470 |
+
"execution_count": 120,
|
471 |
+
"id": "900571cd-5061-41f8-b361-e7889ba534cf",
|
472 |
+
"metadata": {},
|
473 |
+
"outputs": [],
|
474 |
+
"source": [
|
475 |
+
"from transformers import RobertaConfig"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "code",
|
480 |
+
"execution_count": 121,
|
481 |
+
"id": "d35e11be-3ee6-4e7c-b49c-6900434553cf",
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [
|
484 |
+
{
|
485 |
+
"data": {
|
486 |
+
"text/plain": [
|
487 |
+
"30522"
|
488 |
+
]
|
489 |
+
},
|
490 |
+
"execution_count": 121,
|
491 |
+
"metadata": {},
|
492 |
+
"output_type": "execute_result"
|
493 |
+
}
|
494 |
+
],
|
495 |
+
"source": [
|
496 |
+
"tokenizer.vocab_size\n"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "code",
|
501 |
+
"execution_count": 137,
|
502 |
+
"id": "69d081bc-a4a5-4970-a361-05b426d5afa2",
|
503 |
+
"metadata": {},
|
504 |
+
"outputs": [],
|
505 |
+
"source": [
|
506 |
+
"config = RobertaConfig(\n",
|
507 |
+
" vocab_size = tokenizer.vocab_size,\n",
|
508 |
+
" max_position_embeddings=514,\n",
|
509 |
+
" hidden_size=768,\n",
|
510 |
+
" num_attention_heads=12,\n",
|
511 |
+
" num_hidden_layers=6,\n",
|
512 |
+
" type_vocab_size=1\n",
|
513 |
+
")"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"cell_type": "code",
|
518 |
+
"execution_count": 138,
|
519 |
+
"id": "4386905e-2647-4082-aa50-e884cded82b4",
|
520 |
+
"metadata": {},
|
521 |
+
"outputs": [],
|
522 |
+
"source": [
|
523 |
+
"from transformers import RobertaForMaskedLM"
|
524 |
+
]
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"cell_type": "code",
|
528 |
+
"execution_count": 139,
|
529 |
+
"id": "8a6fec9d-31da-4657-906d-a5b356a32328",
|
530 |
+
"metadata": {},
|
531 |
+
"outputs": [],
|
532 |
+
"source": [
|
533 |
+
"model = RobertaForMaskedLM(config)"
|
534 |
+
]
|
535 |
+
},
|
536 |
+
{
|
537 |
+
"cell_type": "code",
|
538 |
+
"execution_count": 140,
|
539 |
+
"id": "3ae60e59-6351-4cee-a780-8d753ba55c0a",
|
540 |
+
"metadata": {},
|
541 |
+
"outputs": [],
|
542 |
+
"source": [
|
543 |
+
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device ('cpu')"
|
544 |
+
]
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"cell_type": "code",
|
548 |
+
"execution_count": 141,
|
549 |
+
"id": "3558fcc7-64c5-46f3-be9a-3199aae9f59b",
|
550 |
+
"metadata": {},
|
551 |
+
"outputs": [
|
552 |
+
{
|
553 |
+
"data": {
|
554 |
+
"text/plain": [
|
555 |
+
"RobertaForMaskedLM(\n",
|
556 |
+
" (roberta): RobertaModel(\n",
|
557 |
+
" (embeddings): RobertaEmbeddings(\n",
|
558 |
+
" (word_embeddings): Embedding(30522, 768, padding_idx=1)\n",
|
559 |
+
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
|
560 |
+
" (token_type_embeddings): Embedding(1, 768)\n",
|
561 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
562 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
563 |
+
" )\n",
|
564 |
+
" (encoder): RobertaEncoder(\n",
|
565 |
+
" (layer): ModuleList(\n",
|
566 |
+
" (0-5): 6 x RobertaLayer(\n",
|
567 |
+
" (attention): RobertaAttention(\n",
|
568 |
+
" (self): RobertaSelfAttention(\n",
|
569 |
+
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
|
570 |
+
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
|
571 |
+
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
|
572 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
573 |
+
" )\n",
|
574 |
+
" (output): RobertaSelfOutput(\n",
|
575 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
576 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
577 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
578 |
+
" )\n",
|
579 |
+
" )\n",
|
580 |
+
" (intermediate): RobertaIntermediate(\n",
|
581 |
+
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
|
582 |
+
" (intermediate_act_fn): GELUActivation()\n",
|
583 |
+
" )\n",
|
584 |
+
" (output): RobertaOutput(\n",
|
585 |
+
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
|
586 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
587 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
588 |
+
" )\n",
|
589 |
+
" )\n",
|
590 |
+
" )\n",
|
591 |
+
" )\n",
|
592 |
+
" )\n",
|
593 |
+
" (lm_head): RobertaLMHead(\n",
|
594 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
595 |
+
" (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
596 |
+
" (decoder): Linear(in_features=768, out_features=30522, bias=True)\n",
|
597 |
+
" )\n",
|
598 |
+
")"
|
599 |
+
]
|
600 |
+
},
|
601 |
+
"execution_count": 141,
|
602 |
+
"metadata": {},
|
603 |
+
"output_type": "execute_result"
|
604 |
+
}
|
605 |
+
],
|
606 |
+
"source": [
|
607 |
+
"model.to(device)"
|
608 |
+
]
|
609 |
+
},
|
610 |
+
{
|
611 |
+
"cell_type": "code",
|
612 |
+
"execution_count": 142,
|
613 |
+
"id": "c9bfc830-d445-4203-97b9-a67f23f4b7f1",
|
614 |
+
"metadata": {},
|
615 |
+
"outputs": [],
|
616 |
+
"source": [
|
617 |
+
"from torch.optim import AdamW"
|
618 |
+
]
|
619 |
+
},
|
620 |
+
{
|
621 |
+
"cell_type": "code",
|
622 |
+
"execution_count": 150,
|
623 |
+
"id": "651b53ff-dbdb-4665-9ee8-4c15e96e4ba9",
|
624 |
+
"metadata": {},
|
625 |
+
"outputs": [],
|
626 |
+
"source": [
|
627 |
+
"optim = AdamW(model.parameters(), lr=1e-4)"
|
628 |
+
]
|
629 |
+
},
|
630 |
+
{
|
631 |
+
"cell_type": "code",
|
632 |
+
"execution_count": 151,
|
633 |
+
"id": "81d298ed-4bff-443b-97b7-8dd7cff5f2aa",
|
634 |
+
"metadata": {},
|
635 |
+
"outputs": [],
|
636 |
+
"source": [
|
637 |
+
"from tqdm.auto import tqdm"
|
638 |
+
]
|
639 |
+
},
|
640 |
+
{
|
641 |
+
"cell_type": "code",
|
642 |
+
"execution_count": 152,
|
643 |
+
"id": "7ec88289-b38d-4317-8797-b2b2192ff407",
|
644 |
+
"metadata": {},
|
645 |
+
"outputs": [],
|
646 |
+
"source": [
|
647 |
+
"epochs = 1"
|
648 |
+
]
|
649 |
+
},
|
650 |
+
{
|
651 |
+
"cell_type": "code",
|
652 |
+
"execution_count": null,
|
653 |
+
"id": "6fe6ccda-fc31-4785-b9f0-ec70e8bfb09a",
|
654 |
+
"metadata": {},
|
655 |
+
"outputs": [
|
656 |
+
{
|
657 |
+
"data": {
|
658 |
+
"application/vnd.jupyter.widget-view+json": {
|
659 |
+
"model_id": "42dd8d1f843544e28d56e5d8b574c0a6",
|
660 |
+
"version_major": 2,
|
661 |
+
"version_minor": 0
|
662 |
+
},
|
663 |
+
"text/plain": [
|
664 |
+
" 0%| | 0/120684 [00:00<?, ?it/s]"
|
665 |
+
]
|
666 |
+
},
|
667 |
+
"metadata": {},
|
668 |
+
"output_type": "display_data"
|
669 |
+
}
|
670 |
+
],
|
671 |
+
"source": [
|
672 |
+
"for epoch in range(1, epochs + 1):\n",
|
673 |
+
"    loop = tqdm(dataloader, leave=True)\n",
|
674 |
+
"    for batch in loop:\n",
|
675 |
+
"        optim.zero_grad()\n",
|
676 |
+
"        # index the batch inline so the notebook-level input_ids/mask/labels tensors are not overwritten\n",
|
677 |
+
"        outputs = model(batch['input_ids'].to(device),\n",
|
678 |
+
"                        attention_mask=batch['attention_mask'].to(device),\n",
|
679 |
+
"                        labels=batch['labels'].to(device))\n",
|
680 |
+
"        loss = outputs.loss\n",
|
681 |
+
"        loss.backward()\n",
|
682 |
+
"        optim.step()\n",
|
683 |
+
"\n",
|
684 |
+
"        loop.set_description(f'Epoch: {epoch}')\n",
|
685 |
+
"        loop.set_postfix(loss=loss.item())"
|
686 |
+
]
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"cell_type": "code",
|
690 |
+
"execution_count": null,
|
691 |
+
"id": "e9c49ba7-a98d-47d3-8c7f-8807c591ca03",
|
692 |
+
"metadata": {},
|
693 |
+
"outputs": [],
|
694 |
+
"source": [
|
695 |
+
"model.save_pretrained('./sinhalaMLM') # and don't forget to save sinhalaMLM"
|
696 |
+
]
|
697 |
+
}
|
698 |
+
],
|
699 |
+
"metadata": {
|
700 |
+
"kernelspec": {
|
701 |
+
"display_name": "Python 3 (ipykernel)",
|
702 |
+
"language": "python",
|
703 |
+
"name": "python3"
|
704 |
+
},
|
705 |
+
"language_info": {
|
706 |
+
"codemirror_mode": {
|
707 |
+
"name": "ipython",
|
708 |
+
"version": 3
|
709 |
+
},
|
710 |
+
"file_extension": ".py",
|
711 |
+
"mimetype": "text/x-python",
|
712 |
+
"name": "python",
|
713 |
+
"nbconvert_exporter": "python",
|
714 |
+
"pygments_lexer": "ipython3",
|
715 |
+
"version": "3.10.11"
|
716 |
+
}
|
717 |
+
},
|
718 |
+
"nbformat": 4,
|
719 |
+
"nbformat_minor": 5
|
720 |
+
}
|