Upload 17 files
Browse files- checkpoints/best_model.safetensors +3 -0
- checkpoints/checkpoint_step_100.safetensors +3 -0
- checkpoints/checkpoint_step_25.safetensors +3 -0
- checkpoints/checkpoint_step_50.safetensors +3 -0
- checkpoints/final_model.safetensors +3 -0
- config.json +19 -0
- dataset.py +347 -0
- generate.py +330 -0
- model.py +403 -0
- readme.md +303 -0
- requirements.txt +22 -0
- tokenizer.json +0 -0
- tokenizer.pkl +3 -0
- tokenizer.py +396 -0
- train.py +402 -0
- utils.py +359 -0
- vocab.json +2002 -0
checkpoints/best_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bf35017912c87c7681e3c232ffe5a2481c97ec4e166ef55e5a4f7f9e780c5a5
|
| 3 |
+
size 13068032
|
checkpoints/checkpoint_step_100.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a384cf7782c53e58fef5a5d5ba3ffa0c1724fa45e707fe94a9cb413620a99e68
|
| 3 |
+
size 13068032
|
checkpoints/checkpoint_step_25.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff3858e7c4ea74c99ee40ac94096a284f443274bcd5bb3c3c650fa4083a1f723
|
| 3 |
+
size 13068032
|
checkpoints/checkpoint_step_50.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b98d0bded63833efd902dcc514e0472352ae972912c4e763b17ced507c9b405f
|
| 3 |
+
size 13068032
|
checkpoints/final_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:459b6ab5c6fe9c32084c28426145ed187e8b9c50e40fdecdedbdb2b170525672
|
| 3 |
+
size 13068016
|
config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"VicAIModel"
|
| 4 |
+
],
|
| 5 |
+
"vocab_size": 2000,
|
| 6 |
+
"dim": 128,
|
| 7 |
+
"n_layers": 4,
|
| 8 |
+
"n_heads": 4,
|
| 9 |
+
"n_kv_heads": 4,
|
| 10 |
+
"hidden_dim": 256,
|
| 11 |
+
"max_seq_len": 512,
|
| 12 |
+
"tie_weights": false,
|
| 13 |
+
"model_type": "vicai",
|
| 14 |
+
"tokenizer_class": "ByteLevelBPETokenizer",
|
| 15 |
+
"pad_token_id": 1,
|
| 16 |
+
"eos_token_id": 0,
|
| 17 |
+
"unk_token_id": 2,
|
| 18 |
+
"bos_token_id": 3
|
| 19 |
+
}
|
dataset.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Dataset
|
| 3 |
+
Dataset handling for training on Wikipedia and other text sources.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import re
|
| 9 |
+
from typing import Dict, Iterator, List, Optional
|
| 10 |
+
|
| 11 |
+
import requests
|
| 12 |
+
import torch
|
| 13 |
+
from torch.utils.data import Dataset, IterableDataset
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class WikipediaDataset(IterableDataset):
    """Stream random Wikipedia articles and yield fixed-length LM training examples.

    Articles are fetched lazily from the public MediaWiki API, cleaned,
    tokenized, split into ``max_length`` chunks and yielded as
    input/label/attention-mask tensors for next-token prediction.
    """

    def __init__(
        self,
        tokenizer,
        max_length: int = 2048,
        languages: Optional[List[str]] = None,
        min_article_length: int = 100,
    ):
        """
        Args:
            tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
                and ``pad_token_id``.
            max_length: Maximum number of tokens per yielded chunk.
            languages: Wikipedia language codes; defaults to ``['en']``.
                NOTE(review): only English is actually queried — ``base_url``
                is hard-coded to en.wikipedia.org below; confirm before
                relying on this argument.
            min_article_length: Minimum character count for an article to be kept.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        # BUG FIX: the original used a mutable default argument (['en']),
        # which is shared across all instances; build a fresh list instead.
        self.languages = ['en'] if languages is None else languages
        self.min_article_length = min_article_length
        self.base_url = "https://en.wikipedia.org/w/api.php"

    def _fetch_random_article(self) -> Optional[str]:
        """Fetch one random main-namespace article's plaintext extract, or None."""
        try:
            params = {
                'action': 'query',
                'format': 'json',
                'generator': 'random',
                'grnnamespace': 0,
                'grnlimit': 1,
                'prop': 'extracts',
                'explaintext': True,
                'exsentences': 50,
            }
            response = requests.get(self.base_url, params=params, timeout=10)
            data = response.json()

            pages = data['query']['pages']
            for page_id, page_data in pages.items():
                text = page_data.get('extract', '')
                # Skip stubs shorter than the configured threshold.
                if len(text) > self.min_article_length:
                    return text
            return None
        except Exception as e:
            # Best-effort fetch: log and let the caller retry.
            print(f"Error fetching article: {e}")
            return None

    def _fetch_article_by_title(self, title: str) -> Optional[str]:
        """Fetch a specific Wikipedia article's plaintext extract by title, or None."""
        try:
            params = {
                'action': 'query',
                'format': 'json',
                'titles': title,
                'prop': 'extracts',
                'explaintext': True,
                'exlimit': 1,
            }
            response = requests.get(self.base_url, params=params, timeout=10)
            data = response.json()

            pages = data['query']['pages']
            for page_id, page_data in pages.items():
                # The API reports missing pages under the pseudo page id '-1'.
                if page_id != '-1':
                    return page_data.get('extract', '')
            return None
        except Exception as e:
            print(f"Error fetching article: {e}")
            return None

    def _clean_text(self, text: str) -> str:
        """Clean Wikipedia text: drop citation markers, collapse whitespace,
        strip characters outside word chars and basic punctuation."""
        # Remove citation markers
        text = re.sub(r'\[\d+\]', '', text)
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\'\"()-]', ' ', text)
        return text.strip()

    def _tokenize_text(self, text: str) -> List[int]:
        """Clean then tokenize *text* with special tokens; returns token ids."""
        cleaned = self._clean_text(text)
        tokens = self.tokenizer.encode(cleaned, add_special_tokens=True)
        return tokens

    def __iter__(self):
        """Yield training examples indefinitely (this is an infinite stream)."""
        while True:
            text = self._fetch_random_article()
            if text:
                tokens = self._tokenize_text(text)

                # Create chunks of max_length
                for i in range(0, len(tokens), self.max_length):
                    chunk = tokens[i:i + self.max_length]
                    if len(chunk) > 10:  # Minimum chunk size
                        # Pad if necessary
                        if len(chunk) < self.max_length:
                            chunk.extend([self.tokenizer.pad_token_id] * (self.max_length - len(chunk)))

                        # Shift by one for next-token prediction.
                        input_ids = torch.tensor(chunk[:-1])
                        labels = torch.tensor(chunk[1:])

                        yield {
                            'input_ids': input_ids,
                            'labels': labels,
                            'attention_mask': (input_ids != self.tokenizer.pad_token_id).long(),
                        }
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class TextFileDataset(Dataset):
    """Map-style dataset backed by a single local text file.

    The file is tokenized once up front; overlapping windows of
    ``max_length + 1`` tokens (advancing by ``stride``) become shifted
    (input, label) pairs for next-token prediction.
    """

    def __init__(
        self,
        file_path: str,
        tokenizer,
        max_length: int = 2048,
        stride: int = 512,
    ):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        print(f"Loading dataset from {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Single tokenization pass over the whole file.
        self.tokens = tokenizer.encode(raw_text, add_special_tokens=False)

        # Slide a window of max_length + 1 tokens (the extra token supplies
        # the shifted labels); keep only complete windows.
        window = max_length + 1
        self.chunks = [
            self.tokens[start:start + window]
            for start in range(0, len(self.tokens) - max_length, stride)
            if len(self.tokens[start:start + window]) == window
        ]

        print(f"Created {len(self.chunks)} chunks from {len(self.tokens)} tokens")

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        window = self.chunks[idx]
        inputs = torch.tensor(window[:-1])
        targets = torch.tensor(window[1:])

        return {
            'input_ids': inputs,
            'labels': targets,
            'attention_mask': torch.ones_like(inputs),
        }
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class MixedDataset(IterableDataset):
    """Infinite weighted mixture over several iterable data sources.

    Each yielded item is drawn from one source, chosen at random with
    probability proportional to its (normalized) weight; a source that
    runs out is transparently restarted.
    """

    def __init__(
        self,
        datasets: List[IterableDataset],
        weights: Optional[List[float]] = None,
    ):
        self.datasets = datasets
        self.weights = weights or [1.0] * len(datasets)
        assert len(self.datasets) == len(self.weights)

        # Normalize so the weights form a probability distribution.
        weight_sum = sum(self.weights)
        self.weights = [w / weight_sum for w in self.weights]

    def __iter__(self):
        """Yield items forever, picking a source per item by weight."""
        streams = [iter(source) for source in self.datasets]
        source_indices = range(len(self.datasets))

        while True:
            pick = random.choices(source_indices, weights=self.weights)[0]

            try:
                yield next(streams[pick])
            except StopIteration:
                # The chosen source is exhausted: restart it and retry once.
                streams[pick] = iter(self.datasets[pick])
                yield next(streams[pick])
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
class PretokenizedDataset(Dataset):
    """Map-style dataset over pre-tokenized 1-D token tensors stored as ``.pt`` files.

    Each example is a window of ``max_length + 1`` consecutive tokens —
    ``max_length`` inputs plus one extra token for the shifted labels.
    """

    def __init__(self, data_dir: str, max_length: int = 2048):
        """
        Args:
            data_dir: Directory containing ``.pt`` files, each holding a 1-D
                tensor of token ids.
            max_length: Number of input tokens per example.
        """
        self.data_dir = data_dir
        self.max_length = max_length

        # Load all .pt files
        self.files = []
        for fname in os.listdir(data_dir):
            if fname.endswith('.pt'):
                self.files.append(os.path.join(data_dir, fname))

        self.files.sort()
        print(f"Found {len(self.files)} pre-tokenized files")

        # Number of complete (max_length + 1)-token windows per file.
        # BUG FIX: the original used len(data) // max_length, which over-counts
        # when the file length is an exact multiple of max_length — the final
        # window then lacks its label token and yields a short example.
        # (len(data) - 1) // max_length guarantees every window has the extra
        # label token.
        self.lengths = []
        for f in self.files:
            data = torch.load(f, map_location='cpu')
            self.lengths.append(max(0, (len(data) - 1) // max_length))

        self.total_length = sum(self.lengths)

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        # Find which file contains this index
        file_idx = 0
        remaining = idx
        for i, length in enumerate(self.lengths):
            if remaining < length:
                file_idx = i
                break
            remaining -= length

        # Load data. NOTE(review): the file is re-read on every access;
        # consider caching the most recent tensor if this becomes a bottleneck.
        data = torch.load(self.files[file_idx], map_location='cpu')
        start = remaining * self.max_length
        chunk = data[start:start + self.max_length + 1]

        # Shift by one token for next-token prediction.
        input_ids = chunk[:-1]
        labels = chunk[1:]

        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': torch.ones_like(input_ids),
        }
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def download_wikipedia_dump(output_dir: str, language: str = 'en'):
    """Download the latest Wikipedia pages-articles dump for offline processing.

    Args:
        output_dir: Directory to store the dump in (created if missing).
        language: Wikipedia language code (e.g. 'en', 'de').
    """
    os.makedirs(output_dir, exist_ok=True)

    # Wikipedia dump URLs
    base_url = f"https://dumps.wikimedia.org/{language}wiki/latest/"
    files = [
        f"{language}wiki-latest-pages-articles-multistream.xml.bz2",
    ]

    for filename in files:
        url = base_url + filename
        output_path = os.path.join(output_dir, filename)

        if os.path.exists(output_path):
            # BUG FIX: the messages below printed a literal "(unknown)" where
            # the filename belongs; interpolate the actual filename.
            print(f"{filename} already exists")
            continue

        print(f"Downloading {filename}...")
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            # Stream to disk in 8 KiB chunks to avoid loading a multi-GB dump
            # into memory.
            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def create_sample_corpus(output_file: str = "sample_corpus.txt", num_articles: int = 1000):
    """Create a sample corpus by fetching random Wikipedia articles.

    Articles are fetched as raw text (no tokenizer involved) and written to
    ``output_file`` separated by ``<|endoftext|>`` markers.

    Args:
        output_file: Path of the corpus file to write.
        num_articles: Number of fetch attempts (failed fetches are skipped).

    Returns:
        The path of the written corpus file.
    """
    print(f"Creating sample corpus with {num_articles} articles...")

    dataset = WikipediaDataset(
        tokenizer=None,  # Only the raw-text fetch helper is used below.
        max_length=100000,  # Large to get full articles
    )

    articles = []
    # BUG FIX: the original iterated ``dataset`` itself, which tokenizes each
    # article and crashes with tokenizer=None (and fetched a second article
    # per iteration on top of that).  Fetch the raw text directly instead.
    for i in range(num_articles):
        text = dataset._fetch_random_article()
        if text:
            articles.append(text)

        if (i + 1) % 100 == 0:
            print(f"  Fetched {i + 1}/{num_articles} articles")

    # Write to file
    with open(output_file, 'w', encoding='utf-8') as f:
        for article in articles:
            f.write(article + '\n\n<|endoftext|>\n\n')

    print(f"Sample corpus saved to {output_file}")
    return output_file
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def prepare_openwebtext_data(output_dir: str, max_files: int = 100):
    """
    Download and prepare OpenWebText corpus.
    Note: This is a placeholder - actual OpenWebText requires specific download.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Placeholder: point the user at the manual download instructions.
    for line in (
        "OpenWebText data preparation placeholder",
        "Please download OpenWebText from https://github.com/jcpeterson/openwebtext",
        f"and place files in {output_dir}",
    ):
        print(line)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
if __name__ == "__main__":
    # Smoke test: train a tiny BPE tokenizer on three sentences, then stream a
    # few batches from the Wikipedia dataset.
    # NOTE(review): requires the project-local ``tokenizer`` module and live
    # network access to the Wikipedia API.
    # Test dataset
    from tokenizer import BPETokenizer

    # Create sample tokenizer
    sample_texts = [
        "This is a sample text for testing.",
        "Wikipedia contains many interesting articles.",
        "Machine learning models need lots of data.",
    ]
    tokenizer = BPETokenizer(vocab_size=1000)
    tokenizer.train(sample_texts)

    # Test Wikipedia dataset
    print("\nTesting Wikipedia dataset...")
    wiki_dataset = WikipediaDataset(tokenizer, max_length=128)

    # The dataset is an infinite stream; stop after three batches.
    for i, batch in enumerate(wiki_dataset):
        if i >= 3:
            break
        print(f"\nBatch {i + 1}:")
        print(f"  Input shape: {batch['input_ids'].shape}")
        print(f"  Labels shape: {batch['labels'].shape}")
|
generate.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Text Generation
|
| 3 |
+
Interactive text generation and sampling utilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
|
| 11 |
+
from model import VicAIModel, VicAIConfig, create_vicai_5b
|
| 12 |
+
from tokenizer import ByteLevelBPETokenizer, BPETokenizer
|
| 13 |
+
from utils import get_logger
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def generate_interactive(
    model,
    tokenizer,
    device,
    max_new_tokens: int = 256,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
):
    """Interactive text generation loop.

    Reads prompts from stdin, supports ``/``-prefixed commands to adjust the
    sampling settings at runtime, and prints the model's completion for each
    prompt.  Runs until ``/quit``.

    Args:
        model: Model exposing ``generate(input_ids, **sampling_kwargs)``.
        tokenizer: Tokenizer with ``encode``/``decode`` and ``eos_token_id``.
        device: Device on which prompt tensors are created.
        max_new_tokens, temperature, top_k, top_p, repetition_penalty:
            Initial sampling settings (each adjustable via a command).
    """
    print("\n" + "=" * 60)
    print("VicAI Interactive Generation")
    print("=" * 60)
    print("Commands:")
    print(" /quit - Exit the program")
    print(" /config - Show current generation settings")
    print(" /temp X - Set temperature (0.1 - 2.0)")
    print(" /topk X - Set top-k (1 - 100)")
    print(" /topp X - Set top-p (0.0 - 1.0)")
    print(" /reppen X - Set repetition penalty (1.0 - 2.0)")
    print(" /maxlen X - Set max new tokens")
    print("=" * 60 + "\n")

    # Current settings
    settings = {
        'temperature': temperature,
        'top_k': top_k,
        'top_p': top_p,
        'repetition_penalty': repetition_penalty,
        'max_new_tokens': max_new_tokens,
    }

    # Command prefix -> (settings key, value parser, display label).
    # Collapses five copy-pasted parse branches into one path; the success and
    # error messages are derived from the label so the original wording is
    # preserved exactly (error text lowercases only the first letter).
    commands = {
        '/temp ': ('temperature', float, 'Temperature'),
        '/topk ': ('top_k', int, 'Top-k'),
        '/topp ': ('top_p', float, 'Top-p'),
        '/reppen ': ('repetition_penalty', float, 'Repetition penalty'),
        '/maxlen ': ('max_new_tokens', int, 'Max new tokens'),
    }

    while True:
        try:
            # Get prompt
            prompt = input("\nPrompt: ").strip()

            # Handle commands
            if prompt == '/quit':
                print("Goodbye!")
                break

            if prompt == '/config':
                print("\nCurrent settings:")
                for key, value in settings.items():
                    print(f"  {key}: {value}")
                continue

            matched = next((c for c in commands if prompt.startswith(c)), None)
            if matched is not None:
                key, parse, label = commands[matched]
                try:
                    settings[key] = parse(prompt.split()[1])
                    print(f"{label} set to {settings[key]}")
                except (ValueError, IndexError):
                    print(f"Invalid {label[0].lower() + label[1:]} value")
                continue

            if not prompt:
                continue

            # Encode prompt
            input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)

            # Generate
            print("\nGenerating...")
            with torch.no_grad():
                output_ids = model.generate(
                    input_ids,
                    max_new_tokens=settings['max_new_tokens'],
                    temperature=settings['temperature'],
                    top_k=settings['top_k'],
                    top_p=settings['top_p'],
                    repetition_penalty=settings['repetition_penalty'],
                    eos_token_id=tokenizer.eos_token_id,
                )

            # Decode and print
            generated_text = tokenizer.decode(output_ids[0].tolist())
            # Remove the original prompt from output
            prompt_text = tokenizer.decode(input_ids[0].tolist())
            if generated_text.startswith(prompt_text):
                generated_text = generated_text[len(prompt_text):].strip()

            print("\n" + "-" * 60)
            print("Generated:")
            print("-" * 60)
            print(generated_text)
            print("-" * 60)

            # Print token info
            num_tokens = output_ids.shape[1] - input_ids.shape[1]
            print(f"\nTokens generated: {num_tokens}")

        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Type /quit to exit.")
        except Exception as e:
            print(f"\nError: {e}")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def generate_batch(
    model,
    tokenizer,
    prompts: list,
    device,
    max_new_tokens: int = 256,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 0.9,
):
    """Generate one completion per prompt.

    Returns a list of ``{'prompt': ..., 'completion': ...}`` dicts, with the
    echoed prompt stripped from the front of each completion.
    """
    completions = []

    for text in prompts:
        encoded = torch.tensor([tokenizer.encode(text)], device=device)

        with torch.no_grad():
            generated = model.generate(
                encoded,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                eos_token_id=tokenizer.eos_token_id,
            )

        decoded_full = tokenizer.decode(generated[0].tolist())
        decoded_prompt = tokenizer.decode(encoded[0].tolist())

        # Strip the echoed prompt, if present, from the decoded output.
        if decoded_full.startswith(decoded_prompt):
            completion = decoded_full[len(decoded_prompt):].strip()
        else:
            completion = decoded_full

        completions.append({
            'prompt': text,
            'completion': completion,
        })

    return completions
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def benchmark_generation(
    model,
    tokenizer,
    device,
    num_runs: int = 10,
    max_new_tokens: int = 128,
    prompt: str = "The future of artificial intelligence is",
):
    """Benchmark generation speed.

    Runs ``model.generate`` on ``prompt`` ``num_runs`` times (after one
    warmup run) and prints per-run and average tokens/second.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode``.
        device: Device for the prompt tensor.
        num_runs: Number of timed runs.
        max_new_tokens: Tokens to generate per run.
        prompt: Prompt text used for every run.
    """
    import time

    def _sync():
        # BUG FIX: torch.cuda.synchronize() raises on CPU-only builds, but the
        # rest of the script supports CPU fallback; only sync when CUDA exists.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    print(f"\nBenchmarking generation ({num_runs} runs)...")

    input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)

    # Warmup
    with torch.no_grad():
        _ = model.generate(input_ids, max_new_tokens=10)

    _sync()

    # Benchmark
    times = []
    tokens_generated = []

    for i in range(num_runs):
        start = time.time()

        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                temperature=1.0,
            )

        # Ensure async GPU work is finished before stopping the clock.
        _sync()
        elapsed = time.time() - start

        num_tokens = output.shape[1] - input_ids.shape[1]
        times.append(elapsed)
        tokens_generated.append(num_tokens)

        print(f"  Run {i+1}: {num_tokens} tokens in {elapsed:.2f}s ({num_tokens/elapsed:.1f} tok/s)")

    avg_time = sum(times) / len(times)
    avg_tokens = sum(tokens_generated) / len(tokens_generated)
    avg_speed = avg_tokens / avg_time

    print(f"\nAverage: {avg_tokens:.1f} tokens in {avg_time:.2f}s ({avg_speed:.1f} tok/s)")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def main():
    """CLI entry point: load tokenizer and checkpoint, then run a benchmark,
    the interactive loop, or a single-prompt generation depending on flags."""
    parser = argparse.ArgumentParser(description='Generate text with VicAI')

    parser.add_argument('--checkpoint', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--tokenizer', type=str, default='tokenizer.pkl', help='Path to tokenizer')
    parser.add_argument('--prompt', type=str, default=None, help='Single prompt to generate from')
    parser.add_argument('--interactive', action='store_true', help='Interactive mode')
    parser.add_argument('--max-new-tokens', type=int, default=256, help='Maximum tokens to generate')
    parser.add_argument('--temperature', type=float, default=0.8, help='Sampling temperature')
    parser.add_argument('--top-k', type=int, default=50, help='Top-k sampling')
    parser.add_argument('--top-p', type=float, default=0.9, help='Top-p (nucleus) sampling')
    parser.add_argument('--repetition-penalty', type=float, default=1.1, help='Repetition penalty')
    parser.add_argument('--benchmark', action='store_true', help='Run generation benchmark')
    parser.add_argument('--device', type=str, default='cuda', help='Device to use')

    args = parser.parse_args()

    # Setup device (falls back to CPU when CUDA is unavailable)
    device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load tokenizer
    print(f"Loading tokenizer from {args.tokenizer}...")
    # Use ByteLevelBPETokenizer by default (our trained tokenizer)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.load(args.tokenizer)
    print(f"Tokenizer loaded: {len(tokenizer)} tokens")

    # Load model
    # NOTE(review): torch.load unpickles the checkpoint — only load files you
    # trust.
    print(f"Loading model from {args.checkpoint}...")
    checkpoint = torch.load(args.checkpoint, map_location=device)

    # Create model (assuming 5B config)
    model = create_vicai_5b(vocab_size=len(tokenizer))

    # Load weights (accepts either a {'model': state_dict} wrapper or a bare
    # state dict)
    state_dict = checkpoint.get('model', checkpoint)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()

    print(f"Model loaded: ~{model.get_num_params() / 1e9:.2f}B parameters")

    # Run benchmark if requested
    if args.benchmark:
        benchmark_generation(model, tokenizer, device)
        return

    # Interactive mode (also the default when no --prompt is given)
    if args.interactive or args.prompt is None:
        generate_interactive(
            model,
            tokenizer,
            device,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            repetition_penalty=args.repetition_penalty,
        )
    else:
        # Single prompt generation
        print(f"\nPrompt: {args.prompt}")
        print("-" * 60)

        input_ids = torch.tensor([tokenizer.encode(args.prompt)], device=device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=args.max_new_tokens,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                repetition_penalty=args.repetition_penalty,
                eos_token_id=tokenizer.eos_token_id,
            )

        generated_text = tokenizer.decode(output_ids[0].tolist())
        prompt_text = tokenizer.decode(input_ids[0].tolist())

        # Strip the echoed prompt from the decoded output, if present
        if generated_text.startswith(prompt_text):
            generated_text = generated_text[len(prompt_text):].strip()

        print(generated_text)
        print("-" * 60)

        num_tokens = output_ids.shape[1] - input_ids.shape[1]
        print(f"\nGenerated {num_tokens} tokens")
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# Script entry point: only run the CLI when executed directly.
if __name__ == '__main__':
    main()
|
model.py
ADDED
|
@@ -0,0 +1,403 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Model Architecture
|
| 3 |
+
A 5B parameter decoder-only transformer language model.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
from typing import Optional, Tuple
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization (no mean-centering, no bias)."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        # Small constant added under the square root for numerical stability.
        self.eps = eps
        # Learnable per-feature gain, initialized to the identity scaling.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Normalize by the root mean square of the last dimension, then scale.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return normalized * self.weight
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class RotaryPositionalEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE): encodes position by rotating q/k feature pairs."""

    def __init__(self, dim: int, max_seq_len: int = 8192, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base

        # Per-pair inverse frequencies: base^(-2i/dim) for i in [0, dim/2).
        inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Precompute cos/sin tables for every position up to max_seq_len,
        # duplicated across both halves of the feature dimension.
        t = torch.arange(max_seq_len)
        angles = torch.einsum("i,j->ij", t, inv_freq)
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos()[None, None, :, :])
        self.register_buffer("sin_cached", table.sin()[None, None, :, :])

    def rotate_half(self, x):
        # Swap the two halves of the last dim with a sign flip: (a, b) -> (-b, a).
        first, second = x.chunk(2, dim=-1)
        return torch.cat((-second, first), dim=-1)

    def apply_rotary_pos_emb(self, q, k, cos, sin):
        rotated_q = (q * cos) + (self.rotate_half(q) * sin)
        rotated_k = (k * cos) + (self.rotate_half(k) * sin)
        return rotated_q, rotated_k

    def forward(self, q, k, seq_len: int):
        # Slice the precomputed tables to the current sequence length.
        cos = self.cos_cached[:, :, :seq_len, :]
        sin = self.sin_cached[:, :, :seq_len, :]
        return self.apply_rotary_pos_emb(q, k, cos, sin)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class GroupedQueryAttention(nn.Module):
    """Grouped Query Attention (GQA): n_heads query heads share n_kv_heads key/value heads."""

    def __init__(
        self,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = dim // n_heads
        # How many query heads map onto each key/value head.
        self.n_rep = n_heads // n_kv_heads

        self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)

        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        self.rope = RotaryPositionalEmbedding(self.head_dim)

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        batch, n_tokens, _ = x.shape

        # Project and reshape to (batch, heads, tokens, head_dim).
        queries = self.wq(x).view(batch, n_tokens, self.n_heads, self.head_dim).transpose(1, 2)
        keys = self.wk(x).view(batch, n_tokens, self.n_kv_heads, self.head_dim).transpose(1, 2)
        values = self.wv(x).view(batch, n_tokens, self.n_kv_heads, self.head_dim).transpose(1, 2)

        # NOTE(review): positions restart at 0 on every call, so entries
        # appended to a KV cache below do not receive consistent rotary
        # positions — confirm before relying on incremental decoding.
        queries, keys = self.rope(queries, keys, n_tokens)

        if past_key_value is not None:
            cached_k, cached_v = past_key_value
            keys = torch.cat([cached_k, keys], dim=2)
            values = torch.cat([cached_v, values], dim=2)

        past_key_value = (keys, values)

        # Duplicate each k/v head n_rep times so every query head has a partner.
        keys = keys.repeat_interleave(self.n_rep, dim=1)
        values = values.repeat_interleave(self.n_rep, dim=1)

        # Scaled dot-product attention with an optional additive mask.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask

        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        context = torch.matmul(weights, values)
        context = context.transpose(1, 2).contiguous().view(batch, n_tokens, self.dim)
        return self.resid_dropout(self.wo(context)), past_key_value
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class FeedForward(nn.Module):
    """SwiGLU Feed-Forward Network: w2(silu(w1(x)) * w3(x)), followed by dropout."""

    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)  # gate projection
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)  # down projection
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)  # up projection
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Bug fix: the configured dropout was constructed but never applied,
        # so a non-zero `dropout` setting silently had no effect in training.
        # Identical behavior at the default dropout=0.0 and in eval mode.
        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: norm -> attention -> residual, norm -> FFN -> residual."""

    def __init__(
        self,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        hidden_dim: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        # Attribute names are load-bearing: they determine state_dict keys.
        self.attention_norm = RMSNorm(dim)
        self.attention = GroupedQueryAttention(dim, n_heads, n_kv_heads, dropout)
        self.ffn_norm = RMSNorm(dim)
        self.feed_forward = FeedForward(dim, hidden_dim, dropout)

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        # Attention sub-layer (pre-normalized) with residual connection.
        normed = self.attention_norm(x)
        attn_out, past_key_value = self.attention(normed, mask, past_key_value)
        x = x + attn_out

        # Feed-forward sub-layer (pre-normalized) with residual connection.
        return x + self.feed_forward(self.ffn_norm(x)), past_key_value
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class VicAIConfig:
    """Configuration for VicAI model (architecture hyperparameters only)."""

    def __init__(
        self,
        vocab_size: int = 32000,
        dim: int = 4096,
        n_layers: int = 32,
        n_heads: int = 32,
        n_kv_heads: int = 8,
        hidden_dim: int = 14336,
        max_seq_len: int = 8192,
        dropout: float = 0.0,
        tie_weights: bool = False,
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.tie_weights = tie_weights

    @property
    def num_parameters(self):
        """Approximate parameter count (embeddings + transformer blocks).

        Norm weights are omitted (negligible), and the output head is counted
        as untied. Bug fix: the previous estimate sized all four attention
        projections as dim*dim, but with grouped-query attention the k/v
        projections only produce n_kv_heads * head_dim output features.
        """
        head_dim = self.dim // self.n_heads
        kv_dim = self.n_kv_heads * head_dim
        # Token embedding table.
        params = self.vocab_size * self.dim
        # Attention per layer: q and o are dim x dim; k and v are dim x kv_dim (GQA).
        attn_params = 2 * self.dim * self.dim + 2 * self.dim * kv_dim
        # SwiGLU FFN per layer: w1 and w3 (dim -> hidden) plus w2 (hidden -> dim).
        ffn_params = 3 * self.dim * self.hidden_dim
        params += self.n_layers * (attn_params + ffn_params)
        # Output projection (lm_head), counted as untied.
        params += self.vocab_size * self.dim
        return params
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class VicAIModel(nn.Module):
    """
    VicAI: A 5B parameter decoder-only transformer language model.

    Architecture details:
    - 32 layers
    - 4096 model dimension
    - 32 attention heads (8 key-value heads for GQA)
    - SwiGLU FFN with 14336 hidden dimension
    - RoPE positional embeddings
    - RMSNorm pre-normalization
    - ~5.1B total parameters
    """

    def __init__(self, config: VicAIConfig):
        super().__init__()
        self.config = config

        self.token_embedding = nn.Embedding(config.vocab_size, config.dim)
        self.dropout = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([
            TransformerBlock(
                config.dim,
                config.n_heads,
                config.n_kv_heads,
                config.hidden_dim,
                config.dropout,
            )
            for _ in range(config.n_layers)
        ])

        self.norm = RMSNorm(config.dim)
        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=False)

        # Optionally share the embedding matrix with the output projection.
        if config.tie_weights:
            self.lm_head.weight = self.token_embedding.weight

        self.apply(self._init_weights)

        # Print model info
        total_params = self.get_num_params()
        print(f"VicAI Model initialized with {total_params / 1e9:.2f}B parameters")

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, 0.02); zero any biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def get_num_params(self, non_embedding=True):
        """Return total parameter count, excluding the input embedding by default."""
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.token_embedding.weight.numel()
        return n_params

    def forward(
        self,
        input_ids: torch.Tensor,
        targets: Optional[torch.Tensor] = None,
        past_key_values: Optional[list] = None,
    ):
        """Run the transformer stack.

        Args:
            input_ids: (batch, seq_len) token ids.
            targets: optional (batch, seq_len) labels for cross-entropy
                (positions with -100 are ignored).
            past_key_values: per-layer cached (k, v) tensors. NOTE: the causal
                mask below is built as (seq_len, seq_len), so it only matches
                the attention scores when past_key_values is None; the
                attention layers also apply RoPE with positions restarting at
                0 each call. Incremental decoding is therefore not supported —
                see generate(), which recomputes the full sequence instead.

        Returns:
            dict with 'logits' (batch, seq_len, vocab), 'loss' (or None), and
            'past_key_values' (the per-layer (k, v) pairs from this pass).
        """
        bsz, seq_len = input_ids.shape

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device),
            diagonal=1
        ).bool()
        mask = mask.unsqueeze(0).unsqueeze(0)
        mask = torch.where(mask, float('-inf'), 0.0)

        x = self.token_embedding(input_ids)
        x = self.dropout(x)

        new_key_values = []
        for i, layer in enumerate(self.layers):
            past_kv = past_key_values[i] if past_key_values is not None else None
            x, kv = layer(x, mask, past_kv)
            new_key_values.append(kv)

        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1),
                ignore_index=-100
            )

        return {
            'logits': logits,
            'loss': loss,
            'past_key_values': new_key_values,
        }

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.9,
        repetition_penalty: float = 1.0,
        eos_token_id: Optional[int] = None,
    ):
        """Generate tokens autoregressively with top-k / top-p sampling.

        Returns input_ids extended with up to max_new_tokens sampled tokens;
        stops early if every sequence in the batch emits eos_token_id.
        """
        self.eval()

        batch_size = input_ids.shape[0]

        for _ in range(max_new_tokens):
            # Bug fix: the previous version fed `past_key_values` back in while
            # ALSO re-feeding the full `input_ids`, so cached context was
            # duplicated and the attention scores (width past_len + seq_len)
            # no longer matched the (seq_len, seq_len) causal mask. Since the
            # attention layers restart RoPE positions at 0 on every call, a
            # correct incremental cache is not possible at this level; we
            # recompute the full sequence each step instead (O(n^2) but correct).
            outputs = self(input_ids)

            # Logits for the final position, scaled by temperature.
            logits = outputs['logits'][:, -1, :] / temperature

            # Repetition penalty. Bug fix: dividing a *negative* logit by the
            # penalty would raise its probability; penalize in both directions
            # (divide positive logits, multiply negative ones — CTRL-style).
            if repetition_penalty != 1.0:
                for i in range(batch_size):
                    seen = list(set(input_ids[i].tolist()))
                    vals = logits[i, seen]
                    logits[i, seen] = torch.where(
                        vals > 0,
                        vals / repetition_penalty,
                        vals * repetition_penalty,
                    )

            # Top-k filtering (clamped so top_k may not exceed the vocab size).
            if top_k > 0:
                k = min(top_k, logits.size(-1))
                indices_to_remove = logits < torch.topk(logits, k)[0][..., -1, None]
                logits[indices_to_remove] = float('-inf')

            # Top-p (nucleus) filtering: drop the tail once cumulative
            # probability exceeds top_p, always keeping the top token.
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices_to_remove.scatter(
                    1, sorted_indices, sorted_indices_to_remove
                )
                logits[indices_to_remove] = float('-inf')

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)

            # Early stopping once every sequence in the batch has emitted EOS.
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return input_ids
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def create_vicai_5b(vocab_size: int = 32000) -> VicAIModel:
    """Create a VicAI model with the standard ~5B-parameter configuration."""
    standard_config = VicAIConfig(
        vocab_size=vocab_size,
        dim=4096,
        n_layers=32,
        n_heads=32,
        n_kv_heads=8,
        hidden_dim=14336,
        max_seq_len=8192,
        dropout=0.0,
    )
    return VicAIModel(standard_config)
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
if __name__ == "__main__":
    # Smoke test: instantiate the full model. NOTE: this allocates all ~5B
    # parameters in memory (tens of GB in fp32) — intended for capable hosts.
    model = create_vicai_5b()
    print(f"Total parameters: {model.get_num_params() / 1e9:.2f}B")

    # Verify a forward pass: random token ids -> logits of shape
    # (batch, seq, vocab); loss is None because no targets are supplied.
    x = torch.randint(0, 32000, (2, 128))
    outputs = model(x)
    print(f"Output shape: {outputs['logits'].shape}")
    print(f"Loss: {outputs['loss']}")
|
readme.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VicAI
|
| 2 |
+
|
| 3 |
+
A 5B parameter decoder-only transformer language model built from scratch in PyTorch.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
VicAI is a state-of-the-art language model featuring:
|
| 8 |
+
- **5.1B parameters** with 32 transformer layers
|
| 9 |
+
- **Grouped Query Attention (GQA)** for efficient inference
|
| 10 |
+
- **Rotary Position Embeddings (RoPE)** for better long-context modeling
|
| 11 |
+
- **SwiGLU activation** in feed-forward layers
|
| 12 |
+
- **RMSNorm** pre-normalization
|
| 13 |
+
- **Byte-level BPE tokenization** (32K vocabulary)
|
| 14 |
+
|
| 15 |
+
## Architecture
|
| 16 |
+
|
| 17 |
+
| Component | Specification |
|
| 18 |
+
|-----------|---------------|
|
| 19 |
+
| Parameters | ~5.1B |
|
| 20 |
+
| Layers | 32 |
|
| 21 |
+
| Model Dim | 4096 |
|
| 22 |
+
| FFN Dim | 14336 |
|
| 23 |
+
| Attention Heads | 32 |
|
| 24 |
+
| KV Heads | 8 (GQA) |
|
| 25 |
+
| Context Length | 8192 |
|
| 26 |
+
| Vocabulary | 32,000 |
|
| 27 |
+
|
| 28 |
+
## File Structure
|
| 29 |
+
|
| 30 |
+
```
|
| 31 |
+
vicai/
|
| 32 |
+
├── model.py # Model architecture and VicAI 5B config
|
| 33 |
+
├── tokenizer.py # BPE tokenizer implementation
|
| 34 |
+
├── dataset.py # Data loading (Wikipedia + custom sources)
|
| 35 |
+
├── train.py # Distributed training script
|
| 36 |
+
├── utils.py # Training utilities and helpers
|
| 37 |
+
├── generate.py # Text generation and inference
|
| 38 |
+
├── requirements.txt # Dependencies
|
| 39 |
+
└── README.md # This file
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## Installation
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
# Clone the repository
|
| 46 |
+
git clone https://github.com/yourusername/vicai.git
|
| 47 |
+
cd vicai
|
| 48 |
+
|
| 49 |
+
# Create virtual environment
|
| 50 |
+
python -m venv venv
|
| 51 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 52 |
+
|
| 53 |
+
# Install dependencies
|
| 54 |
+
pip install -r requirements.txt
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Quick Start
|
| 58 |
+
|
| 59 |
+
### 1. Prepare Training Data
|
| 60 |
+
|
| 61 |
+
Option A: Create sample corpus from Wikipedia
|
| 62 |
+
```bash
|
| 63 |
+
python -c "from dataset import create_sample_corpus; create_sample_corpus('data/train.txt', num_articles=10000)"
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Option B: Use your own text files
|
| 67 |
+
```bash
|
| 68 |
+
# Place your text files in data/ directory
|
| 69 |
+
# Format: plain text with <|endoftext|> markers between documents
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### 2. Train Tokenizer
|
| 73 |
+
|
| 74 |
+
```python
|
| 75 |
+
from tokenizer import ByteLevelBPETokenizer
|
| 76 |
+
from dataset import create_sample_corpus
|
| 77 |
+
|
| 78 |
+
# Create corpus
|
| 79 |
+
corpus = create_sample_corpus('data/train.txt', num_articles=1000)
|
| 80 |
+
|
| 81 |
+
# Read texts
|
| 82 |
+
with open(corpus, 'r') as f:
|
| 83 |
+
texts = f.read().split('<|endoftext|>')
|
| 84 |
+
|
| 85 |
+
# Train tokenizer
|
| 86 |
+
tokenizer = ByteLevelBPETokenizer(vocab_size=32000)
|
| 87 |
+
tokenizer.train([t for t in texts if t.strip()])
|
| 88 |
+
tokenizer.save('tokenizer.pkl')
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### 3. Train Model
|
| 92 |
+
|
| 93 |
+
Single GPU:
|
| 94 |
+
```bash
|
| 95 |
+
python train.py \
|
| 96 |
+
--train-data data/train.txt \
|
| 97 |
+
--val-data data/val.txt \
|
| 98 |
+
--tokenizer tokenizer.pkl \
|
| 99 |
+
--batch-size 4 \
|
| 100 |
+
--max-steps 100000 \
|
| 101 |
+
--output-dir checkpoints
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
Multi-GPU (DDP):
|
| 105 |
+
```bash
|
| 106 |
+
torchrun --nproc_per_node=4 train.py \
|
| 107 |
+
--train-data data/train.txt \
|
| 108 |
+
--val-data data/val.txt \
|
| 109 |
+
--batch-size 1 \
|
| 110 |
+
--max-steps 100000 \
|
| 111 |
+
--output-dir checkpoints
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
Multi-GPU (FSDP):
|
| 115 |
+
```bash
|
| 116 |
+
torchrun --nproc_per_node=8 train.py \
|
| 117 |
+
--use-fsdp \
|
| 118 |
+
--train-data data/train.txt \
|
| 119 |
+
--batch-size 1 \
|
| 120 |
+
--output-dir checkpoints
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### 4. Generate Text
|
| 124 |
+
|
| 125 |
+
Interactive mode:
|
| 126 |
+
```bash
|
| 127 |
+
python generate.py \
|
| 128 |
+
--checkpoint checkpoints/best_model.pt \
|
| 129 |
+
--tokenizer tokenizer.pkl \
|
| 130 |
+
--interactive
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
Single prompt:
|
| 134 |
+
```bash
|
| 135 |
+
python generate.py \
|
| 136 |
+
--checkpoint checkpoints/best_model.pt \
|
| 137 |
+
--tokenizer tokenizer.pkl \
|
| 138 |
+
--prompt "The future of AI is" \
|
| 139 |
+
--max-new-tokens 256
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
## Training Configuration
|
| 143 |
+
|
| 144 |
+
### Default Hyperparameters
|
| 145 |
+
|
| 146 |
+
| Parameter | Value |
|
| 147 |
+
|-----------|-------|
|
| 148 |
+
| Learning Rate | 3e-4 |
|
| 149 |
+
| Min LR | 3e-5 |
|
| 150 |
+
| Warmup Steps | 2,000 |
|
| 151 |
+
| Weight Decay | 0.1 |
|
| 152 |
+
| Batch Size | 4 (per device) |
|
| 153 |
+
| Max Steps | 100,000 |
|
| 154 |
+
| Beta1 | 0.9 |
|
| 155 |
+
| Beta2 | 0.95 |
|
| 156 |
+
|
| 157 |
+
### Training Tips
|
| 158 |
+
|
| 159 |
+
- **Memory constrained?** Reduce batch size or use gradient accumulation
|
| 160 |
+
- **Longer context?** Increase `--max-seq-len` (up to 8192)
|
| 161 |
+
- **Faster training?** Enable `--compile` for torch.compile optimization
|
| 162 |
+
- **Better quality?** Train longer or use larger dataset
|
| 163 |
+
|
| 164 |
+
## Generation Parameters
|
| 165 |
+
|
| 166 |
+
| Parameter | Default | Description |
|
| 167 |
+
|-----------|---------|-------------|
|
| 168 |
+
| temperature | 0.8 | Lower = more focused, higher = more random |
|
| 169 |
+
| top_k | 50 | Consider only top-k tokens |
|
| 170 |
+
| top_p | 0.9 | Nucleus sampling threshold |
|
| 171 |
+
| repetition_penalty | 1.1 | Penalize repeated tokens |
|
| 172 |
+
| max_new_tokens | 256 | Maximum tokens to generate |
|
| 173 |
+
|
| 174 |
+
## Data Sources
|
| 175 |
+
|
| 176 |
+
The model can be trained on:
|
| 177 |
+
|
| 178 |
+
1. **Wikipedia** (streaming via API)
|
| 179 |
+
2. **OpenWebText** (Common Crawl filtered)
|
| 180 |
+
3. **Custom text files** (your own data)
|
| 181 |
+
4. **Mixed datasets** (combine multiple sources)
|
| 182 |
+
|
| 183 |
+
## Hardware Requirements
|
| 184 |
+
|
| 185 |
+
### Training
|
| 186 |
+
|
| 187 |
+
| GPUs | VRAM per GPU | Config |
|
| 188 |
+
|------|--------------|--------|
|
| 189 |
+
| 1x A100 (80GB) | 80GB | batch_size=4, compile=True |
|
| 190 |
+
| 4x A100 (40GB) | 40GB | batch_size=1, DDP |
|
| 191 |
+
| 8x A100 (40GB) | 40GB | batch_size=1, FSDP |
|
| 192 |
+
| 1x RTX 4090 | 24GB | batch_size=1, smaller model |
|
| 193 |
+
|
| 194 |
+
### Inference
|
| 195 |
+
|
| 196 |
+
- Minimum: 1x GPU with 16GB VRAM (with quantization)
|
| 197 |
+
- Recommended: 1x GPU with 24GB+ VRAM
|
| 198 |
+
|
| 199 |
+
## Model Architecture Details
|
| 200 |
+
|
| 201 |
+
### Grouped Query Attention
|
| 202 |
+
|
| 203 |
+
Uses 8 key-value heads instead of 32, reducing memory bandwidth during inference while maintaining quality.
|
| 204 |
+
|
| 205 |
+
### Rotary Position Embeddings
|
| 206 |
+
|
| 207 |
+
Rotary embeddings are applied to queries and keys, providing better relative position encoding than absolute embeddings.
|
| 208 |
+
|
| 209 |
+
### SwiGLU Feed-Forward
|
| 210 |
+
|
| 211 |
+
```python
|
| 212 |
+
FFN(x) = (silu(W1 @ x) * (W3 @ x)) @ W2
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
This has been shown to improve training stability and performance.
|
| 216 |
+
|
| 217 |
+
## Example Usage
|
| 218 |
+
|
| 219 |
+
```python
|
| 220 |
+
from model import create_vicai_5b
|
| 221 |
+
from tokenizer import ByteLevelBPETokenizer
|
| 222 |
+
import torch
|
| 223 |
+
|
| 224 |
+
# Load tokenizer
|
| 225 |
+
tokenizer = ByteLevelBPETokenizer()
|
| 226 |
+
tokenizer.load('tokenizer.pkl')
|
| 227 |
+
|
| 228 |
+
# Create model
|
| 229 |
+
model = create_vicai_5b(vocab_size=len(tokenizer))
|
| 230 |
+
|
| 231 |
+
# Load checkpoint
|
| 232 |
+
checkpoint = torch.load('checkpoints/best_model.pt')
|
| 233 |
+
model.load_state_dict(checkpoint['model'])
|
| 234 |
+
model = model.cuda()
|
| 235 |
+
|
| 236 |
+
# Generate
|
| 237 |
+
text = "Artificial intelligence will"
|
| 238 |
+
input_ids = torch.tensor([tokenizer.encode(text)]).cuda()
|
| 239 |
+
|
| 240 |
+
with torch.no_grad():
|
| 241 |
+
output = model.generate(
|
| 242 |
+
input_ids,
|
| 243 |
+
max_new_tokens=100,
|
| 244 |
+
temperature=0.8,
|
| 245 |
+
top_k=50,
|
| 246 |
+
top_p=0.9,
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
generated = tokenizer.decode(output[0].tolist())
|
| 250 |
+
print(generated)
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
## Citation
|
| 254 |
+
|
| 255 |
+
If you use VicAI in your research, please cite:
|
| 256 |
+
|
| 257 |
+
```bibtex
|
| 258 |
+
@software{vicai2024,
|
| 259 |
+
title = {VicAI: A 5B Parameter Language Model from Scratch},
|
| 260 |
+
author = {Your Name},
|
| 261 |
+
year = {2024},
|
| 262 |
+
url = {https://github.com/yourusername/vicai}
|
| 263 |
+
}
|
| 264 |
+
```
|
| 265 |
+
|
| 266 |
+
## License
|
| 267 |
+
|
| 268 |
+
This project is licensed under the MIT License.
|
| 269 |
+
|
| 270 |
+
## Acknowledgments
|
| 271 |
+
|
| 272 |
+
- Transformer architecture based on "Attention Is All You Need"
|
| 273 |
+
- RoPE embeddings from RoFormer
|
| 274 |
+
- GQA from the Llama 2 paper
|
| 275 |
+
- SwiGLU from the PaLM paper
|
| 276 |
+
|
| 277 |
+
## Contributing
|
| 278 |
+
|
| 279 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
| 280 |
+
|
| 281 |
+
## Troubleshooting
|
| 282 |
+
|
| 283 |
+
### CUDA Out of Memory
|
| 284 |
+
- Reduce batch size
|
| 285 |
+
- Enable gradient checkpointing
|
| 286 |
+
- Use FSDP for multi-GPU training
|
| 287 |
+
- Reduce sequence length
|
| 288 |
+
|
| 289 |
+
### Slow Training
|
| 290 |
+
- Enable `--compile` flag
|
| 291 |
+
- Use mixed precision (AMP)
|
| 292 |
+
- Ensure data is on fast storage (SSD)
|
| 293 |
+
- Use DataLoader `num_workers > 0`
|
| 294 |
+
|
| 295 |
+
### Poor Generation Quality
|
| 296 |
+
- Train longer
|
| 297 |
+
- Use larger, higher quality dataset
|
| 298 |
+
- Adjust sampling parameters (temperature, top_p)
|
| 299 |
+
- Check tokenizer was trained on similar data
|
| 300 |
+
|
| 301 |
+
## Contact
|
| 302 |
+
|
| 303 |
+
For questions or issues, please open a GitHub issue or contact the maintainers.
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
torchvision>=0.15.0
|
| 3 |
+
torchaudio>=2.0.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
tqdm>=4.65.0
|
| 6 |
+
requests>=2.28.0
|
| 7 |
+
transformers>=4.30.0
|
| 8 |
+
datasets>=2.12.0
|
| 9 |
+
accelerate>=0.20.0
|
| 10 |
+
sentencepiece>=0.1.99
|
| 11 |
+
protobuf>=3.20.0
|
| 12 |
+
wandb>=0.15.0
|
| 13 |
+
tensorboard>=2.13.0
|
| 14 |
+
matplotlib>=3.7.0
|
| 15 |
+
scipy>=1.10.0
|
| 16 |
+
scikit-learn>=1.2.0
|
| 17 |
+
pandas>=2.0.0
|
| 18 |
+
pyyaml>=6.0
|
| 19 |
+
regex>=2023.0.0
|
| 20 |
+
filelock>=3.12.0
|
| 21 |
+
packaging>=23.0
|
| 22 |
+
psutil>=5.9.0
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:862358e10e9e9f7c70f593dd3e8d2aa9da1ceca56947cff0545204d943c27baf
|
| 3 |
+
size 71877
|
tokenizer.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Tokenizer
|
| 3 |
+
Byte-Pair Encoding (BPE) tokenizer implementation.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import pickle
|
| 8 |
+
import re
|
| 9 |
+
from collections import defaultdict
|
| 10 |
+
from typing import Dict, List, Optional, Union
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BPETokenizer:
    """Word-level Byte-Pair Encoding tokenizer.

    Each pre-tokenized word is split into characters terminated by an
    explicit ``</w>`` end-of-word marker; training greedily merges the
    most frequent adjacent symbol pair until ``vocab_size`` is reached.
    """

    def __init__(self, vocab_size: int = 32000):
        self.vocab_size = vocab_size
        self.vocab: Dict[str, int] = {}   # token string -> integer id
        self.merges: List[tuple] = []     # learned merges, in training order
        self.special_tokens = {
            '<pad>': 0,
            '<unk>': 1,
            '<s>': 2,
            '</s>': 3,
            '<mask>': 4,
        }
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.bos_token_id = 2
        self.eos_token_id = 3
        self.mask_token_id = 4

    def _get_stats(self, vocab):
        """Count frequencies of all adjacent symbol pairs in *vocab*."""
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def _merge_vocab(self, pair, vocab):
        """Merge all occurrences of *pair* in *vocab*.

        The lookaround anchors ensure the bigram matches only when both
        symbols are complete whitespace-delimited tokens.
        """
        bigram = re.escape(' '.join(pair))
        pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        new_vocab = {}
        for word in vocab:
            new_word = pattern.sub(''.join(pair), word)
            new_vocab[new_word] = vocab[word]
        return new_vocab

    def _pre_tokenize(self, text: str) -> List[str]:
        """Pre-tokenize text into contractions, letter runs, digit runs and punctuation.

        Bug fix: the original pattern used ``\\p{L}``/``\\p{N}``, which is
        ``regex``-module syntax — the stdlib ``re`` module rejects it with
        ``re.error: bad escape \\p``, so every encode/train call crashed.
        Equivalent stdlib classes are used here: ``[^\\W\\d_]`` (Unicode
        letters), ``\\d`` (digits), ``[^\\s\\w]`` (punctuation), plus an
        explicit underscore branch.
        """
        pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?[^\W\d_]+| ?\d+| ?[^\s\w]+| ?_+|\s+(?!\S)|\s+"
        return re.findall(pattern, text)

    def train(self, texts: List[str]):
        """Train BPE on a list of texts (lower-cased before training)."""
        print(f"Training BPE tokenizer with vocab_size={self.vocab_size}")

        # Initialize vocabulary with special tokens.
        self.vocab = {token: i for token, i in self.special_tokens.items()}

        # Build word-frequency dictionary; every word ends with </w>.
        vocab = defaultdict(int)
        for text in texts:
            words = self._pre_tokenize(text.lower())
            for word in words:
                word = ' '.join(list(word)) + ' </w>'
                vocab[tuple(word.split())] += 1

        # Convert keys back to space-separated symbol strings.
        vocab = {' '.join(k): v for k, v in vocab.items()}

        # Seed the vocabulary with all individual characters seen.
        for word in vocab:
            for char in word.split():
                if char not in self.vocab:
                    self.vocab[char] = len(self.vocab)

        # Greedy BPE merge loop.
        num_merges = self.vocab_size - len(self.vocab)
        for i in range(num_merges):
            pairs = self._get_stats(vocab)
            if not pairs:
                break  # nothing left to merge

            best = max(pairs, key=pairs.get)
            vocab = self._merge_vocab(best, vocab)
            self.merges.append(best)

            merged_token = ''.join(best)
            if merged_token not in self.vocab:
                self.vocab[merged_token] = len(self.vocab)

            if (i + 1) % 1000 == 0:
                print(f"  Completed {i + 1}/{num_merges} merges")

        print(f"Final vocabulary size: {len(self.vocab)}")

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """Encode text to token IDs (optionally wrapped in <s> ... </s>)."""
        words = self._pre_tokenize(text)
        token_ids = []

        if add_special_tokens:
            token_ids.append(self.bos_token_id)

        for word in words:
            word = word.lower()  # training lower-cases, so encoding must too
            word_tokens = ' '.join(list(word)) + ' </w>'

            # Replay the learned merges in order.
            for merge in self.merges:
                bigram = re.escape(' '.join(merge))
                pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
                word_tokens = pattern.sub(''.join(merge), word_tokens)

            # Map symbols to ids; unknown symbols fall back to <unk>.
            for token in word_tokens.split():
                token_ids.append(self.vocab.get(token, self.unk_token_id))

        if add_special_tokens:
            token_ids.append(self.eos_token_id)

        return token_ids

    def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str:
        """Decode token IDs back to text; </w> markers become spaces."""
        reverse_vocab = {v: k for k, v in self.vocab.items()}

        tokens = []
        for token_id in token_ids:
            if token_id in self.special_tokens.values() and skip_special_tokens:
                continue
            tokens.append(reverse_vocab.get(token_id, '<unk>'))

        text = ''.join(tokens)
        text = text.replace('</w>', ' ')
        return text.strip()

    def save(self, path: str):
        """Save tokenizer state to *path* (pickle format)."""
        data = {
            'vocab': self.vocab,
            'merges': self.merges,
            'special_tokens': self.special_tokens,
            'vocab_size': self.vocab_size,
        }
        with open(path, 'wb') as f:
            pickle.dump(data, f)
        print(f"Tokenizer saved to {path}")

    def load(self, path: str):
        """Load tokenizer state from *path*.

        SECURITY NOTE: this unpickles the file — only load tokenizer
        files from trusted sources.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.vocab = data['vocab']
        self.merges = data['merges']
        self.special_tokens = data['special_tokens']
        self.vocab_size = data['vocab_size']

        self.pad_token_id = self.special_tokens['<pad>']
        self.unk_token_id = self.special_tokens['<unk>']
        self.bos_token_id = self.special_tokens['<s>']
        self.eos_token_id = self.special_tokens['</s>']
        self.mask_token_id = self.special_tokens['<mask>']
        print(f"Tokenizer loaded from {path}")

    def batch_encode(
        self,
        texts: List[str],
        max_length: int = 512,
        padding: bool = True,
        truncation: bool = True,
    ) -> Dict[str, List]:
        """Batch encode texts, returning 'input_ids' and 'attention_mask'.

        Bug fixes vs. the original:
        * sequences longer than the padded length are clipped, so
          'input_ids' rows are always the same length (the original left
          un-truncated sequences ragged when ``truncation=False``);
        * an empty *texts* list no longer raises ``ValueError`` from ``max``.
        """
        encoded = [self.encode(text) for text in texts]

        if truncation:
            encoded = [seq[:max_length] for seq in encoded]

        if padding:
            max_len = min(max((len(seq) for seq in encoded), default=0), max_length)
            attention_mask = []
            padded = []
            for seq in encoded:
                seq = seq[:max_len]  # enforce uniform width even without truncation
                pad_amount = max_len - len(seq)
                attention_mask.append([1] * len(seq) + [0] * pad_amount)
                padded.append(seq + [self.pad_token_id] * pad_amount)
            encoded = padded
        else:
            attention_mask = [[1] * len(seq) for seq in encoded]

        return {
            'input_ids': encoded,
            'attention_mask': attention_mask,
        }

    def __len__(self):
        return len(self.vocab)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class ByteLevelBPETokenizer:
    """Byte-level BPE tokenizer (similar to GPT-2/3).

    Text is first mapped byte-by-byte into a private character range
    (``chr(byte + 128)``), so any input is representable; merges are
    learned over those byte characters.
    """

    def __init__(self, vocab_size: int = 32000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = []
        # Bijective byte <-> char mapping. The +128 shift keeps the byte
        # alphabet disjoint from the ASCII chars used in special-token strings.
        self.byte_encoder = {i: chr(i + 128) for i in range(256)}
        self.byte_decoder = {chr(i + 128): i for i in range(256)}

        self.special_tokens = {
            '<|endoftext|>': 0,
            '<|pad|>': 1,
        }
        self.eos_token_id = 0
        self.pad_token_id = 1

    def _bytes_to_unicode(self, text: str) -> str:
        """Convert a string to its shifted byte-level representation."""
        return ''.join(self.byte_encoder[b] for b in text.encode('utf-8'))

    def _unicode_to_bytes(self, text: str) -> str:
        """Convert a shifted byte-level representation back to a string."""
        return bytes(self.byte_decoder[c] for c in text).decode('utf-8', errors='replace')

    def train(self, texts: List[str]):
        """Train byte-level BPE on *texts*.

        Bug fix: the original appended the literal string '<|endoftext|>'
        to each byte sequence before splitting it into characters; that
        shredded the special token into raw ASCII chars which are not part
        of the byte alphabet and polluted the learned merges. Special
        tokens are now handled only at encode time, as ids.
        """
        print(f"Training byte-level BPE tokenizer with vocab_size={self.vocab_size}")

        # Initialize vocab with special tokens, then all 256 byte chars.
        self.vocab = {token: i for token, i in self.special_tokens.items()}
        for i in range(256):
            byte_char = self.byte_encoder[i]
            if byte_char not in self.vocab:
                self.vocab[byte_char] = len(self.vocab)

        # Word frequencies: each text is one byte-character sequence.
        vocab = defaultdict(int)
        for text in texts:
            vocab[tuple(self._bytes_to_unicode(text))] += 1

        # Greedy BPE merge loop.
        num_merges = self.vocab_size - len(self.vocab)
        for i in range(num_merges):
            pairs = self._get_stats(vocab)
            if not pairs:
                break  # corpus fully merged

            best = max(pairs, key=pairs.get)
            vocab = self._merge_vocab(best, vocab)
            self.merges.append(best)

            merged = ''.join(best)
            if merged not in self.vocab:
                self.vocab[merged] = len(self.vocab)

            if (i + 1) % 1000 == 0:
                print(f"  Completed {i + 1}/{num_merges} merges")

        print(f"Final vocabulary size: {len(self.vocab)}")

    def _get_stats(self, vocab):
        """Count frequencies of adjacent symbol pairs in *vocab*."""
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = list(word)
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def _merge_vocab(self, pair, vocab):
        """Merge every occurrence of *pair* in every word of *vocab*."""
        new_vocab = {}
        bigram = pair[0] + pair[1]
        for word in vocab:
            new_word = []
            i = 0
            while i < len(word):
                if i < len(word) - 1 and word[i] == pair[0] and word[i + 1] == pair[1]:
                    new_word.append(bigram)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_vocab[tuple(new_word)] = vocab[word]
        return new_vocab

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """Encode text to token IDs; optionally append the <|endoftext|> id.

        Bug fix: the original concatenated the literal '<|endoftext|>'
        string before byte conversion, so each of its 13 ASCII chars was
        looked up individually, missed the vocab, and produced a run of
        <|pad|> ids instead of a single eos id.
        """
        word = list(self._bytes_to_unicode(text))

        # Replay the learned merges in order.
        for merge in self.merges:
            new_word = []
            i = 0
            while i < len(word):
                if i < len(word) - 1 and word[i] == merge[0] and word[i + 1] == merge[1]:
                    new_word.append(merge[0] + merge[1])
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = new_word

        # Every byte char is in the vocab, so the <|pad|> fallback is defensive.
        ids = [self.vocab.get(token, self.special_tokens['<|pad|>']) for token in word]
        if add_special_tokens:
            ids.append(self.eos_token_id)
        return ids

    def decode(self, token_ids: List[int]) -> str:
        """Decode token IDs to text, skipping special tokens.

        Special-token ids are filtered before byte decoding — feeding
        their ASCII names into ``_unicode_to_bytes`` would raise KeyError.
        """
        reverse_vocab = {v: k for k, v in self.vocab.items()}
        special_ids = set(self.special_tokens.values())
        text = ''.join(
            reverse_vocab.get(token_id, '')
            for token_id in token_ids
            if token_id not in special_ids
        )
        return self._unicode_to_bytes(text)

    def save(self, path: str):
        """Save tokenizer state to *path* (pickle format)."""
        data = {
            'vocab': self.vocab,
            'merges': self.merges,
            'special_tokens': self.special_tokens,
            'vocab_size': self.vocab_size,
            'byte_encoder': self.byte_encoder,
            'byte_decoder': self.byte_decoder,
        }
        with open(path, 'wb') as f:
            pickle.dump(data, f)
        print(f"Tokenizer saved to {path}")

    def load(self, path: str):
        """Load tokenizer state from *path*.

        SECURITY NOTE: this unpickles the file — only load tokenizer
        files from trusted sources.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.vocab = data['vocab']
        self.merges = data['merges']
        self.special_tokens = data['special_tokens']
        self.vocab_size = data['vocab_size']
        self.byte_encoder = data.get('byte_encoder', self.byte_encoder)
        self.byte_decoder = data.get('byte_decoder', self.byte_decoder)

        # Ensure both special tokens exist for older checkpoints.
        if '<|endoftext|>' not in self.special_tokens:
            self.special_tokens['<|endoftext|>'] = 0
        if '<|pad|>' not in self.special_tokens:
            self.special_tokens['<|pad|>'] = 1

        self.eos_token_id = self.special_tokens.get('<|endoftext|>', 0)
        self.pad_token_id = self.special_tokens.get('<|pad|>', 1)
        print(f"Tokenizer loaded from {path}")

    def __len__(self):
        return len(self.vocab)
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def create_and_train_tokenizer(texts: List[str], vocab_size: int = 32000, output_path: str = "tokenizer.pkl"):
    """Train a byte-level BPE tokenizer on *texts*, persist it, and return it."""
    bpe = ByteLevelBPETokenizer(vocab_size=vocab_size)
    bpe.train(texts)
    bpe.save(output_path)
    return bpe
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
if __name__ == "__main__":
    # Smoke-test the word-level tokenizer on a tiny in-memory corpus.
    sample_texts = [
        "Hello, world! This is a test.",
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is fascinating.",
        "Artificial intelligence will change the world.",
    ]

    demo_tokenizer = BPETokenizer(vocab_size=1000)
    demo_tokenizer.train(sample_texts)

    test_text = "Hello world!"
    encoded_ids = demo_tokenizer.encode(test_text)
    decoded_text = demo_tokenizer.decode(encoded_ids)

    print(f"\nOriginal: {test_text}")
    print(f"Encoded: {encoded_ids}")
    print(f"Decoded: {decoded_text}")
|
train.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Training Script
|
| 3 |
+
Distributed training with FSDP/DDP support.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from contextlib import nullcontext
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.distributed as dist
|
| 14 |
+
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
|
| 15 |
+
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
|
| 16 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 17 |
+
from torch.utils.data import DataLoader
|
| 18 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 19 |
+
|
| 20 |
+
from model import VicAIModel, VicAIConfig, create_vicai_5b
|
| 21 |
+
from tokenizer import ByteLevelBPETokenizer, BPETokenizer
|
| 22 |
+
from dataset import (
|
| 23 |
+
WikipediaDataset,
|
| 24 |
+
TextFileDataset,
|
| 25 |
+
MixedDataset,
|
| 26 |
+
create_sample_corpus,
|
| 27 |
+
)
|
| 28 |
+
from utils import (
|
| 29 |
+
get_logger,
|
| 30 |
+
load_checkpoint,
|
| 31 |
+
save_checkpoint,
|
| 32 |
+
get_lr_scheduler,
|
| 33 |
+
estimate_loss,
|
| 34 |
+
configure_optimizers,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def setup_distributed():
    """Initialize distributed training from torchrun-style env vars.

    Returns (rank, world_size, local_rank); single-process defaults
    (0, 1, 0) are used when RANK/WORLD_SIZE are not both set.
    """
    env = os.environ
    if 'RANK' in env and 'WORLD_SIZE' in env:
        rank = int(env['RANK'])
        world_size = int(env['WORLD_SIZE'])
        local_rank = int(env.get('LOCAL_RANK', 0))
    else:
        rank, world_size, local_rank = 0, 1, 0

    # Only form a process group when there is actually more than one process.
    if world_size > 1:
        dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
        torch.cuda.set_device(local_rank)

    return rank, world_size, local_rank
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def cleanup_distributed():
    """Tear down the process group if one was created; no-op otherwise."""
    if not dist.is_initialized():
        return
    dist.destroy_process_group()
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def get_data_loader(dataset, batch_size, world_size, rank, shuffle=True):
    """Build a DataLoader; uses a DistributedSampler when world_size > 1.

    Returns (loader, sampler) — sampler is None in single-process mode.
    """
    sampler = (
        DistributedSampler(
            dataset,
            num_replicas=world_size,
            rank=rank,
            shuffle=shuffle,
        )
        if world_size > 1
        else None
    )

    # drop_last keeps every step's batch the same size across ranks.
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )
    return loader, sampler
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def train_step(model, batch, optimizer, scaler, device, use_amp, grad_clip=None):
    """Run a single optimization step and return the scalar loss.

    Args:
        model: module whose forward returns a dict containing 'loss'.
        batch: dict with 'input_ids' and 'labels' tensors.
        optimizer: optimizer over model.parameters().
        scaler: torch.cuda.amp.GradScaler (may be disabled).
        device: target device for the batch tensors.
        use_amp: enable mixed-precision autocast.
        grad_clip: optional max gradient norm. Added (default None keeps
            the original behavior) because the training CLI exposes
            --grad-clip but it was never applied anywhere.
    """
    model.train()

    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()

    with torch.cuda.amp.autocast(enabled=use_amp):
        outputs = model(input_ids, targets=labels)
        loss = outputs['loss']

    if use_amp:
        scaler.scale(loss).backward()
        if grad_clip is not None:
            # Gradients must be unscaled before clipping under AMP.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

    return loss.item()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def train(
    model,
    train_loader,
    val_loader,
    optimizer,
    lr_scheduler,
    scaler,
    device,
    args,
    logger,
):
    """Main training loop.

    Iterates up to ``args.max_steps`` optimizer steps across at most
    ``args.max_epochs`` passes over ``train_loader``, logging, evaluating
    and checkpointing at the configured intervals. Returns the final
    global step count.

    NOTE(review): ``step`` is never advanced by any resume logic here —
    resuming restores weights but restarts step counting; confirm against
    the caller's ``load_checkpoint`` usage.
    """
    best_val_loss = float('inf')
    step = 0  # global optimizer-step counter, shared across epochs

    model.train()
    # A manual iterator lets the loop re-wrap the loader mid-epoch when
    # it is exhausted before max_steps is reached.
    train_iterator = iter(train_loader)

    for epoch in range(args.max_epochs):
        # DistributedSampler reshuffles per epoch only when told the epoch.
        if hasattr(train_loader.sampler, 'set_epoch'):
            train_loader.sampler.set_epoch(epoch)

        epoch_start_time = time.time()

        while step < args.max_steps:
            try:
                batch = next(train_iterator)
            except StopIteration:
                # Loader exhausted: restart it and keep counting steps.
                train_iterator = iter(train_loader)
                batch = next(train_iterator)

            # Training step (forward, backward, optimizer update).
            loss = train_step(model, batch, optimizer, scaler, device, args.use_amp)
            lr_scheduler.step()

            step += 1

            # Periodic loss/LR logging (rank 0 only).
            if step % args.log_interval == 0 and args.rank == 0:
                lr = optimizer.param_groups[0]['lr']
                logger.info(
                    f"Step {step}/{args.max_steps} | "
                    f"Loss: {loss:.4f} | LR: {lr:.2e}"
                )

            # Periodic evaluation; all ranks participate in evaluate().
            if step % args.eval_interval == 0:
                val_loss = evaluate(model, val_loader, device, args.use_amp)

                if args.rank == 0:
                    logger.info(f"Validation loss: {val_loss:.4f}")

                    # Save best model whenever validation improves.
                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        save_checkpoint(
                            model,
                            optimizer,
                            scaler,
                            step,
                            val_loss,
                            args.output_dir / 'best_model.pt',
                        )
                        logger.info(f"Saved best model with loss {val_loss:.4f}")

                # evaluate() switched to eval mode; restore training mode.
                model.train()

            # Regular checkpointing, independent of validation loss.
            if step % args.save_interval == 0 and args.rank == 0:
                save_checkpoint(
                    model,
                    optimizer,
                    scaler,
                    step,
                    loss,
                    args.output_dir / f'checkpoint_step_{step}.pt',
                )
                logger.info(f"Saved checkpoint at step {step}")

            if step >= args.max_steps:
                break

        epoch_time = time.time() - epoch_start_time
        if args.rank == 0:
            logger.info(f"Epoch {epoch + 1} completed in {epoch_time:.2f}s")

    return step
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def evaluate(model, data_loader, device, use_amp):
    """Evaluate the model on up to 100 batches of *data_loader*.

    Returns the loss averaged over processed batches and, when a
    distributed process group is active, averaged across all ranks.
    Returns ``inf`` for an empty loader (the original divided by zero).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids, targets=labels)
                loss = outputs['loss']

            total_loss += loss.item()
            num_batches += 1

            if num_batches >= 100:  # limit eval cost
                break

    # Guard: an empty/exhausted loader would otherwise raise ZeroDivisionError.
    if num_batches == 0:
        return float('inf')

    avg_loss = total_loss / num_batches

    # Average across all processes when running distributed.
    if dist.is_initialized():
        loss_tensor = torch.tensor([avg_loss], device=device)
        dist.all_reduce(loss_tensor, op=dist.ReduceOp.AVG)
        avg_loss = loss_tensor.item()

    return avg_loss
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def main():
    """Entry point: parse CLI args, build tokenizer/model/data, run training."""
    parser = argparse.ArgumentParser(description='Train VicAI')

    # Model args
    parser.add_argument('--vocab-size', type=int, default=32000)
    parser.add_argument('--dim', type=int, default=4096)
    parser.add_argument('--n-layers', type=int, default=32)
    parser.add_argument('--n-heads', type=int, default=32)
    parser.add_argument('--n-kv-heads', type=int, default=8)
    parser.add_argument('--hidden-dim', type=int, default=14336)

    # Training args
    parser.add_argument('--batch-size', type=int, default=4)
    parser.add_argument('--max-seq-len', type=int, default=2048)
    parser.add_argument('--max-steps', type=int, default=100000)
    parser.add_argument('--max-epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=3e-4)
    parser.add_argument('--min-lr', type=float, default=3e-5)
    parser.add_argument('--warmup-steps', type=int, default=2000)
    parser.add_argument('--weight-decay', type=float, default=0.1)
    parser.add_argument('--grad-clip', type=float, default=1.0)
    parser.add_argument('--beta1', type=float, default=0.9)
    parser.add_argument('--beta2', type=float, default=0.95)

    # Data args
    parser.add_argument('--train-data', type=str, default='data/train.txt')
    parser.add_argument('--val-data', type=str, default='data/val.txt')
    parser.add_argument('--tokenizer-path', type=str, default='tokenizer.pkl')

    # System args
    parser.add_argument('--output-dir', type=str, default='checkpoints')
    parser.add_argument('--resume', type=str, default=None)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--save-interval', type=int, default=5000)
    parser.add_argument('--log-interval', type=int, default=100)
    # NOTE(review): default=True with store_true means AMP cannot be
    # disabled from the CLI — confirm this is intended.
    parser.add_argument('--use-amp', action='store_true', default=True)
    parser.add_argument('--use-fsdp', action='store_true', default=False)
    parser.add_argument('--compile', action='store_true', default=False)

    args = parser.parse_args()

    # Setup distributed state; extra fields are attached to args for
    # convenient plumbing through the training functions.
    args.rank, args.world_size, args.local_rank = setup_distributed()
    args.is_distributed = args.world_size > 1

    # Create output directory (rank 0 only to avoid races).
    args.output_dir = Path(args.output_dir)
    if args.rank == 0:
        args.output_dir.mkdir(parents=True, exist_ok=True)

    # Logger: only rank 0 gets a file handler.
    logger = get_logger('vicai_train', args.output_dir / 'train.log' if args.rank == 0 else None)

    if args.rank == 0:
        logger.info(f"Starting VicAI training with {args.world_size} GPUs")
        logger.info(f"Arguments: {args}")

    # Device for this process.
    device = torch.device(f'cuda:{args.local_rank}' if torch.cuda.is_available() else 'cpu')

    # Load tokenizer, or train a small default one on a sample corpus.
    if os.path.exists(args.tokenizer_path):
        logger.info(f"Loading tokenizer from {args.tokenizer_path}")
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.load(args.tokenizer_path)
    else:
        logger.warning(f"Tokenizer not found at {args.tokenizer_path}, creating default")
        tokenizer = ByteLevelBPETokenizer(vocab_size=args.vocab_size)
        # Train on sample data (rank 0 trains; other ranks wait and load).
        if args.rank == 0:
            sample_file = create_sample_corpus(num_articles=100)
            with open(sample_file, 'r') as f:
                texts = f.read().split('<|endoftext|>')
            tokenizer.train([t for t in texts if t.strip()])
            tokenizer.save(args.tokenizer_path)

        if args.is_distributed:
            dist.barrier()  # wait until rank 0 has saved the tokenizer

        if args.rank != 0:
            tokenizer.load(args.tokenizer_path)

    # Create model
    logger.info("Creating model...")
    config = VicAIConfig(
        vocab_size=len(tokenizer),
        dim=args.dim,
        n_layers=args.n_layers,
        n_heads=args.n_heads,
        n_kv_heads=args.n_kv_heads,
        hidden_dim=args.hidden_dim,
        max_seq_len=args.max_seq_len,
        dropout=0.0,
    )

    if args.rank == 0:
        logger.info(f"Model config: {config.__dict__}")
        logger.info(f"Model parameters: ~{config.num_parameters / 1e9:.2f}B")

    model = VicAIModel(config)

    # Wrap for distributed training: FSDP if requested, else DDP, else
    # plain single-device placement.
    if args.use_fsdp and args.is_distributed:
        model = FSDP(model, device_id=device)
    elif args.is_distributed:
        model = DDP(model, device_ids=[args.local_rank])
    else:
        model = model.to(device)

    if args.compile and hasattr(torch, 'compile'):
        logger.info("Compiling model...")
        model = torch.compile(model)

    # Create datasets: text files if present, else streaming Wikipedia.
    logger.info("Creating datasets...")

    if os.path.exists(args.train_data):
        train_dataset = TextFileDataset(args.train_data, tokenizer, args.max_seq_len)
        val_dataset = TextFileDataset(args.val_data, tokenizer, args.max_seq_len) if os.path.exists(args.val_data) else train_dataset
    else:
        logger.warning("Training data not found, using Wikipedia streaming dataset")
        train_dataset = WikipediaDataset(tokenizer, max_length=args.max_seq_len)
        val_dataset = WikipediaDataset(tokenizer, max_length=args.max_seq_len)

    train_loader, train_sampler = get_data_loader(train_dataset, args.batch_size, args.world_size, args.rank)
    val_loader, _ = get_data_loader(val_dataset, args.batch_size, args.world_size, args.rank, shuffle=False)

    # Optimizer
    optimizer = configure_optimizers(model, args)

    # Learning rate scheduler
    lr_scheduler = get_lr_scheduler(optimizer, args)

    # Gradient scaler for AMP
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)

    # Resume from checkpoint (restores model/optimizer/scaler state).
    start_step = 0
    if args.resume:
        logger.info(f"Resuming from {args.resume}")
        start_step = load_checkpoint(model, optimizer, scaler, args.resume, device)

    # Training
    logger.info("Starting training...")
    final_step = train(
        model,
        train_loader,
        val_loader,
        optimizer,
        lr_scheduler,
        scaler,
        device,
        args,
        logger,
    )

    # Save final model (rank 0 only).
    if args.rank == 0:
        save_checkpoint(
            model,
            optimizer,
            scaler,
            final_step,
            0.0,
            args.output_dir / 'final_model.pt',
        )
        logger.info("Training completed!")

    cleanup_distributed()
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|
utils.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VicAI Utilities
|
| 3 |
+
Helper functions for training and evaluation.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import math
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, Optional
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
import torch.distributed as dist
|
| 16 |
+
from torch.optim import AdamW
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_logger(name: str, log_file: Optional[Path] = None) -> logging.Logger:
    """Build an INFO-level logger that writes to stdout and optionally a file.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_file: Optional log file path; parent directories are created.

    Returns:
        The configured ``logging.Logger``.
    """
    log = logging.getLogger(name)
    log.setLevel(logging.INFO)

    # Drop any handlers left over from a previous configuration so that
    # repeated calls do not duplicate output.
    log.handlers = []

    fmt = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    stream = logging.StreamHandler(sys.stdout)
    stream.setLevel(logging.INFO)
    stream.setFormatter(fmt)
    log.addHandler(stream)

    if log_file:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        fh.setFormatter(fmt)
        log.addHandler(fh)

    return log
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def save_checkpoint(
    model,
    optimizer,
    scaler,
    step: int,
    loss: float,
    path: Path,
):
    """Serialize training state (model/optimizer/scaler/step/loss) to ``path``.

    Args:
        model: Model, possibly wrapped in DDP/FSDP (``.module`` attribute).
        optimizer: Optimizer whose state is saved alongside the weights.
        scaler: AMP GradScaler or None; None is stored as-is.
        step: Global training step to record.
        loss: Loss value to record.
        path: Destination file; parent directories are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    # Unwrap DDP/FSDP wrappers first, and compute the state dict only once.
    # (The original computed the wrapped model's full state dict and then
    # discarded it whenever a `.module` attribute was present.)
    target = model.module if hasattr(model, 'module') else model

    checkpoint = {
        'model': target.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scaler': scaler.state_dict() if scaler else None,
        'step': step,
        'loss': loss,
    }

    torch.save(checkpoint, path)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def load_checkpoint(
    model,
    optimizer,
    scaler,
    path: str,
    device,
):
    """Restore model/optimizer/scaler state from a checkpoint file.

    Args:
        model: Model, possibly wrapped (``.module`` attribute) in DDP/FSDP.
        optimizer: Optimizer to restore.
        scaler: AMP GradScaler or None; restored only if the checkpoint
            carries scaler state.
        path: Checkpoint file path.
        device: ``map_location`` target for ``torch.load``.

    Returns:
        The step recorded in the checkpoint, or 0 if absent.
    """
    ckpt = torch.load(path, map_location=device)

    # Route the weights to the underlying module for wrapped models.
    weights = ckpt['model']
    target = model.module if hasattr(model, 'module') else model
    target.load_state_dict(weights)

    optimizer.load_state_dict(ckpt['optimizer'])

    scaler_state = ckpt.get('scaler')
    if scaler and scaler_state:
        scaler.load_state_dict(scaler_state)

    return ckpt.get('step', 0)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def get_lr_scheduler(optimizer, args):
    """Create a linear-warmup + cosine-decay LR scheduler.

    The multiplier ramps linearly from 0 to 1 over ``args.warmup_steps``,
    then decays along a cosine from 1 down to
    ``args.min_lr / args.learning_rate`` by ``args.max_steps``.

    Fix: the decay span is clamped to at least 1 so that
    ``max_steps == warmup_steps`` no longer raises ZeroDivisionError.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        args: Namespace with ``warmup_steps``, ``max_steps``,
            ``learning_rate`` and ``min_lr``.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` instance.
    """
    from torch.optim.lr_scheduler import LambdaLR

    # Fraction of the peak LR to decay down to.
    floor = args.min_lr / args.learning_rate

    def lr_lambda(current_step):
        if current_step < args.warmup_steps:
            # Linear warmup (warmup_steps > 0 here since current_step >= 0).
            return current_step / args.warmup_steps
        # Cosine decay; clamp the span to avoid division by zero and
        # clamp progress to [0, 1] past max_steps.
        decay_span = max(1, args.max_steps - args.warmup_steps)
        progress = min(1.0, (current_step - args.warmup_steps) / decay_span)
        cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
        return floor + (1 - floor) * cosine_decay

    return LambdaLR(optimizer, lr_lambda)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def configure_optimizers(model, args):
    """Create an AdamW optimizer with selective weight decay.

    Biases, normalization weights and embeddings are exempt from weight
    decay; every other trainable parameter uses ``args.weight_decay``.

    Args:
        model: Model whose named parameters are partitioned.
        args: Namespace with ``weight_decay``, ``learning_rate``,
            ``beta1`` and ``beta2``.

    Returns:
        A configured ``AdamW`` optimizer.
    """
    decay, no_decay = [], []

    for pname, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Name-based exemption: bias / norm / embedding parameters.
        exempt = 'bias' in pname or 'norm' in pname or 'embedding' in pname
        (no_decay if exempt else decay).append(param)

    groups = [
        {'params': decay, 'weight_decay': args.weight_decay},
        {'params': no_decay, 'weight_decay': 0.0},
    ]

    return AdamW(
        groups,
        lr=args.learning_rate,
        betas=(args.beta1, args.beta2),
        eps=1e-8,
    )
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def estimate_loss(model, data_loader, device, num_batches=10):
    """Average the model loss over up to ``num_batches`` batches.

    Fixes: divide by the number of batches actually consumed (the loader
    may yield fewer than ``num_batches``, which previously skewed the
    average), and return 0.0 for an empty loader instead of dividing by
    zero. The model is restored to train mode before returning.

    Args:
        model: Callable returning a dict with a scalar ``'loss'`` entry.
        data_loader: Iterable of batches with ``'input_ids'``/``'labels'``.
        device: Device the batch tensors are moved to.
        num_batches: Maximum number of batches to evaluate.

    Returns:
        Mean loss over the consumed batches (0.0 if none).
    """
    model.eval()
    total_loss = 0.0
    seen = 0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            if i >= num_batches:
                break

            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, targets=labels)
            total_loss += outputs['loss'].item()
            seen += 1

    model.train()
    return total_loss / seen if seen else 0.0
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def get_grad_norm(model):
    """Return the global L2 norm of all parameter gradients.

    Parameters without a gradient are skipped.
    """
    squared = sum(
        p.grad.data.norm(2).item() ** 2
        for p in model.parameters()
        if p.grad is not None
    )
    return squared ** 0.5
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def clip_gradients(model, max_norm):
    """Rescale gradients in place so their global L2 norm is at most ``max_norm``."""
    params = model.parameters()
    torch.nn.utils.clip_grad_norm_(params, max_norm)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
class AverageMeter:
    """Running average tracker for scalar metrics.

    Attributes:
        val: last value passed to ``update``.
        avg: running mean, weighted by the ``n`` counts.
        sum: weighted sum of all values.
        count: total weight observed.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
class EarlyStopping:
    """Signal when validation loss has stopped improving.

    A loss must drop to ``best - min_delta`` or lower to count as an
    improvement; after ``patience`` consecutive non-improvements the
    ``early_stop`` flag flips to True and stays True.
    """

    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Feed one validation loss; return True once training should stop."""
        if self.best_loss is None:
            # First observation becomes the baseline.
            self.best_loss = val_loss
        elif val_loss <= self.best_loss - self.min_delta:
            # Improvement: record it and reset the patience counter.
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

        return self.early_stop
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def count_parameters(model):
    """Total number of trainable (``requires_grad``) parameters in ``model``."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def format_num_parameters(num_params):
    """Render a parameter count as a short human-readable string (K/M/B)."""
    for threshold, suffix in ((1e9, "B"), (1e6, "M"), (1e3, "K")):
        if num_params >= threshold:
            return f"{num_params / threshold:.2f}{suffix}"
    return str(num_params)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def get_device_info():
    """Describe the available CUDA devices, one line per GPU.

    Returns:
        A newline-joined string of "GPU i: name (X.X GB)" lines, or
        "No CUDA available" when CUDA is absent.
    """
    if not torch.cuda.is_available():
        return "No CUDA available"

    lines = []
    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        mem_gb = props.total_memory / 1e9
        lines.append(f"GPU {idx}: {props.name} ({mem_gb:.1f} GB)")

    return "\n".join(lines)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def print_model_summary(model):
    """Print a per-parameter table plus total/trainable/frozen counts."""
    print("\n" + "=" * 60)
    print("MODEL SUMMARY")
    print("=" * 60)

    total = 0
    trainable = 0

    print(f"\n{'Layer':<40} {'Parameters':>15} {'Trainable':>10}")
    print("-" * 70)

    for pname, param in model.named_parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
        flag = "Yes" if param.requires_grad else "No"
        print(f"{pname:<40} {n:>15,} {flag:>10}")

    print("-" * 70)
    print(f"{'Total':<40} {total:>15,}")
    print(f"{'Trainable':<40} {trainable:>15,}")
    print(f"{'Non-trainable':<40} {total - trainable:>15,}")
    print("=" * 60 + "\n")
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def save_training_config(args, output_path: Path):
    """Dump the argparse-style namespace ``args`` to ``output_path`` as pretty JSON."""
    with open(output_path, 'w') as fh:
        json.dump(vars(args), fh, indent=2)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def load_training_config(config_path: Path):
    """Read a JSON training configuration file and return its contents."""
    with open(config_path, 'r') as fh:
        config = json.load(fh)
    return config
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def all_reduce_dict(data: Dict, device):
    """Average the numeric values of ``data`` across all distributed processes.

    Non-numeric values pass through unchanged. Without an initialized
    process group, the dict is returned as-is.

    Args:
        data: Mapping whose int/float values are averaged.
        device: Device on which the reduction tensors are created.

    Returns:
        A dict with numeric values replaced by their cross-process mean.
    """
    if not dist.is_initialized():
        return data

    reduced = {}
    for key, value in data.items():
        if not isinstance(value, (int, float)):
            reduced[key] = value
            continue
        t = torch.tensor([value], device=device)
        dist.all_reduce(t, op=dist.ReduceOp.AVG)
        reduced[key] = t.item()

    return reduced
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def set_seed(seed: int):
    """Seed Python, NumPy and torch RNGs for reproducible runs."""
    import random
    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade speed for determinism in cuDNN kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def get_memory_usage():
    """Report current CUDA memory statistics in GB.

    Returns:
        Dict with ``allocated_gb``, ``reserved_gb`` and ``max_allocated_gb``
        (all zeros when CUDA is unavailable).
    """
    if not torch.cuda.is_available():
        return {'allocated_gb': 0, 'reserved_gb': 0, 'max_allocated_gb': 0}

    gb = 1e9
    return {
        'allocated_gb': torch.cuda.memory_allocated() / gb,
        'reserved_gb': torch.cuda.memory_reserved() / gb,
        'max_allocated_gb': torch.cuda.max_memory_allocated() / gb,
    }
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
if __name__ == "__main__":
    # Smoke-test the helpers in this module.
    logger = get_logger("test")
    logger.info("Testing logger")

    print(get_device_info())

    meter = AverageMeter()
    for value in range(10):
        meter.update(value)
    print(f"Average: {meter.avg}")
|
vocab.json
ADDED
|
@@ -0,0 +1,2002 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"<|endoftext|>": 0,
|
| 3 |
+
"<|pad|>": 1,
|
| 4 |
+
"": 2,
|
| 5 |
+
"": 3,
|
| 6 |
+
"": 4,
|
| 7 |
+
"": 5,
|
| 8 |
+
"": 6,
|
| 9 |
+
"
": 7,
|
| 10 |
+
"": 8,
|
| 11 |
+
"": 9,
|
| 12 |
+
"": 10,
|
| 13 |
+
"": 11,
|
| 14 |
+
"": 12,
|
| 15 |
+
"": 13,
|
| 16 |
+
"": 14,
|
| 17 |
+
"": 15,
|
| 18 |
+
"": 16,
|
| 19 |
+
"": 17,
|
| 20 |
+
"": 18,
|
| 21 |
+
"": 19,
|
| 22 |
+
"": 20,
|
| 23 |
+
"": 21,
|
| 24 |
+
"": 22,
|
| 25 |
+
"": 23,
|
| 26 |
+
"": 24,
|
| 27 |
+
"": 25,
|
| 28 |
+
"": 26,
|
| 29 |
+
"": 27,
|
| 30 |
+
"": 28,
|
| 31 |
+
"": 29,
|
| 32 |
+
"": 30,
|
| 33 |
+
"": 31,
|
| 34 |
+
"": 32,
|
| 35 |
+
"": 33,
|
| 36 |
+
" ": 34,
|
| 37 |
+
"¡": 35,
|
| 38 |
+
"¢": 36,
|
| 39 |
+
"£": 37,
|
| 40 |
+
"¤": 38,
|
| 41 |
+
"¥": 39,
|
| 42 |
+
"¦": 40,
|
| 43 |
+
"§": 41,
|
| 44 |
+
"¨": 42,
|
| 45 |
+
"©": 43,
|
| 46 |
+
"ª": 44,
|
| 47 |
+
"«": 45,
|
| 48 |
+
"¬": 46,
|
| 49 |
+
"": 47,
|
| 50 |
+
"®": 48,
|
| 51 |
+
"¯": 49,
|
| 52 |
+
"°": 50,
|
| 53 |
+
"±": 51,
|
| 54 |
+
"²": 52,
|
| 55 |
+
"³": 53,
|
| 56 |
+
"´": 54,
|
| 57 |
+
"µ": 55,
|
| 58 |
+
"¶": 56,
|
| 59 |
+
"·": 57,
|
| 60 |
+
"¸": 58,
|
| 61 |
+
"¹": 59,
|
| 62 |
+
"º": 60,
|
| 63 |
+
"»": 61,
|
| 64 |
+
"¼": 62,
|
| 65 |
+
"½": 63,
|
| 66 |
+
"¾": 64,
|
| 67 |
+
"¿": 65,
|
| 68 |
+
"À": 66,
|
| 69 |
+
"Á": 67,
|
| 70 |
+
"Â": 68,
|
| 71 |
+
"Ã": 69,
|
| 72 |
+
"Ä": 70,
|
| 73 |
+
"Å": 71,
|
| 74 |
+
"Æ": 72,
|
| 75 |
+
"Ç": 73,
|
| 76 |
+
"È": 74,
|
| 77 |
+
"É": 75,
|
| 78 |
+
"Ê": 76,
|
| 79 |
+
"Ë": 77,
|
| 80 |
+
"Ì": 78,
|
| 81 |
+
"Í": 79,
|
| 82 |
+
"Î": 80,
|
| 83 |
+
"Ï": 81,
|
| 84 |
+
"Ð": 82,
|
| 85 |
+
"Ñ": 83,
|
| 86 |
+
"Ò": 84,
|
| 87 |
+
"Ó": 85,
|
| 88 |
+
"Ô": 86,
|
| 89 |
+
"Õ": 87,
|
| 90 |
+
"Ö": 88,
|
| 91 |
+
"×": 89,
|
| 92 |
+
"Ø": 90,
|
| 93 |
+
"Ù": 91,
|
| 94 |
+
"Ú": 92,
|
| 95 |
+
"Û": 93,
|
| 96 |
+
"Ü": 94,
|
| 97 |
+
"Ý": 95,
|
| 98 |
+
"Þ": 96,
|
| 99 |
+
"ß": 97,
|
| 100 |
+
"à": 98,
|
| 101 |
+
"á": 99,
|
| 102 |
+
"â": 100,
|
| 103 |
+
"ã": 101,
|
| 104 |
+
"ä": 102,
|
| 105 |
+
"å": 103,
|
| 106 |
+
"æ": 104,
|
| 107 |
+
"ç": 105,
|
| 108 |
+
"è": 106,
|
| 109 |
+
"é": 107,
|
| 110 |
+
"ê": 108,
|
| 111 |
+
"ë": 109,
|
| 112 |
+
"ì": 110,
|
| 113 |
+
"í": 111,
|
| 114 |
+
"î": 112,
|
| 115 |
+
"ï": 113,
|
| 116 |
+
"ð": 114,
|
| 117 |
+
"ñ": 115,
|
| 118 |
+
"ò": 116,
|
| 119 |
+
"ó": 117,
|
| 120 |
+
"ô": 118,
|
| 121 |
+
"õ": 119,
|
| 122 |
+
"ö": 120,
|
| 123 |
+
"÷": 121,
|
| 124 |
+
"ø": 122,
|
| 125 |
+
"ù": 123,
|
| 126 |
+
"ú": 124,
|
| 127 |
+
"û": 125,
|
| 128 |
+
"ü": 126,
|
| 129 |
+
"ý": 127,
|
| 130 |
+
"þ": 128,
|
| 131 |
+
"ÿ": 129,
|
| 132 |
+
"Ā": 130,
|
| 133 |
+
"ā": 131,
|
| 134 |
+
"Ă": 132,
|
| 135 |
+
"ă": 133,
|
| 136 |
+
"Ą": 134,
|
| 137 |
+
"ą": 135,
|
| 138 |
+
"Ć": 136,
|
| 139 |
+
"ć": 137,
|
| 140 |
+
"Ĉ": 138,
|
| 141 |
+
"ĉ": 139,
|
| 142 |
+
"Ċ": 140,
|
| 143 |
+
"ċ": 141,
|
| 144 |
+
"Č": 142,
|
| 145 |
+
"č": 143,
|
| 146 |
+
"Ď": 144,
|
| 147 |
+
"ď": 145,
|
| 148 |
+
"Đ": 146,
|
| 149 |
+
"đ": 147,
|
| 150 |
+
"Ē": 148,
|
| 151 |
+
"ē": 149,
|
| 152 |
+
"Ĕ": 150,
|
| 153 |
+
"ĕ": 151,
|
| 154 |
+
"Ė": 152,
|
| 155 |
+
"ė": 153,
|
| 156 |
+
"Ę": 154,
|
| 157 |
+
"ę": 155,
|
| 158 |
+
"Ě": 156,
|
| 159 |
+
"ě": 157,
|
| 160 |
+
"Ĝ": 158,
|
| 161 |
+
"ĝ": 159,
|
| 162 |
+
"Ğ": 160,
|
| 163 |
+
"ğ": 161,
|
| 164 |
+
"Ġ": 162,
|
| 165 |
+
"ġ": 163,
|
| 166 |
+
"Ģ": 164,
|
| 167 |
+
"ģ": 165,
|
| 168 |
+
"Ĥ": 166,
|
| 169 |
+
"ĥ": 167,
|
| 170 |
+
"Ħ": 168,
|
| 171 |
+
"ħ": 169,
|
| 172 |
+
"Ĩ": 170,
|
| 173 |
+
"ĩ": 171,
|
| 174 |
+
"Ī": 172,
|
| 175 |
+
"ī": 173,
|
| 176 |
+
"Ĭ": 174,
|
| 177 |
+
"ĭ": 175,
|
| 178 |
+
"Į": 176,
|
| 179 |
+
"į": 177,
|
| 180 |
+
"İ": 178,
|
| 181 |
+
"ı": 179,
|
| 182 |
+
"IJ": 180,
|
| 183 |
+
"ij": 181,
|
| 184 |
+
"Ĵ": 182,
|
| 185 |
+
"ĵ": 183,
|
| 186 |
+
"Ķ": 184,
|
| 187 |
+
"ķ": 185,
|
| 188 |
+
"ĸ": 186,
|
| 189 |
+
"Ĺ": 187,
|
| 190 |
+
"ĺ": 188,
|
| 191 |
+
"Ļ": 189,
|
| 192 |
+
"ļ": 190,
|
| 193 |
+
"Ľ": 191,
|
| 194 |
+
"ľ": 192,
|
| 195 |
+
"Ŀ": 193,
|
| 196 |
+
"ŀ": 194,
|
| 197 |
+
"Ł": 195,
|
| 198 |
+
"ł": 196,
|
| 199 |
+
"Ń": 197,
|
| 200 |
+
"ń": 198,
|
| 201 |
+
"Ņ": 199,
|
| 202 |
+
"ņ": 200,
|
| 203 |
+
"Ň": 201,
|
| 204 |
+
"ň": 202,
|
| 205 |
+
"ʼn": 203,
|
| 206 |
+
"Ŋ": 204,
|
| 207 |
+
"ŋ": 205,
|
| 208 |
+
"Ō": 206,
|
| 209 |
+
"ō": 207,
|
| 210 |
+
"Ŏ": 208,
|
| 211 |
+
"ŏ": 209,
|
| 212 |
+
"Ő": 210,
|
| 213 |
+
"ő": 211,
|
| 214 |
+
"Œ": 212,
|
| 215 |
+
"œ": 213,
|
| 216 |
+
"Ŕ": 214,
|
| 217 |
+
"ŕ": 215,
|
| 218 |
+
"Ŗ": 216,
|
| 219 |
+
"ŗ": 217,
|
| 220 |
+
"Ř": 218,
|
| 221 |
+
"ř": 219,
|
| 222 |
+
"Ś": 220,
|
| 223 |
+
"ś": 221,
|
| 224 |
+
"Ŝ": 222,
|
| 225 |
+
"ŝ": 223,
|
| 226 |
+
"Ş": 224,
|
| 227 |
+
"ş": 225,
|
| 228 |
+
"Š": 226,
|
| 229 |
+
"š": 227,
|
| 230 |
+
"Ţ": 228,
|
| 231 |
+
"ţ": 229,
|
| 232 |
+
"Ť": 230,
|
| 233 |
+
"ť": 231,
|
| 234 |
+
"Ŧ": 232,
|
| 235 |
+
"ŧ": 233,
|
| 236 |
+
"Ũ": 234,
|
| 237 |
+
"ũ": 235,
|
| 238 |
+
"Ū": 236,
|
| 239 |
+
"ū": 237,
|
| 240 |
+
"Ŭ": 238,
|
| 241 |
+
"ŭ": 239,
|
| 242 |
+
"Ů": 240,
|
| 243 |
+
"ů": 241,
|
| 244 |
+
"Ű": 242,
|
| 245 |
+
"ű": 243,
|
| 246 |
+
"Ų": 244,
|
| 247 |
+
"ų": 245,
|
| 248 |
+
"Ŵ": 246,
|
| 249 |
+
"ŵ": 247,
|
| 250 |
+
"Ŷ": 248,
|
| 251 |
+
"ŷ": 249,
|
| 252 |
+
"Ÿ": 250,
|
| 253 |
+
"Ź": 251,
|
| 254 |
+
"ź": 252,
|
| 255 |
+
"Ż": 253,
|
| 256 |
+
"ż": 254,
|
| 257 |
+
"Ž": 255,
|
| 258 |
+
"ž": 256,
|
| 259 |
+
"ſ": 257,
|
| 260 |
+
" ": 258,
|
| 261 |
+
" ": 259,
|
| 262 |
+
" ": 260,
|
| 263 |
+
" ": 261,
|
| 264 |
+
"éî": 262,
|
| 265 |
+
"åî": 263,
|
| 266 |
+
" ": 264,
|
| 267 |
+
"óå": 265,
|
| 268 |
+
"åò": 266,
|
| 269 |
+
"½ ": 267,
|
| 270 |
+
"áô": 268,
|
| 271 |
+
"ôï": 269,
|
| 272 |
+
" ½ ": 270,
|
| 273 |
+
"¬ ": 271,
|
| 274 |
+
"áò": 272,
|
| 275 |
+
"äå": 273,
|
| 276 |
+
"ïò": 274,
|
| 277 |
+
"ìæ": 275,
|
| 278 |
+
"óåìæ": 276,
|
| 279 |
+
" ": 277,
|
| 280 |
+
"ôå": 278,
|
| 281 |
+
"¢¢": 279,
|
| 282 |
+
"òå": 280,
|
| 283 |
+
"áì": 281,
|
| 284 |
+
"óåìæ®": 282,
|
| 285 |
+
"ëåî": 283,
|
| 286 |
+
"ïî": 284,
|
| 287 |
+
"ãè": 285,
|
| 288 |
+
"óô": 286,
|
| 289 |
+
"ôïëåî": 287,
|
| 290 |
+
"ìï": 288,
|
| 291 |
+
"éú": 289,
|
| 292 |
+
"ôé": 290,
|
| 293 |
+
"© ": 291,
|
| 294 |
+
"ìå": 292,
|
| 295 |
+
"æ ": 293,
|
| 296 |
+
"ô¨": 294,
|
| 297 |
+
"º ": 295,
|
| 298 |
+
"© ": 296,
|
| 299 |
+
"áä": 297,
|
| 300 |
+
"ó®": 298,
|
| 301 |
+
"äé": 299,
|
| 302 |
+
"º ": 300,
|
| 303 |
+
"õô": 301,
|
| 304 |
+
"íð": 302,
|
| 305 |
+
"ô ": 303,
|
| 306 |
+
"áòç": 304,
|
| 307 |
+
"òá": 305,
|
| 308 |
+
"ðò": 306,
|
| 309 |
+
"áâ": 307,
|
| 310 |
+
"ïäå": 308,
|
| 311 |
+
"¬ ": 309,
|
| 312 |
+
"¢¢¢": 310,
|
| 313 |
+
"éä": 311,
|
| 314 |
+
"ó ": 312,
|
| 315 |
+
"íá": 313,
|
| 316 |
+
"éîç": 314,
|
| 317 |
+
"ðáò": 315,
|
| 318 |
+
"çå": 316,
|
| 319 |
+
"ðå": 317,
|
| 320 |
+
"éúåò": 318,
|
| 321 |
+
"áôá": 319,
|
| 322 |
+
"ôåø": 320,
|
| 323 |
+
"ãå": 321,
|
| 324 |
+
"ó ½ ": 322,
|
| 325 |
+
"áí": 323,
|
| 326 |
+
"ìåî": 324,
|
| 327 |
+
"õí": 325,
|
| 328 |
+
"��": 326,
|
| 329 |
+
"áôå": 327,
|
| 330 |
+
"éæ ": 328,
|
| 331 |
+
"ïã": 329,
|
| 332 |
+
"éô": 330,
|
| 333 |
+
"öïã": 331,
|
| 334 |
+
"öïãáâ": 332,
|
| 335 |
+
"£ ": 333,
|
| 336 |
+
"ßéä": 334,
|
| 337 |
+
"ôåøô": 335,
|
| 338 |
+
"ïäåì": 336,
|
| 339 |
+
"æïò": 337,
|
| 340 |
+
"éîô": 338,
|
| 341 |
+
"éîô¨": 339,
|
| 342 |
+
"äß": 340,
|
| 343 |
+
"äáôá": 341,
|
| 344 |
+
"áòçó®": 342,
|
| 345 |
+
"ðõô": 343,
|
| 346 |
+
"òãè": 344,
|
| 347 |
+
"©º ": 345,
|
| 348 |
+
"© ": 346,
|
| 349 |
+
"ôïòãè": 347,
|
| 350 |
+
"æé": 348,
|
| 351 |
+
"íïäåì": 349,
|
| 352 |
+
"ßôïëåî": 350,
|
| 353 |
+
"ðòéîô¨": 351,
|
| 354 |
+
"òï": 352,
|
| 355 |
+
"ôïòãè®": 353,
|
| 356 |
+
"äåæ ": 354,
|
| 357 |
+
"íáø": 355,
|
| 358 |
+
"éî ": 356,
|
| 359 |
+
"õå": 357,
|
| 360 |
+
"åì": 358,
|
| 361 |
+
"ôïëåîéúåò": 359,
|
| 362 |
+
"õò": 360,
|
| 363 |
+
"öáì": 361,
|
| 364 |
+
"éúå": 362,
|
| 365 |
+
"ä ": 363,
|
| 366 |
+
"ßó": 364,
|
| 367 |
+
"ðáòáí": 365,
|
| 368 |
+
"ßß": 366,
|
| 369 |
+
"æïò ": 367,
|
| 370 |
+
"© ": 368,
|
| 371 |
+
"ïõô": 369,
|
| 372 |
+
"äéí": 370,
|
| 373 |
+
"º ": 371,
|
| 374 |
+
"óôò": 372,
|
| 375 |
+
"ôéïî": 373,
|
| 376 |
+
"© ": 374,
|
| 377 |
+
"öé": 375,
|
| 378 |
+
"æ¢": 376,
|
| 379 |
+
"óåô": 377,
|
| 380 |
+
" óåìæ®": 378,
|
| 381 |
+
"ôòá": 379,
|
| 382 |
+
"õì": 380,
|
| 383 |
+
"õî": 381,
|
| 384 |
+
"áî": 382,
|
| 385 |
+
"íáøß": 383,
|
| 386 |
+
"÷ïò": 384,
|
| 387 |
+
"ìïç": 385,
|
| 388 |
+
"ôåò": 386,
|
| 389 |
+
"î ": 387,
|
| 390 |
+
"§¬ ": 388,
|
| 391 |
+
"òåô": 389,
|
| 392 |
+
"òåôõò": 390,
|
| 393 |
+
"§º ": 391,
|
| 394 |
+
"òåôõòî ": 392,
|
| 395 |
+
"¬ ": 393,
|
| 396 |
+
"ðá": 394,
|
| 397 |
+
"ðòï": 395,
|
| 398 |
+
"éîç ": 396,
|
| 399 |
+
" ": 397,
|
| 400 |
+
" éî ": 398,
|
| 401 |
+
"ôòáéî": 399,
|
| 402 |
+
"òáî": 400,
|
| 403 |
+
"öéãå": 401,
|
| 404 |
+
" ": 402,
|
| 405 |
+
"èå": 403,
|
| 406 |
+
"ðåî": 404,
|
| 407 |
+
"ôù": 405,
|
| 408 |
+
"ðï": 406,
|
| 409 |
+
"ßë": 407,
|
| 410 |
+
"ðòéîô¨æ¢": 408,
|
| 411 |
+
"ðõôßéä": 409,
|
| 412 |
+
"äåöéãå": 410,
|
| 413 |
+
"ãï": 411,
|
| 414 |
+
"áôè": 412,
|
| 415 |
+
"öå": 413,
|
| 416 |
+
"ãïî": 414,
|
| 417 |
+
"éíð": 415,
|
| 418 |
+
"ó¨": 416,
|
| 419 |
+
"ìïó": 417,
|
| 420 |
+
"º ": 418,
|
| 421 |
+
"© ": 419,
|
| 422 |
+
"îõí": 420,
|
| 423 |
+
"óÛ": 421,
|
| 424 |
+
"§Ý": 422,
|
| 425 |
+
"ôéí": 423,
|
| 426 |
+
"éíðïò": 424,
|
| 427 |
+
"éíðïòô ": 425,
|
| 428 |
+
"": 426,
|
| 429 |
+
"åä": 427,
|
| 430 |
+
"ôïð": 428,
|
| 431 |
+
"îå": 429,
|
| 432 |
+
"óåò": 430,
|
| 433 |
+
"èåáä": 431,
|
| 434 |
+
"éîðõôßéä": 432,
|
| 435 |
+
"åîåò": 433,
|
| 436 |
+
"í ": 434,
|
| 437 |
+
"°°": 435,
|
| 438 |
+
"ùôå": 436,
|
| 439 |
+
"ìïáä": 437,
|
| 440 |
+
"ô¨§": 438,
|
| 441 |
+
"éã": 439,
|
| 442 |
+
"íåò": 440,
|
| 443 |
+
"éóô": 441,
|
| 444 |
+
"®¢¢¢": 442,
|
| 445 |
+
"áôé": 443,
|
| 446 |
+
"éôè": 444,
|
| 447 |
+
"ßóéúå": 445,
|
| 448 |
+
"º ¢¢¢": 446,
|
| 449 |
+
"ä¨": 447,
|
| 450 |
+
"÷éôè": 448,
|
| 451 |
+
"õìô": 449,
|
| 452 |
+
"ª ": 450,
|
| 453 |
+
"« ": 451,
|
| 454 |
+
"óß": 452,
|
| 455 |
+
"æéç": 453,
|
| 456 |
+
"¬ ": 454,
|
| 457 |
+
"ý¢": 455,
|
| 458 |
+
"åø": 456,
|
| 459 |
+
"ìïçç": 457,
|
| 460 |
+
"ôåð": 458,
|
| 461 |
+
"éç": 459,
|
| 462 |
+
"áíð": 460,
|
| 463 |
+
"ðáôè": 461,
|
| 464 |
+
"áääß": 462,
|
| 465 |
+
"ìåî¨": 463,
|
| 466 |
+
"ôè": 464,
|
| 467 |
+
"æòï": 465,
|
| 468 |
+
"ìá": 466,
|
| 469 |
+
"ßðáòáí": 467,
|
| 470 |
+
"îå÷": 468,
|
| 471 |
+
"ðáòóåò": 469,
|
| 472 |
+
"æá": 470,
|
| 473 |
+
"íåòçå": 471,
|
| 474 |
+
"ãïäå": 472,
|
| 475 |
+
"æòïí ": 473,
|
| 476 |
+
"º éîô": 474,
|
| 477 |
+
"ïîå": 475,
|
| 478 |
+
" ": 476,
|
| 479 |
+
"¬ ": 477,
|
| 480 |
+
"çåîåò": 478,
|
| 481 |
+
"õíåî": 479,
|
| 482 |
+
"äåæá": 480,
|
| 483 |
+
"äåæáõìô": 481,
|
| 484 |
+
"÷ïòä": 482,
|
| 485 |
+
"ðòïíð": 483,
|
| 486 |
+
"áôéïî": 484,
|
| 487 |
+
"ó½": 485,
|
| 488 |
+
"ïõôðõô": 486,
|
| 489 |
+
"ïð": 487,
|
| 490 |
+
"ðáòóåò®": 488,
|
| 491 |
+
"æéìå": 489,
|
| 492 |
+
"ôåî": 490,
|
| 493 |
+
"ßäé": 491,
|
| 494 |
+
"ìåîç": 492,
|
| 495 |
+
"ìåîçôè": 493,
|
| 496 |
+
"âùôå": 494,
|
| 497 |
+
"©º ¢¢¢": 495,
|
| 498 |
+
"óåìæ¬ ": 496,
|
| 499 |
+
"óåñ": 497,
|
| 500 |
+
"óïò": 498,
|
| 501 |
+
"öïãáâßóéúå": 499,
|
| 502 |
+
"ãìå": 500,
|
| 503 |
+
"ãèå": 501,
|
| 504 |
+
"ðáòóåò®áääß": 502,
|
| 505 |
+
"ðáòóåò®áääßáòç": 503,
|
| 506 |
+
"ðáòóåò®áääßáòçõíåî": 504,
|
| 507 |
+
"ðáòóåò®áääßáòçõíåîô¨§": 505,
|
| 508 |
+
"ðáòóåò®áääßáòçõíåîô¨§": 506,
|
| 509 |
+
"ðáòóåò®áääßáòçõíåîô¨§": 507,
|
| 510 |
+
"õð": 508,
|
| 511 |
+
"ãé": 509,
|
| 512 |
+
"ôß": 510,
|
| 513 |
+
"Îïîå": 511,
|
| 514 |
+
"ßèåáä": 512,
|
| 515 |
+
"ãë": 513,
|
| 516 |
+
"÷éôè ": 514,
|
| 517 |
+
"ãïîæéç": 515,
|
| 518 |
+
"®ó": 516,
|
| 519 |
+
"õîë": 517,
|
| 520 |
+
"ìïççåò": 518,
|
| 521 |
+
"äåæáõìô½": 519,
|
| 522 |
+
"ôï ": 520,
|
| 523 |
+
"֌": 521,
|
| 524 |
+
"öáìõå": 522,
|
| 525 |
+
"º ": 523,
|
| 526 |
+
"© £ ": 524,
|
| 527 |
+
"äáôáóåô": 525,
|
| 528 |
+
"äõ": 526,
|
| 529 |
+
" äåæ ": 527,
|
| 530 |
+
"æìï": 528,
|
| 531 |
+
"æìïáô": 529,
|
| 532 |
+
"òõå": 530,
|
| 533 |
+
" « ": 531,
|
| 534 |
+
"§¬ ôù": 532,
|
| 535 |
+
"§¬ ôùðå": 533,
|
| 536 |
+
"§¬ ôùðå½": 534,
|
| 537 |
+
"© ðáòóåò®áääßáòçõíåîô¨§": 535,
|
| 538 |
+
"ðòéîô¨¢": 536,
|
| 539 |
+
"îî": 537,
|
| 540 |
+
"÷åéç": 538,
|
| 541 |
+
"÷åéçè": 539,
|
| 542 |
+
"¯ ": 540,
|
| 543 |
+
"óã": 541,
|
| 544 |
+
"âáô": 542,
|
| 545 |
+
"© ": 543,
|
| 546 |
+
"äåæ ": 544,
|
| 547 |
+
"¬ äåæáõìô½": 545,
|
| 548 |
+
"ðåãé": 546,
|
| 549 |
+
"íïäåì®": 547,
|
| 550 |
+
"áíå": 548,
|
| 551 |
+
"íâ": 549,
|
| 552 |
+
"áð": 550,
|
| 553 |
+
"çåîåòáôå": 551,
|
| 554 |
+
"áòôé": 552,
|
| 555 |
+
"îî®": 553,
|
| 556 |
+
"ïòí": 554,
|
| 557 |
+
"åò¨": 555,
|
| 558 |
+
" ª ": 556,
|
| 559 |
+
" ": 557,
|
| 560 |
+
"ùß": 558,
|
| 561 |
+
" ½ óåìæ®": 559,
|
| 562 |
+
" ": 560,
|
| 563 |
+
"ßôïëåîó": 561,
|
| 564 |
+
"åòáô": 562,
|
| 565 |
+
"½½ ": 563,
|
| 566 |
+
"åóô": 564,
|
| 567 |
+
"óôòé": 565,
|
| 568 |
+
"áòôéãìå": 566,
|
| 569 |
+
"óðåãé": 567,
|
| 570 |
+
"õòå": 568,
|
| 571 |
+
"ù ": 569,
|
| 572 |
+
"®°": 570,
|
| 573 |
+
"ãá": 571,
|
| 574 |
+
"ãèõîë": 572,
|
| 575 |
+
"óë": 573,
|
| 576 |
+
"îï": 574,
|
| 577 |
+
"ìå ": 575,
|
| 578 |
+
"ãèåãë": 576,
|
| 579 |
+
"ãèåãëðï": 577,
|
| 580 |
+
"ßìïó": 578,
|
| 581 |
+
"ïðôéí": 579,
|
| 582 |
+
"ôïëåîéúåò®": 580,
|
| 583 |
+
"ßߨ": 581,
|
| 584 |
+
"Ôòõå": 582,
|
| 585 |
+
"ó¬ ": 583,
|
| 586 |
+
"éîæ": 584,
|
| 587 |
+
"ßôïëåîßéä": 585,
|
| 588 |
+
"Äáôá": 586,
|
| 589 |
+
"òáîë": 587,
|
| 590 |
+
"óðåãéáì": 588,
|
| 591 |
+
"åôåò": 589,
|
| 592 |
+
"º éîô ½ ": 590,
|
| 593 |
+
"íáóë": 591,
|
| 594 |
+
"óÛ§": 592,
|
| 595 |
+
"®®": 593,
|
| 596 |
+
"º û": 594,
|
| 597 |
+
"éë": 595,
|
| 598 |
+
"ïðôéíéúåò": 596,
|
| 599 |
+
"ý ": 597,
|
| 600 |
+
"éîéô": 598,
|
| 601 |
+
"÷åéçèô": 599,
|
| 602 |
+
" ½ °": 600,
|
| 603 |
+
"áöå": 601,
|
| 604 |
+
"åô": 602,
|
| 605 |
+
"áôå ": 603,
|
| 606 |
+
"±Ý": 604,
|
| 607 |
+
"áîä ": 605,
|
| 608 |
+
" ½½ ": 606,
|
| 609 |
+
"Ôï": 607,
|
| 610 |
+
"Û§": 608,
|
| 611 |
+
"íáøßìåîçôè": 609,
|
| 612 |
+
"Öéã": 610,
|
| 613 |
+
"ÖéãÁ": 611,
|
| 614 |
+
"ÖéãÁÉ": 612,
|
| 615 |
+
"© óåìæ®": 613,
|
| 616 |
+
"ïó": 614,
|
| 617 |
+
"óåñß": 615,
|
| 618 |
+
"óåñßìåî": 616,
|
| 619 |
+
"éëé": 617,
|
| 620 |
+
"éò": 618,
|
| 621 |
+
"âùôåß": 619,
|
| 622 |
+
"ðòïíðô": 620,
|
| 623 |
+
"éôå": 621,
|
| 624 |
+
"ÐÅ": 622,
|
| 625 |
+
"òåó": 623,
|
| 626 |
+
"íï": 624,
|
| 627 |
+
"âáôãè": 625,
|
| 628 |
+
"© ": 626,
|
| 629 |
+
"á ": 627,
|
| 630 |
+
"Ôïëåî": 628,
|
| 631 |
+
"áã": 629,
|
| 632 |
+
"ìáâ": 630,
|
| 633 |
+
"Üî": 631,
|
| 634 |
+
"áó ": 632,
|
| 635 |
+
"©®": 633,
|
| 636 |
+
"ßäéí": 634,
|
| 637 |
+
"ðáóô": 635,
|
| 638 |
+
"ôéïîß": 636,
|
| 639 |
+
"éîæï": 637,
|
| 640 |
+
"çåôß": 638,
|
| 641 |
+
"åìóå": 639,
|
| 642 |
+
"ôéíå": 640,
|
| 643 |
+
"½§": 641,
|
| 644 |
+
"ðáéò": 642,
|
| 645 |
+
"÷áò": 643,
|
| 646 |
+
"òåñ": 644,
|
| 647 |
+
"âõ": 645,
|
| 648 |
+
"èá": 646,
|
| 649 |
+
"äòï": 647,
|
| 650 |
+
"ôáì": 648,
|
| 651 |
+
"¬ §": 649,
|
| 652 |
+
"ôåíð": 650,
|
| 653 |
+
"ÂÐÅ": 651,
|
| 654 |
+
"ãèåãëðïéîô": 652,
|
| 655 |
+
"îõíß": 653,
|
| 656 |
+
"ìïççåò®": 654,
|
| 657 |
+
"äï": 655,
|
| 658 |
+
"óåìæ®öïãáâ": 656,
|
| 659 |
+
"åîãïäå": 657,
|
| 660 |
+
"© äåæ ": 658,
|
| 661 |
+
" ½ ôïòãè®": 659,
|
| 662 |
+
"éî¨": 660,
|
| 663 |
+
"Ý ½ ": 661,
|
| 664 |
+
"ðáóôßë": 662,
|
| 665 |
+
"«½ ": 663,
|
| 666 |
+
"áððåî": 664,
|
| 667 |
+
"áððåîä¨": 665,
|
| 668 |
+
"çò": 666,
|
| 669 |
+
"åòáôõòå": 667,
|
| 670 |
+
"¾ ": 668,
|
| 671 |
+
"®¢¢¢ ": 669,
|
| 672 |
+
"îáíå": 670,
|
| 673 |
+
"ìé": 671,
|
| 674 |
+
"ìåò": 672,
|
| 675 |
+
"Ôïëåîéúåò": 673,
|
| 676 |
+
"ìïã": 674,
|
| 677 |
+
"äá": 675,
|
| 678 |
+
"óãáì": 676,
|
| 679 |
+
"óãáìåò": 677,
|
| 680 |
+
"óôåð": 678,
|
| 681 |
+
"áö": 679,
|
| 682 |
+
"º óôò": 680,
|
| 683 |
+
"ãïä": 681,
|
| 684 |
+
"ðôéïî": 682,
|
| 685 |
+
"ßßéîéô": 683,
|
| 686 |
+
"ßßéîéôßߨ": 684,
|
| 687 |
+
"åùß": 685,
|
| 688 |
+
"åùßöáìõå": 686,
|
| 689 |
+
"ôïôáì": 687,
|
| 690 |
+
"©º ": 688,
|
| 691 |
+
"éîðõôßéäó": 689,
|
| 692 |
+
"ó ½ Û": 690,
|
| 693 |
+
"ôåíðåòáôõòå": 691,
|
| 694 |
+
"Ìï": 692,
|
| 695 |
+
"éëéðå": 693,
|
| 696 |
+
"éëéðåäé": 694,
|
| 697 |
+
"Éî": 695,
|
| 698 |
+
"óåß": 696,
|
| 699 |
+
"ôéîç": 697,
|
| 700 |
+
"æòïí ": 698,
|
| 701 |
+
"çå¨": 699,
|
| 702 |
+
"äòïð": 700,
|
| 703 |
+
"òåðå": 701,
|
| 704 |
+
"îïòí": 702,
|
| 705 |
+
"ìïçéô": 703,
|
| 706 |
+
"óéúå": 704,
|
| 707 |
+
"çòáä": 705,
|
| 708 |
+
"íáøßîå÷": 706,
|
| 709 |
+
"ßð": 707,
|
| 710 |
+
"ðåîáì": 708,
|
| 711 |
+
"ðåîáìôù": 709,
|
| 712 |
+
"óôòéâõ": 710,
|
| 713 |
+
"óôòéâõôå": 711,
|
| 714 |
+
"ãõ": 712,
|
| 715 |
+
"óáíð": 713,
|
| 716 |
+
"󮢢¢": 714,
|
| 717 |
+
"ôåøô ½ ": 715,
|
| 718 |
+
"áîä": 716,
|
| 719 |
+
"óåôôéîç": 717,
|
| 720 |
+
"ãô": 718,
|
| 721 |
+
"ðáòáíåôåò": 719,
|
| 722 |
+
"äòïðïõô": 720,
|
| 723 |
+
"îïô ": 721,
|
| 724 |
+
"îõíßðáòáí": 722,
|
| 725 |
+
"¨§": 723,
|
| 726 |
+
" ": 724,
|
| 727 |
+
"©º ": 725,
|
| 728 |
+
"×éëéðåäé": 726,
|
| 729 |
+
"ìïáäåò": 727,
|
| 730 |
+
"ßóôåð": 728,
|
| 731 |
+
"ìáâåì": 729,
|
| 732 |
+
"âåóô": 730,
|
| 733 |
+
"éîæï¨": 731,
|
| 734 |
+
"äõìå": 732,
|
| 735 |
+
"ùåò": 733,
|
| 736 |
+
"íâåä": 734,
|
| 737 |
+
"±°°": 735,
|
| 738 |
+
"¨óåìæ¬ ": 736,
|
| 739 |
+
"ðáóôßëåùßöáìõå": 737,
|
| 740 |
+
"º®": 738,
|
| 741 |
+
"Ãòå": 739,
|
| 742 |
+
"ôò": 740,
|
| 743 |
+
"æïò é": 741,
|
| 744 |
+
"òåðåôé": 742,
|
| 745 |
+
"ôåøô ": 743,
|
| 746 |
+
"ãõäá": 744,
|
| 747 |
+
"éôåí": 745,
|
| 748 |
+
"¬ §": 746,
|
| 749 |
+
"ßäéò": 747,
|
| 750 |
+
"öåò": 748,
|
| 751 |
+
"ßðáôè": 749,
|
| 752 |
+
"¢© ": 750,
|
| 753 |
+
"ãïäåò": 751,
|
| 754 |
+
"óó ": 752,
|
| 755 |
+
"îßèåáä": 753,
|
| 756 |
+
"îßë": 754,
|
| 757 |
+
"îßëö": 755,
|
| 758 |
+
"îßëößèåáä": 756,
|
| 759 |
+
"îß": 757,
|
| 760 |
+
"éîå": 758,
|
| 761 |
+
"éó ": 759,
|
| 762 |
+
"äåî": 760,
|
| 763 |
+
"áôéïî ": 761,
|
| 764 |
+
"ïì": 762,
|
| 765 |
+
"ó ": 763,
|
| 766 |
+
"Ìéóô": 764,
|
| 767 |
+
"òáîçå¨": 765,
|
| 768 |
+
"¬ äåöéãå": 766,
|
| 769 |
+
"ôïðßë": 767,
|
| 770 |
+
"ôïðßð": 768,
|
| 771 |
+
"ïóßôïëåîßéä": 769,
|
| 772 |
+
"öåì": 770,
|
| 773 |
+
"ôïß": 771,
|
| 774 |
+
"Ìïáä": 772,
|
| 775 |
+
"ìò": 773,
|
| 776 |
+
"÷ïòì": 774,
|
| 777 |
+
"áâìå": 775,
|
| 778 |
+
"ôòáéîß": 776,
|
| 779 |
+
"óôáò": 777,
|
| 780 |
+
"ìïççåò®éîæï¨": 778,
|
| 781 |
+
"äåãá": 779,
|
| 782 |
+
"óðåãéáìßôïëåî": 780,
|
| 783 |
+
"®ê": 781,
|
| 784 |
+
" ": 782,
|
| 785 |
+
"Û±Ý": 783,
|
| 786 |
+
"óôáôå": 784,
|
| 787 |
+
"óôáôåßäé": 785,
|
| 788 |
+
"áçå": 786,
|
| 789 |
+
"éíðïòô ": 787,
|
| 790 |
+
"óº ": 788,
|
| 791 |
+
"±å": 789,
|
| 792 |
+
"äéîç": 790,
|
| 793 |
+
"èéä": 791,
|
| 794 |
+
"èéääåî": 792,
|
| 795 |
+
" éî òáîçå¨": 793,
|
| 796 |
+
"ìéóô": 794,
|
| 797 |
+
"°Ý": 795,
|
| 798 |
+
"ðõ": 796,
|
| 799 |
+
"©äåæ ": 797,
|
| 800 |
+
" ½½ °": 798,
|
| 801 |
+
"éæ áòçó®": 799,
|
| 802 |
+
"ôåîóïò": 800,
|
| 803 |
+
"§¬ ôùðå½éîô": 801,
|
| 804 |
+
"§¬ ôùðå½éîô¬ äåæáõìô½": 802,
|
| 805 |
+
"äåãáù": 803,
|
| 806 |
+
"½áòçó®": 804,
|
| 807 |
+
"îå÷ß": 805,
|
| 808 |
+
"íåòçåó": 806,
|
| 809 |
+
"Ïðôéïî": 807,
|
| 810 |
+
"Ïðôéïîáì": 808,
|
| 811 |
+
"ã": 809,
|
| 812 |
+
"ãìá": 810,
|
| 813 |
+
"ãìáóó ": 811,
|
| 814 |
+
" äåæ ßßéîéôßߨ": 812,
|
| 815 |
+
"±": 813,
|
| 816 |
+
"óõí": 814,
|
| 817 |
+
"Æáì": 815,
|
| 818 |
+
"Æáìóå": 816,
|
| 819 |
+
"èáðå": 817,
|
| 820 |
+
"³²": 818,
|
| 821 |
+
"ìáùåò": 819,
|
| 822 |
+
"ãïîæéç®": 820,
|
| 823 |
+
"ó ½ ÛÝ": 821,
|
| 824 |
+
"ó®áððåîä¨": 822,
|
| 825 |
+
"åîô": 823,
|
| 826 |
+
" áîä ": 824,
|
| 827 |
+
"Ôòá": 825,
|
| 828 |
+
" éíðïòô ": 826,
|
| 829 |
+
"áöç": 827,
|
| 830 |
+
" §": 828,
|
| 831 |
+
" ½ û": 829,
|
| 832 |
+
"é « ": 830,
|
| 833 |
+
"®êï": 831,
|
| 834 |
+
"®êïéî¨": 832,
|
| 835 |
+
"òïò": 833,
|
| 836 |
+
"óåôôéîçóÛ§": 834,
|
| 837 |
+
"çåîåòáôåäß": 835,
|
| 838 |
+
" ½ îî®": 836,
|
| 839 |
+
"äéí½": 837,
|
| 840 |
+
"íáøßóåñßìåî": 838,
|
| 841 |
+
"âá": 839,
|
| 842 |
+
"¨óåìæ®": 840,
|
| 843 |
+
"ó ½ ôïòãè®": 841,
|
| 844 |
+
"ãèåä": 842,
|
| 845 |
+
"áôôåî": 843,
|
| 846 |
+
"èéääåîßäéí": 844,
|
| 847 |
+
"Ãïî": 845,
|
| 848 |
+
"®¢¢¢ ": 846,
|
| 849 |
+
"Ãòåáôå ": 847,
|
| 850 |
+
"®ôï": 848,
|
| 851 |
+
"íáøßîå÷ßôïëåîó": 849,
|
| 852 |
+
"òåðåôéôéïîß": 850,
|
| 853 |
+
"òåðåôéôéïîßðåîáìôù": 851,
|
| 854 |
+
"åïóßôïëåîßéä": 852,
|
| 855 |
+
"ôïëåîßéä": 853,
|
| 856 |
+
"óßôïß": 854,
|
| 857 |
+
"íïöå": 855,
|
| 858 |
+
"áôôåò": 856,
|
| 859 |
+
"© ": 857,
|
| 860 |
+
"© ¾ ": 858,
|
| 861 |
+
"ÂÐÅÔïëåîéúåò": 859,
|
| 862 |
+
"ìåß": 860,
|
| 863 |
+
"óáöå": 861,
|
| 864 |
+
"ïó®": 862,
|
| 865 |
+
"÷ïòìäß": 863,
|
| 866 |
+
"÷ïòìäßóéúå": 864,
|
| 867 |
+
"ãõäá®": 865,
|
| 868 |
+
"¬ áòçó®": 866,
|
| 869 |
+
"óôáòô": 867,
|
| 870 |
+
"ðáä": 868,
|
| 871 |
+
"ãô¨": 869,
|
| 872 |
+
"é ": 870,
|
| 873 |
+
"èáîä": 871,
|
| 874 |
+
"ðáçå": 872,
|
| 875 |
+
"Åò": 873,
|
| 876 |
+
"Åòòïò": 874,
|
| 877 |
+
"áì ": 875,
|
| 878 |
+
"ó óåìæ®": 876,
|
| 879 |
+
"íâåääéîç": 877,
|
| 880 |
+
"º æìïáô": 878,
|
| 881 |
+
"æòåñ": 879,
|
| 882 |
+
"éîß": 880,
|
| 883 |
+
"ó ": 881,
|
| 884 |
+
"âé": 882,
|
| 885 |
+
"éîôåò": 883,
|
| 886 |
+
"ó§Ý": 884,
|
| 887 |
+
"åô ": 885,
|
| 888 |
+
"æïò é éî òáîçå¨": 886,
|
| 889 |
+
"óåô¨": 887,
|
| 890 |
+
"éîäé": 888,
|
| 891 |
+
"®çå": 889,
|
| 892 |
+
"öáìßìïó": 890,
|
| 893 |
+
"ðô ": 891,
|
| 894 |
+
"º ": 892,
|
| 895 |
+
"ïõôðõôßäéò": 893,
|
| 896 |
+
"§© ðáòóåò®áääßáòçõíåîô¨§": 894,
|
| 897 |
+
"×éëéðåäéá ": 895,
|
| 898 |
+
"ÌéóôÛ": 896,
|
| 899 |
+
"íáò": 897,
|
| 900 |
+
"èáîäìåò": 898,
|
| 901 |
+
"¢ ª ": 899,
|
| 902 |
+
"Íïäåì": 900,
|
| 903 |
+
"íå": 901,
|
| 904 |
+
"ëå": 902,
|
| 905 |
+
"ó© ": 903,
|
| 906 |
+
"ææ": 904,
|
| 907 |
+
"¬ º": 905,
|
| 908 |
+
"ßå": 906,
|
| 909 |
+
"ó ½ óåìæ®": 907,
|
| 910 |
+
" £ ": 908,
|
| 911 |
+
"©º ¢¢¢": 909,
|
| 912 |
+
"ãïõî": 910,
|
| 913 |
+
"íïäåì ": 911,
|
| 914 |
+
"ó¢": 912,
|
| 915 |
+
"áìß": 913,
|
| 916 |
+
"½Ôòõå": 914,
|
| 917 |
+
"ìéóô¨": 915,
|
| 918 |
+
"éîäéãå": 916,
|
| 919 |
+
"®®®": 917,
|
| 920 |
+
"áìì": 918,
|
| 921 |
+
"ãòå": 919,
|
| 922 |
+
"© ðòéîô¨æ¢": 920,
|
| 923 |
+
"äéóôòéâõôå": 921,
|
| 924 |
+
"äéóô": 922,
|
| 925 |
+
"ãïò": 923,
|
| 926 |
+
"¨íïäåì": 924,
|
| 927 |
+
"ðïãè": 925,
|
| 928 |
+
"åøãå": 926,
|
| 929 |
+
"åøãåðô ": 927,
|
| 930 |
+
"ïðåî": 928,
|
| 931 |
+
"óù": 929,
|
| 932 |
+
"º ¢¢¢": 930,
|
| 933 |
+
"äåãïäå": 931,
|
| 934 |
+
"ãïîô": 932,
|
| 935 |
+
"¬ §": 933,
|
| 936 |
+
"åìð": 934,
|
| 937 |
+
"ÖéãÁÉ ": 935,
|
| 938 |
+
"éíðïòô ": 936,
|
| 939 |
+
"÷áòä¨": 937,
|
| 940 |
+
"±®°": 938,
|
| 941 |
+
" ¯ ": 939,
|
| 942 |
+
"äéí¬ ": 940,
|
| 943 |
+
"ó©": 941,
|
| 944 |
+
" óåìæ": 942,
|
| 945 |
+
" óåìæ¬ ": 943,
|
| 946 |
+
"òåð": 944,
|
| 947 |
+
"ôïòãè®Ô": 945,
|
| 948 |
+
"ôïòãè®Ôåî": 946,
|
| 949 |
+
"ôïòãè®Ôåîóïò": 947,
|
| 950 |
+
"ÏðôéïîáìÛ": 948,
|
| 951 |
+
"¨Û": 949,
|
| 952 |
+
"© £ ": 950,
|
| 953 |
+
"°°°": 951,
|
| 954 |
+
"©º óåìæ®": 952,
|
| 955 |
+
"óº ": 953,
|
| 956 |
+
"¯ ±å": 954,
|
| 957 |
+
"º®²": 955,
|
| 958 |
+
"íïäõìå": 956,
|
| 959 |
+
"°®°": 957,
|
| 960 |
+
"ó§º ": 958,
|
| 961 |
+
"îïß": 959,
|
| 962 |
+
" éæ ": 960,
|
| 963 |
+
"ë ": 961,
|
| 964 |
+
"¼ ": 962,
|
| 965 |
+
"© £ ": 963,
|
| 966 |
+
"Ìå": 964,
|
| 967 |
+
"óåô ": 965,
|
| 968 |
+
"Äáôáóåô": 966,
|
| 969 |
+
"ãïòðõ": 967,
|
| 970 |
+
"éóß": 968,
|
| 971 |
+
"ôòù": 969,
|
| 972 |
+
"¬ �� ": 970,
|
| 973 |
+
"åøéóô": 971,
|
| 974 |
+
"Ìïáä ": 972,
|
| 975 |
+
"ôïëåîéúåò ": 973,
|
| 976 |
+
"§© ": 974,
|
| 977 |
+
"éôåíó¨": 975,
|
| 978 |
+
"§Ý óåìæ®": 976,
|
| 979 |
+
"òáîäï": 977,
|
| 980 |
+
"¢ ª ¶": 978,
|
| 981 |
+
"¢ ª ¶°": 979,
|
| 982 |
+
"èåìð": 980,
|
| 983 |
+
"èåìð½§": 981,
|
| 984 |
+
"ñõ": 982,
|
| 985 |
+
"®¢¢¢ äåæ ßßéîéôßߨ": 983,
|
| 986 |
+
"ôéïî ": 984,
|
| 987 |
+
"¬ ë": 985,
|
| 988 |
+
"èåáäßäéí": 986,
|
| 989 |
+
"Ìéîå": 987,
|
| 990 |
+
"Ìéîåáò": 988,
|
| 991 |
+
"âéá": 989,
|
| 992 |
+
"áôô": 990,
|
| 993 |
+
"¨ø": 991,
|
| 994 |
+
"Òå": 992,
|
| 995 |
+
"áôôåîôéïîß": 993,
|
| 996 |
+
"º®²æ": 994,
|
| 997 |
+
"º®²æý": 995,
|
| 998 |
+
"óèáðå": 996,
|
| 999 |
+
"¨éîðõôßéä": 997,
|
| 1000 |
+
" §": 998,
|
| 1001 |
+
"¬ ý": 999,
|
| 1002 |
+
"åöáì": 1000,
|
| 1003 |
+
" ôïëåî": 1001,
|
| 1004 |
+
"éîäéãåóßôïß": 1002,
|
| 1005 |
+
"éîäéãåóßôïßòå": 1003,
|
| 1006 |
+
"éîäéãåóßôïßòåíïöå": 1004,
|
| 1007 |
+
"óïòôå": 1005,
|
| 1008 |
+
"óïòôåäß": 1006,
|
| 1009 |
+
"âòå": 1007,
|
| 1010 |
+
"âòåá": 1008,
|
| 1011 |
+
"âòåáë": 1009,
|
| 1012 |
+
"íáéî": 1010,
|
| 1013 |
+
"Ôòáéî": 1011,
|
| 1014 |
+
"Âùôå": 1012,
|
| 1015 |
+
"Ìåöåì": 1013,
|
| 1016 |
+
"éìå": 1014,
|
| 1017 |
+
"ìïáäß": 1015,
|
| 1018 |
+
"ãèåäõì": 1016,
|
| 1019 |
+
"åìóåº ": 1017,
|
| 1020 |
+
"äéóô®": 1018,
|
| 1021 |
+
"© òåôõòî ": 1019,
|
| 1022 |
+
"òáîë ½½ °": 1020,
|
| 1023 |
+
"ìïççåò®éîæï¨æ¢": 1021,
|
| 1024 |
+
"© ": 1022,
|
| 1025 |
+
"ãïíð": 1023,
|
| 1026 |
+
"±°": 1024,
|
| 1027 |
+
"¨áòçó®": 1025,
|
| 1028 |
+
"÷éôè ïðåî": 1026,
|
| 1029 |
+
"÷éôè ïðåî¨": 1027,
|
| 1030 |
+
"§© áó ": 1028,
|
| 1031 |
+
"§© áó æ": 1029,
|
| 1032 |
+
"ðìé": 1030,
|
| 1033 |
+
"¼ü": 1031,
|
| 1034 |
+
"ü¾": 1032,
|
| 1035 |
+
"Äáôáóåô¨": 1033,
|
| 1036 |
+
"îå÷ß÷ïòä": 1034,
|
| 1037 |
+
"ãïîôéî": 1035,
|
| 1038 |
+
"ãïîôéîõå": 1036,
|
| 1039 |
+
"äáôáÛ§": 1037,
|
| 1040 |
+
"åîãïäåä": 1038,
|
| 1041 |
+
"íéîß": 1039,
|
| 1042 |
+
"åôãè": 1040,
|
| 1043 |
+
"¢© ðòéîô¨¢": 1041,
|
| 1044 |
+
"åîãè": 1042,
|
| 1045 |
+
"åîãèíáò": 1043,
|
| 1046 |
+
"áììïã": 1044,
|
| 1047 |
+
"áììïãáôå": 1045,
|
| 1048 |
+
" ðáòáíåôåò": 1046,
|
| 1049 |
+
"äåãïäåò": 1047,
|
| 1050 |
+
"ôòáî": 1048,
|
| 1051 |
+
"ôòáîó": 1049,
|
| 1052 |
+
"çõ": 1050,
|
| 1053 |
+
"¬ äéí½": 1051,
|
| 1054 |
+
"áôåß": 1052,
|
| 1055 |
+
"¬ óåñßìåî": 1053,
|
| 1056 |
+
"åîãå": 1054,
|
| 1057 |
+
"º æìïáô ½ °": 1055,
|
| 1058 |
+
"ó½Æáìóå": 1056,
|
| 1059 |
+
"Ý ½ Îïîå": 1057,
|
| 1060 |
+
"´°": 1058,
|
| 1061 |
+
"¯ ±å¹": 1059,
|
| 1062 |
+
"©º éæ ": 1060,
|
| 1063 |
+
"ð®": 1061,
|
| 1064 |
+
"֏": 1062,
|
| 1065 |
+
"éÝ": 1063,
|
| 1066 |
+
"çòáä¨": 1064,
|
| 1067 |
+
"Çåîåò": 1065,
|
| 1068 |
+
"©º ": 1066,
|
| 1069 |
+
"ð ": 1067,
|
| 1070 |
+
"Ðáôè": 1068,
|
| 1071 |
+
"Ôåø": 1069,
|
| 1072 |
+
"ôïòãè®ãõäá®": 1070,
|
| 1073 |
+
"õóåß": 1071,
|
| 1074 |
+
"ôòáéî¨": 1072,
|
| 1075 |
+
"öáìß": 1073,
|
| 1076 |
+
"éôåòáô": 1074,
|
| 1077 |
+
"éôåòáôïò": 1075,
|
| 1078 |
+
"åðïãè": 1076,
|
| 1079 |
+
"¾½ ": 1077,
|
| 1080 |
+
"½äåöéãå": 1078,
|
| 1081 |
+
"§¬ ôùð彿ìïáô": 1079,
|
| 1082 |
+
"§¬ ôùð彿ìïáô¬ äåæáõìô½": 1080,
|
| 1083 |
+
"§¬ ôùðå½óôò": 1081,
|
| 1084 |
+
"ôïëåîéúåò ½ ": 1082,
|
| 1085 |
+
"®óðìé": 1083,
|
| 1086 |
+
"©Ý": 1084,
|
| 1087 |
+
"óïî": 1085,
|
| 1088 |
+
"öïãáâ ½ û": 1086,
|
| 1089 |
+
"óùíâ": 1087,
|
| 1090 |
+
"óùíâïì": 1088,
|
| 1091 |
+
"󮢢¢ ": 1089,
|
| 1092 |
+
"õòò": 1090,
|
| 1093 |
+
"ôåøô®": 1091,
|
| 1094 |
+
"ìåî¨óåìæ®öïãáâ": 1092,
|
| 1095 |
+
"é « ±": 1093,
|
| 1096 |
+
"óðåãéáìßôïëåîó": 1094,
|
| 1097 |
+
"èå ": 1095,
|
| 1098 |
+
"éîç®": 1096,
|
| 1099 |
+
"ðòéîô¨æ¢Üî": 1097,
|
| 1100 |
+
"òáîäïí": 1098,
|
| 1101 |
+
"óº": 1099,
|
| 1102 |
+
"ó÷éôè": 1100,
|
| 1103 |
+
"óôáôåßäéãô¨": 1101,
|
| 1104 |
+
"û§": 1102,
|
| 1105 |
+
"óååä": 1103,
|
| 1106 |
+
"áãôé": 1104,
|
| 1107 |
+
"áãôéöå": 1105,
|
| 1108 |
+
"ðòïíðôß": 1106,
|
| 1109 |
+
"ìù ": 1107,
|
| 1110 |
+
"ìáî": 1108,
|
| 1111 |
+
"ìáîçõ": 1109,
|
| 1112 |
+
"ìáîçõáçå": 1110,
|
| 1113 |
+
"¨© ": 1111,
|
| 1114 |
+
"±¬ ": 1112,
|
| 1115 |
+
"óéî": 1113,
|
| 1116 |
+
" òåôõòî ": 1114,
|
| 1117 |
+
"åîô ": 1115,
|
| 1118 |
+
" ½ îî®Ìéîåáò": 1116,
|
| 1119 |
+
" ½ îî®Ìéîåáò¨": 1117,
|
| 1120 |
+
"óåìæ®èåáäßäéí": 1118,
|
| 1121 |
+
"âéáó½Æáìóå": 1119,
|
| 1122 |
+
"© éæ ": 1120,
|
| 1123 |
+
" æïò ": 1121,
|
| 1124 |
+
"éì": 1122,
|
| 1125 |
+
"ðòå": 1123,
|
| 1126 |
+
"Ãïîæéç": 1124,
|
| 1127 |
+
"îßìáùåò": 1125,
|
| 1128 |
+
"ôéå": 1126,
|
| 1129 |
+
"âï": 1127,
|
| 1130 |
+
"éí": 1128,
|
| 1131 |
+
"ëåù": 1129,
|
| 1132 |
+
"éôé": 1130,
|
| 1133 |
+
"áìéúå": 1131,
|
| 1134 |
+
"û §": 1132,
|
| 1135 |
+
"ôïòãè®îïß": 1133,
|
| 1136 |
+
"ôïòãè®îïßçòáä¨": 1134,
|
| 1137 |
+
"Û°Ý": 1135,
|
| 1138 |
+
" ": 1136,
|
| 1139 |
+
"öïãáâßóéúå½": 1137,
|
| 1140 |
+
"Ôåóô": 1138,
|
| 1141 |
+
"ôòáéîéîç ": 1139,
|
| 1142 |
+
"ÄÐ": 1140,
|
| 1143 |
+
"õôé": 1141,
|
| 1144 |
+
"¬": 1142,
|
| 1145 |
+
"ìòßó": 1143,
|
| 1146 |
+
"ìòßóãèåäõì": 1144,
|
| 1147 |
+
"ôòáéîéîç": 1145,
|
| 1148 |
+
"®çåô¨§": 1146,
|
| 1149 |
+
"© åìóåº ": 1147,
|
| 1150 |
+
"õóåßáíð": 1148,
|
| 1151 |
+
"âáôãèÛ§": 1149,
|
| 1152 |
+
"¨äåöéãå": 1150,
|
| 1153 |
+
"áòçó": 1151,
|
| 1154 |
+
"éìå ": 1152,
|
| 1155 |
+
"Éôåò": 1153,
|
| 1156 |
+
"ûáòçó®": 1154,
|
| 1157 |
+
"éæ ��òçó®òáîë ½½ °": 1155,
|
| 1158 |
+
"Óáöå": 1156,
|
| 1159 |
+
"âáôãèå": 1157,
|
| 1160 |
+
" éæ ": 1158,
|
| 1161 |
+
"òåäõ": 1159,
|
| 1162 |
+
"òåäõãå": 1160,
|
| 1163 |
+
"ìåáò": 1161,
|
| 1164 |
+
"íõð": 1162,
|
| 1165 |
+
"§¬ ôùðå½óôò¬ äåæáõìô½": 1163,
|
| 1166 |
+
"ïæ ": 1164,
|
| 1167 |
+
"®óðìéô¨": 1165,
|
| 1168 |
+
"÷ïòä ": 1166,
|
| 1169 |
+
" ôåøô ": 1167,
|
| 1170 |
+
"óº ÌéóôÛ": 1168,
|
| 1171 |
+
"æïòí": 1169,
|
| 1172 |
+
"ãèáò": 1170,
|
| 1173 |
+
" ôï ": 1171,
|
| 1174 |
+
"§§": 1172,
|
| 1175 |
+
"ä û": 1173,
|
| 1176 |
+
"ôåøô© ": 1174,
|
| 1177 |
+
"§º óåìæ®": 1175,
|
| 1178 |
+
"ôï û": 1176,
|
| 1179 |
+
"åîãïäå¨": 1177,
|
| 1180 |
+
" ½ Û": 1178,
|
| 1181 |
+
"áôôåîôéïîßíáóë": 1179,
|
| 1182 |
+
"âùôåßåî": 1180,
|
| 1183 |
+
"âùôåßåîãïäåò": 1181,
|
| 1184 |
+
"óï": 1182,
|
| 1185 |
+
"æåôãè": 1183,
|
| 1186 |
+
"ó ½ ôïòãè®ôåîóïò": 1184,
|
| 1187 |
+
"ìåî¨óåìæ®": 1185,
|
| 1188 |
+
"ßéäø": 1186,
|
| 1189 |
+
"֔": 1187,
|
| 1190 |
+
"ßæéìå": 1188,
|
| 1191 |
+
"óôáôåßäéãô": 1189,
|
| 1192 |
+
"¬ §": 1190,
|
| 1193 |
+
"ôòáéîáâìå": 1191,
|
| 1194 |
+
"åîãèíáòë": 1192,
|
| 1195 |
+
"óôáòôó÷éôè": 1193,
|
| 1196 |
+
"ïõôðõôßéä": 1194,
|
| 1197 |
+
"îî®Í": 1195,
|
| 1198 |
+
"îî®Íï": 1196,
|
| 1199 |
+
"îî®Íïäõìå": 1197,
|
| 1200 |
+
"óõð": 1198,
|
| 1201 |
+
"æïò÷áòä¨": 1199,
|
| 1202 |
+
"©º òåôõòî ": 1200,
|
| 1203 |
+
"֬": 1201,
|
| 1204 |
+
"ôáò": 1202,
|
| 1205 |
+
"ñ¬ ë": 1203,
|
| 1206 |
+
"Ûº": 1204,
|
| 1207 |
+
"º éîô¬ ": 1205,
|
| 1208 |
+
"© óåìæ®": 1206,
|
| 1209 |
+
"ðïõ": 1207,
|
| 1210 |
+
"ðïõô¨": 1208,
|
| 1211 |
+
"¬ óåìæ®": 1209,
|
| 1212 |
+
"éó îïô ": 1210,
|
| 1213 |
+
"éó îïô Îïîå": 1211,
|
| 1214 |
+
"¬ ö": 1212,
|
| 1215 |
+
"ßéîôåò": 1213,
|
| 1216 |
+
" ÷éôè ": 1214,
|
| 1217 |
+
"©¬ ": 1215,
|
| 1218 |
+
"º éîô ½ ³²": 1216,
|
| 1219 |
+
"ðòïð": 1217,
|
| 1220 |
+
"îßðáòáí": 1218,
|
| 1221 |
+
" £ ": 1219,
|
| 1222 |
+
"ó «½ ": 1220,
|
| 1223 |
+
"ó© ": 1221,
|
| 1224 |
+
"© ðòéîô¨æ¢": 1222,
|
| 1225 |
+
"éôéáìéúå": 1223,
|
| 1226 |
+
"½°®°": 1224,
|
| 1227 |
+
"© éæ ": 1225,
|
| 1228 |
+
" éî óåìæ®": 1226,
|
| 1229 |
+
"¨© ": 1227,
|
| 1230 |
+
"éîðõôßéäó®": 1228,
|
| 1231 |
+
"äåø": 1229,
|
| 1232 |
+
"âáôãèßóéúå": 1230,
|
| 1233 |
+
"ïõôðõôóÛ§": 1231,
|
| 1234 |
+
"Çåô ": 1232,
|
| 1235 |
+
"®ôïìéóô¨": 1233,
|
| 1236 |
+
"®ôïìéóô¨©": 1234,
|
| 1237 |
+
"îåø": 1235,
|
| 1238 |
+
" íïäåì": 1236,
|
| 1239 |
+
"Ôòáéîéîç ": 1237,
|
| 1240 |
+
"æòïí ôïòãè®": 1238,
|
| 1241 |
+
"ì ": 1239,
|
| 1242 |
+
"ðß": 1240,
|
| 1243 |
+
"ÂùôåÌåöåì": 1241,
|
| 1244 |
+
"ÂùôåÌåöåìÂÐÅÔïëåîéúåò": 1242,
|
| 1245 |
+
"òïî": 1243,
|
| 1246 |
+
"ìïãáìß": 1244,
|
| 1247 |
+
"ìïãáìßòáîë": 1245,
|
| 1248 |
+
"äáôáß": 1246,
|
| 1249 |
+
"íåí": 1247,
|
| 1250 |
+
"íåíïò": 1248,
|
| 1251 |
+
"ôòáéîßìïáäåò": 1249,
|
| 1252 |
+
"óôåð ": 1250,
|
| 1253 |
+
"óÛ°Ý": 1251,
|
| 1254 |
+
"öáìßìïóó": 1252,
|
| 1255 |
+
"¬ ": 1253,
|
| 1256 |
+
"ý¢© ": 1254,
|
| 1257 |
+
"ïî ": 1255,
|
| 1258 |
+
"ßìïóó ½ ": 1256,
|
| 1259 |
+
"îõíßâáôãèå": 1257,
|
| 1260 |
+
"÷éôè ôïòãè®îïßçòáä¨": 1258,
|
| 1261 |
+
"ó§¬ ôùðå½éîô¬ äåæáõìô½": 1259,
|
| 1262 |
+
"ìåáòî": 1260,
|
| 1263 |
+
"÷áòíõð": 1261,
|
| 1264 |
+
"°®": 1262,
|
| 1265 |
+
"äéò": 1263,
|
| 1266 |
+
"áãôéïî": 1264,
|
| 1267 |
+
"óáíðìåß": 1265,
|
| 1268 |
+
"¼üåî": 1266,
|
| 1269 |
+
"¼üåîäï": 1267,
|
| 1270 |
+
"¼üåîäïæ": 1268,
|
| 1271 |
+
"¼üåîäïæôåøô": 1269,
|
| 1272 |
+
"¼üåîäïæôåøôü¾": 1270,
|
| 1273 |
+
"óôòéð": 1271,
|
| 1274 |
+
"óôòéð¨": 1272,
|
| 1275 |
+
"¢© ": 1273,
|
| 1276 |
+
"Åî": 1274,
|
| 1277 |
+
"ðáäßôïëåîßéä": 1275,
|
| 1278 |
+
"ãõòò": 1276,
|
| 1279 |
+
"ò§": 1277,
|
| 1280 |
+
" æïò ": 1278,
|
| 1281 |
+
"ôåøôº óôò": 1279,
|
| 1282 |
+
"óôòÝ": 1280,
|
| 1283 |
+
"ÂÐÅ ": 1281,
|
| 1284 |
+
"©ý": 1282,
|
| 1285 |
+
"ó®çå": 1283,
|
| 1286 |
+
" ½ óåìæ®ß": 1284,
|
| 1287 |
+
"äßôïëåî": 1285,
|
| 1288 |
+
"äõíð": 1286,
|
| 1289 |
+
" ½ óåìæ®óðåãéáìßôïëåî": 1287,
|
| 1290 |
+
"®áððåîä¨": 1288,
|
| 1291 |
+
"âùôåßäåãïäåò": 1289,
|
| 1292 |
+
"ïõôðõôßðáôè": 1290,
|
| 1293 |
+
"ôèå ": 1291,
|
| 1294 |
+
"󮢢¢ ": 1292,
|
| 1295 |
+
"ôïëåîéúåò®äåãïäå": 1293,
|
| 1296 |
+
"õòì": 1294,
|
| 1297 |
+
"§º §": 1295,
|
| 1298 |
+
"§¬ §": 1296,
|
| 1299 |
+
"òåóð": 1297,
|
| 1300 |
+
"òåóðïî": 1298,
|
| 1301 |
+
"òåóðïîóå": 1299,
|
| 1302 |
+
"æéìåî": 1300,
|
| 1303 |
+
"æéìåîáíå": 1301,
|
| 1304 |
+
"ðòéîô¨¢Üî": 1302,
|
| 1305 |
+
"ìïççéîç®": 1303,
|
| 1306 |
+
"Éîôåò": 1304,
|
| 1307 |
+
"¢© ðòéîô¨¢ ": 1305,
|
| 1308 |
+
"¢© ðòéîô¨¢ ¯": 1306,
|
| 1309 |
+
"ðòïíðô ": 1307,
|
| 1310 |
+
"¢© ": 1308,
|
| 1311 |
+
"¢¢¢": 1309,
|
| 1312 |
+
"¢¢¢ÖéãÁÉ ": 1310,
|
| 1313 |
+
"æïòíåò": 1311,
|
| 1314 |
+
"¢¢¢": 1312,
|
| 1315 |
+
"¢¢¢éíðïòô ": 1313,
|
| 1316 |
+
"ðéîç ": 1314,
|
| 1317 |
+
"õðìå": 1315,
|
| 1318 |
+
"ãôéïî": 1316,
|
| 1319 |
+
"Îïòí": 1317,
|
| 1320 |
+
"îî®Íïäõì婺 ¢¢¢": 1318,
|
| 1321 |
+
"Òï": 1319,
|
| 1322 |
+
"®¢¢¢ äåæ ßßéîéôßߨóåìæ¬ ": 1320,
|
| 1323 |
+
"åð": 1321,
|
| 1324 |
+
"©º óõð": 1322,
|
| 1325 |
+
"©º ó��ðåò¨": 1323,
|
| 1326 |
+
"©º óõðåò¨©®": 1324,
|
| 1327 |
+
"©º óõðåò¨©®ßßéîéôßߨ": 1325,
|
| 1328 |
+
"©º óõðåò¨©®ßßéîéôßߨ© óåìæ®": 1326,
|
| 1329 |
+
"Ðáò": 1327,
|
| 1330 |
+
"óõí¨": 1328,
|
| 1331 |
+
"åòù": 1329,
|
| 1332 |
+
"¬ âéáó½Æáìóå": 1330,
|
| 1333 |
+
"º ÏðôéïîáìÛ": 1331,
|
| 1334 |
+
"âó": 1332,
|
| 1335 |
+
"âóú": 1333,
|
| 1336 |
+
"âóú¬ óåñßìåî": 1334,
|
| 1337 |
+
"öéå": 1335,
|
| 1338 |
+
"öéå÷¨": 1336,
|
| 1339 |
+
"çòï": 1337,
|
| 1340 |
+
"íõì": 1338,
|
| 1341 |
+
"íáø¨": 1339,
|
| 1342 |
+
"£ Á": 1340,
|
| 1343 |
+
"ÖéãÁÉÃïîæéç": 1341,
|
| 1344 |
+
"ß÷åéçèô": 1342,
|
| 1345 |
+
"âïïì": 1343,
|
| 1346 |
+
" ª óåìæ®": 1344,
|
| 1347 |
+
"ÖéãÁÉÍïäåì": 1345,
|
| 1348 |
+
"ßåíâåääéîç": 1346,
|
| 1349 |
+
"ìù": 1347,
|
| 1350 |
+
"ôïôáìßðáòáí": 1348,
|
| 1351 |
+
"íïäõìå®": 1349,
|
| 1352 |
+
"ðáòáíåôåòó¨": 1350,
|
| 1353 |
+
"ôáòç": 1351,
|
| 1354 |
+
"ôáòçåô": 1352,
|
| 1355 |
+
"õó": 1353,
|
| 1356 |
+
"å¨": 1354,
|
| 1357 |
+
"ìïóó": 1355,
|
| 1358 |
+
"ôïòå": 1356,
|
| 1359 |
+
"Ôïð": 1357,
|
| 1360 |
+
"Ôïð": 1358,
|
| 1361 |
+
"óïòôåäßéîäéãåóßôïßòåíïöå": 1359,
|
| 1362 |
+
"çåîåòáôåä": 1360,
|
| 1363 |
+
"ãòåáôåß": 1361,
|
| 1364 |
+
"öéãá": 1362,
|
| 1365 |
+
"öéãáé": 1363,
|
| 1366 |
+
"öéãáéß": 1364,
|
| 1367 |
+
"â¨": 1365,
|
| 1368 |
+
"éæ ": 1366,
|
| 1369 |
+
"éæ ßß": 1367,
|
| 1370 |
+
"éæ ßßîáíå": 1368,
|
| 1371 |
+
"éæ ßßîáíåßß": 1369,
|
| 1372 |
+
"éæ ßßîáíåßß ½½ ": 1370,
|
| 1373 |
+
"ßßíáéî": 1371,
|
| 1374 |
+
"ßßíáéîßß": 1372,
|
| 1375 |
+
"Ôåóô ": 1373,
|
| 1376 |
+
"©": 1374,
|
| 1377 |
+
"©<": 1375,
|
| 1378 |
+
"©<|": 1376,
|
| 1379 |
+
"©<|e": 1377,
|
| 1380 |
+
"©<|en": 1378,
|
| 1381 |
+
"©<|end": 1379,
|
| 1382 |
+
"©<|endo": 1380,
|
| 1383 |
+
"©<|endof": 1381,
|
| 1384 |
+
"©<|endoft": 1382,
|
| 1385 |
+
"©<|endofte": 1383,
|
| 1386 |
+
"©<|endoftex": 1384,
|
| 1387 |
+
"©<|endoftext": 1385,
|
| 1388 |
+
"©<|endoftext|": 1386,
|
| 1389 |
+
"©<|endoftext|>": 1387,
|
| 1390 |
+
"äéóôòéâõôåä ": 1388,
|
| 1391 |
+
"äéóôòéâõôåä": 1389,
|
| 1392 |
+
"õôéì": 1390,
|
| 1393 |
+
"×éëéðåäéá": 1391,
|
| 1394 |
+
"óáöåß": 1392,
|
| 1395 |
+
"ìòßóãèåäõìåò": 1393,
|
| 1396 |
+
"§ éî ": 1394,
|
| 1397 |
+
"ãåó": 1395,
|
| 1398 |
+
"ãìåáî": 1396,
|
| 1399 |
+
"óè": 1397,
|
| 1400 |
+
"ïðôéíéúåò¬ ": 1398,
|
| 1401 |
+
"ó ½ âáôãèÛ§": 1399,
|
| 1402 |
+
"®ôï¨äåöéãå": 1400,
|
| 1403 |
+
" íïäåì¬ ": 1401,
|
| 1404 |
+
"¬©º ¢¢¢": 1402,
|
| 1405 |
+
"÷èéìå ": 1403,
|
| 1406 |
+
"«½ ±": 1404,
|
| 1407 |
+
"¥ ": 1405,
|
| 1408 |
+
"åý¢": 1406,
|
| 1409 |
+
"Öáì": 1407,
|
| 1410 |
+
"Óáöå ": 1408,
|
| 1411 |
+
"ãèåãëðïéîô¨": 1409,
|
| 1412 |
+
"áòçó®ïõôðõôßäéò": 1410,
|
| 1413 |
+
"öáìéä": 1411,
|
| 1414 |
+
"¬ äåöéãå½äåöéãå": 1412,
|
| 1415 |
+
"§¬ ôùðå½óôò¬ äåæáõìô½§": 1413,
|
| 1416 |
+
"±°°°": 1414,
|
| 1417 |
+
"§ ": 1415,
|
| 1418 |
+
"Äå": 1416,
|
| 1419 |
+
"áöá": 1417,
|
| 1420 |
+
"áöáé": 1418,
|
| 1421 |
+
"áöáéìáâ": 1419,
|
| 1422 |
+
"ïó®ðáôè": 1420,
|
| 1423 |
+
"ôïëåîéúåòßðáôè": 1421,
|
| 1424 |
+
"Ôïëåîéúåò ": 1422,
|
| 1425 |
+
"¢© ": 1423,
|
| 1426 |
+
"ìïççåò®éîæï¨¢": 1424,
|
| 1427 |
+
"íïäåì ½ ": 1425,
|
| 1428 |
+
"äáôáóåô ½ ": 1426,
|
| 1429 |
+
"êóïî": 1427,
|
| 1430 |
+
"§º °": 1428,
|
| 1431 |
+
"¼¯": 1429,
|
| 1432 |
+
" ½ ° óåìæ®": 1430,
|
| 1433 |
+
"®éôåíó¨": 1431,
|
| 1434 |
+
" äåæ ß": 1432,
|
| 1435 |
+
"âéç": 1433,
|
| 1436 |
+
"âéçòá": 1434,
|
| 1437 |
+
"ðáôôåò": 1435,
|
| 1438 |
+
"ðáôôåòî": 1436,
|
| 1439 |
+
"öïãáâÛ": 1437,
|
| 1440 |
+
"ôåøôº óôò© ¾ ": 1438,
|
| 1441 |
+
"ü§": 1439,
|
| 1442 |
+
"®óðìéô¨©": 1440,
|
| 1443 |
+
" ½ Ôòõå": 1441,
|
| 1444 |
+
"ÉÄ": 1442,
|
| 1445 |
+
"ìáãå": 1443,
|
| 1446 |
+
"ó ½ äáôáÛ§": 1444,
|
| 1447 |
+
"åìóåº ": 1445,
|
| 1448 |
+
"ìåöåì": 1446,
|
| 1449 |
+
"²µ": 1447,
|
| 1450 |
+
"²µ¶": 1448,
|
| 1451 |
+
"õîéã": 1449,
|
| 1452 |
+
"õîéãïäå": 1450,
|
| 1453 |
+
"÷ïòäÛ": 1451,
|
| 1454 |
+
"۱ݩ ": 1452,
|
| 1455 |
+
" ½ ¢": 1453,
|
| 1456 |
+
"ôåóô": 1454,
|
| 1457 |
+
"ôïëåîéúåò®åîãïäå¨": 1455,
|
| 1458 |
+
"ßìåîçôè": 1456,
|
| 1459 |
+
"éîðõôßéäó ½ ôïòãè®ôåîóïò": 1457,
|
| 1460 |
+
"ºÝ": 1458,
|
| 1461 |
+
"÷îìïáä": 1459,
|
| 1462 |
+
"ó ": 1460,
|
| 1463 |
+
"Èáîä": 1461,
|
| 1464 |
+
"äåãáùßðáòáí": 1462,
|
| 1465 |
+
"îõíßðáòáíó ": 1463,
|
| 1466 |
+
"½¢ ª ¶°": 1464,
|
| 1467 |
+
"§º": 1465,
|
| 1468 |
+
"ðòéîô¨¢": 1466,
|
| 1469 |
+
"Óåô ": 1467,
|
| 1470 |
+
"âåîãèíáòë": 1468,
|
| 1471 |
+
"äßç": 1469,
|
| 1472 |
+
"äßçâ": 1470,
|
| 1473 |
+
"çåîåòáôéïî ": 1471,
|
| 1474 |
+
"ôïð": 1472,
|
| 1475 |
+
"ãïîôéîõå éæ ": 1473,
|
| 1476 |
+
"íáøßîå÷ßôïëåî": 1474,
|
| 1477 |
+
"íáøßîå÷ßôïëåîó½": 1475,
|
| 1478 |
+
"çåîåòáôåäßôåøô ½ ": 1476,
|
| 1479 |
+
"óÛ°Ý®ôïìéóô¨©": 1477,
|
| 1480 |
+
"ðòïíðôßôåøô": 1478,
|
| 1481 |
+
"¬ èåìð½§": 1479,
|
| 1482 |
+
"Íïäåì ": 1480,
|
| 1483 |
+
"Â ðáòáíåôåò": 1481,
|
| 1484 |
+
"íáôè": 1482,
|
| 1485 |
+
"ôïòãè®îî®": 1483,
|
| 1486 |
+
"ÒÍ": 1484,
|
| 1487 |
+
"ÒÍÓ": 1485,
|
| 1488 |
+
"¨îî®Íïäõì婺 ¢¢¢": 1486,
|
| 1489 |
+
"Åíâåääéîç": 1487,
|
| 1490 |
+
"²¬ ": 1488,
|
| 1491 |
+
"âáóå": 1489,
|
| 1492 |
+
"¢¬ ": 1490,
|
| 1493 |
+
"åíâ": 1491,
|
| 1494 |
+
"ãáô": 1492,
|
| 1495 |
+
"¬ ºÝ": 1493,
|
| 1496 |
+
"© òåôõòî ": 1494,
|
| 1497 |
+
"©ãìáóó ": 1495,
|
| 1498 |
+
"ôôåî": 1496,
|
| 1499 |
+
"© æïò ": 1497,
|
| 1500 |
+
"îßèåáäó": 1498,
|
| 1501 |
+
"îßëößèåáäó": 1499,
|
| 1502 |
+
" ½ îî®Ìéîåáò¨äéí¬ ": 1500,
|
| 1503 |
+
"ó ª ": 1501,
|
| 1504 |
+
"¬ âéáó½Æáìóå© óåìæ®": 1502,
|
| 1505 |
+
"Ý ½ Îïîå¬ ": 1503,
|
| 1506 |
+
"®óèáðå": 1504,
|
| 1507 |
+
"ôòáîóðï": 1505,
|
| 1508 |
+
"ôòáîóðïóå": 1506,
|
| 1509 |
+
"ôòáîóðïóå¨": 1507,
|
| 1510 |
+
"ïòå": 1508,
|
| 1511 |
+
"© òåôõòî ": 1509,
|
| 1512 |
+
"öïãáâßóéúåº éîô ½ ³²": 1510,
|
| 1513 |
+
"öïãáâßóéúåº éîô ½ ³²°°°": 1511,
|
| 1514 |
+
"öïãáâßóéúå ½ ": 1512,
|
| 1515 |
+
"ãïõîô": 1513,
|
| 1516 |
+
" ª óåìæ®äéí": 1514,
|
| 1517 |
+
"ôá": 1515,
|
| 1518 |
+
"ó æïò ": 1516,
|
| 1519 |
+
"¨ ": 1517,
|
| 1520 |
+
"¯ ±å¹º®²æý": 1518,
|
| 1521 |
+
"ó¨óåìæ¬ ": 1519,
|
| 1522 |
+
"éîðõôßéäó®óèáðå": 1520,
|
| 1523 |
+
"õîó": 1521,
|
| 1524 |
+
"æïò é¬ ": 1522,
|
| 1525 |
+
" éî åî": 1523,
|
| 1526 |
+
" éî åîõí": 1524,
|
| 1527 |
+
" éî åîõíåò": 1525,
|
| 1528 |
+
" éî åîõíåòáôå": 1526,
|
| 1529 |
+
"ìïóó ½ ": 1527,
|
| 1530 |
+
"º éîô ½ µ": 1528,
|
| 1531 |
+
"º æìïáô ½ °®": 1529,
|
| 1532 |
+
"çòåó": 1530,
|
| 1533 |
+
"ïõôðõôó ½ ": 1531,
|
| 1534 |
+
"¨éîðõôßéäó¬ ": 1532,
|
| 1535 |
+
"ôéïî ðåîáìôù": 1533,
|
| 1536 |
+
"ìïçéôó¬ ": 1534,
|
| 1537 |
+
"îõ": 1535,
|
| 1538 |
+
"ßóáíð": 1536,
|
| 1539 |
+
"ãòåáôåßöéãáéß": 1537,
|
| 1540 |
+
"ãòåáôåßöéãáéßµ": 1538,
|
| 1541 |
+
"±²": 1539,
|
| 1542 |
+
"ý¢© ðòéîô¨æ¢": 1540,
|
| 1543 |
+
"ðô": 1541,
|
| 1544 |
+
"÷òá": 1542,
|
| 1545 |
+
"Óáíð": 1543,
|
| 1546 |
+
"Æéìå": 1544,
|
| 1547 |
+
"çåôßìïççåò": 1545,
|
| 1548 |
+
"®¢¢¢ éæ ": 1546,
|
| 1549 |
+
"ïó®åî": 1547,
|
| 1550 |
+
"ïó®åîöé": 1548,
|
| 1551 |
+
"ïó®åîöéòïî": 1549,
|
| 1552 |
+
" ½ éîô¨": 1550,
|
| 1553 |
+
"ðòïãåó": 1551,
|
| 1554 |
+
"çòïõð": 1552,
|
| 1555 |
+
"äáôá ": 1553,
|
| 1556 |
+
"óáíðìåò": 1554,
|
| 1557 |
+
"ìåò¨": 1555,
|
| 1558 |
+
"¨íïäåì¬ ": 1556,
|
| 1559 |
+
"¬ äåöéãå": 1557,
|
| 1560 |
+
"Íá": 1558,
|
| 1561 |
+
"®óåô": 1559,
|
| 1562 |
+
" û": 1560,
|
| 1563 |
+
"ü ": 1561,
|
| 1564 |
+
"æý ": 1562,
|
| 1565 |
+
" ": 1563,
|
| 1566 |
+
"óº ": 1564,
|
| 1567 |
+
"ßôéíå": 1565,
|
| 1568 |
+
"ìáâåìó": 1566,
|
| 1569 |
+
"öåòáçå": 1567,
|
| 1570 |
+
"áìì ": 1568,
|
| 1571 |
+
"§¬ ôùð彿ìïáô¬ äåæáõìô½°®": 1569,
|
| 1572 |
+
"âåô": 1570,
|
| 1573 |
+
"âåôá": 1571,
|
| 1574 |
+
"§¬ áãôéïî": 1572,
|
| 1575 |
+
"§¬ áãôéïî½§": 1573,
|
| 1576 |
+
"§¬ áãôéïî½§ó": 1574,
|
| 1577 |
+
"§¬ áãôéïî½§óôïòå": 1575,
|
| 1578 |
+
"§¬ áãôéïî½§óôïòåß": 1576,
|
| 1579 |
+
"§¬ áãôéïî½§óôïòåßô": 1577,
|
| 1580 |
+
"§¬ áãôéïî½§óôïòåßôòõå": 1578,
|
| 1581 |
+
"§¬ áãôéïî½§óôïòåßôòõ姬 ": 1579,
|
| 1582 |
+
"ãïíðéìå": 1580,
|
| 1583 |
+
"ðáòåîô": 1581,
|
| 1584 |
+
"¬ åøéóô": 1582,
|
| 1585 |
+
"¬ åøéóôß": 1583,
|
| 1586 |
+
"¬ åøéóôßï": 1584,
|
| 1587 |
+
"¬ åøéóôßïë": 1585,
|
| 1588 |
+
"¬ åøéóôßïë½Ôòõå": 1586,
|
| 1589 |
+
"¨æ": 1587,
|
| 1590 |
+
"áöáéìáâìå": 1588,
|
| 1591 |
+
"óáíðìå ": 1589,
|
| 1592 |
+
"îõíßáòôéãìå": 1590,
|
| 1593 |
+
"¬ §ò": 1591,
|
| 1594 |
+
"éî ôåøô": 1592,
|
| 1595 |
+
"Ãï": 1593,
|
| 1596 |
+
"òåáí": 1594,
|
| 1597 |
+
"×éëéðåäéáÄáôáóåô¨": 1595,
|
| 1598 |
+
"ó¨íïäåì": 1596,
|
| 1599 |
+
"æéî": 1597,
|
| 1600 |
+
"ðéã": 1598,
|
| 1601 |
+
"ðéãë": 1599,
|
| 1602 |
+
"ðéãëìå": 1600,
|
| 1603 |
+
"äåæáõìôäé": 1601,
|
| 1604 |
+
"Äéã": 1602,
|
| 1605 |
+
"Äéãô": 1603,
|
| 1606 |
+
"óôáô": 1604,
|
| 1607 |
+
" ½ òå": 1605,
|
| 1608 |
+
" §®êïéî¨": 1606,
|
| 1609 |
+
"¨¿": 1607,
|
| 1610 |
+
"¡Ü": 1608,
|
| 1611 |
+
"¡ÜÓ": 1609,
|
| 1612 |
+
"¡ÜÓ©": 1610,
|
| 1613 |
+
"÷ïòä éî ": 1611,
|
| 1614 |
+
"®óõ": 1612,
|
| 1615 |
+
"ßôïëåîéúå": 1613,
|
| 1616 |
+
"áòù ": 1614,
|
| 1617 |
+
"ó®éôåíó¨": 1615,
|
| 1618 |
+
"Ãïîöåò": 1616,
|
| 1619 |
+
"Ãïîöåòô ": 1617,
|
| 1620 |
+
"§§®êïéî¨": 1618,
|
| 1621 |
+
"¨é « ±": 1619,
|
| 1622 |
+
"ðòéîô¨æ¢ ": 1620,
|
| 1623 |
+
"ý¢© äåæ ": 1621,
|
| 1624 |
+
"éîôÝ": 1622,
|
| 1625 |
+
" æïò ": 1623,
|
| 1626 |
+
"óôòº ¢¢¢": 1624,
|
| 1627 |
+
"òåöåò": 1625,
|
| 1628 |
+
"ðáôèº óôò": 1626,
|
| 1629 |
+
"¬ §÷": 1627,
|
| 1630 |
+
"â§© áó æ": 1628,
|
| 1631 |
+
" ½ óåìæ®óðåãéáìßôïëåîóÛ§": 1629,
|
| 1632 |
+
"ìïá": 1630,
|
| 1633 |
+
"ìïáäå": 1631,
|
| 1634 |
+
"ìåî¨óåñ": 1632,
|
| 1635 |
+
" ½ ÛÝ": 1633,
|
| 1636 |
+
" äåæ ßß": 1634,
|
| 1637 |
+
"ßߨóåìæ": 1635,
|
| 1638 |
+
"ìåöåì": 1636,
|
| 1639 |
+
"ìåöåì ": 1637,
|
| 1640 |
+
"¸": 1638,
|
| 1641 |
+
"âùôåßôåøô": 1639,
|
| 1642 |
+
"¢¬ ": 1640,
|
| 1643 |
+
"¢¬ ¢": 1641,
|
| 1644 |
+
"éîå ": 1642,
|
| 1645 |
+
"¢ ": 1643,
|
| 1646 |
+
"òåñõå": 1644,
|
| 1647 |
+
" ×éëéðåäéá ": 1645,
|
| 1648 |
+
"âáóåß": 1646,
|
| 1649 |
+
"âáóåßõòì": 1647,
|
| 1650 |
+
"ßäáôá": 1648,
|
| 1651 |
+
"Åø": 1649,
|
| 1652 |
+
"ôéô": 1650,
|
| 1653 |
+
"ôéôìå": 1651,
|
| 1654 |
+
"¨ãèõîë": 1652,
|
| 1655 |
+
"äáôáóåôßéäø": 1653,
|
| 1656 |
+
"äáôáßäéò": 1654,
|
| 1657 |
+
"÷éëé": 1655,
|
| 1658 |
+
"×å": 1656,
|
| 1659 |
+
"Ôåøô ": 1657,
|
| 1660 |
+
"òåñõ": 1658,
|
| 1661 |
+
"òåñõé": 1659,
|
| 1662 |
+
"òåñõéòå": 1660,
|
| 1663 |
+
"ãïîóï": 1661,
|
| 1664 |
+
"Æïòí": 1662,
|
| 1665 |
+
"ìïáäßóôáôåßäéãô¨": 1663,
|
| 1666 |
+
" « ¢": 1664,
|
| 1667 |
+
"¼´°": 1665,
|
| 1668 |
+
"¼´°ý ": 1666,
|
| 1669 |
+
"¾±": 1667,
|
| 1670 |
+
"¾±µ": 1668,
|
| 1671 |
+
"¨óååä": 1669,
|
| 1672 |
+
"òåóåò": 1670,
|
| 1673 |
+
"òåóåòöå": 1671,
|
| 1674 |
+
"Éîôåòáãôéöå": 1672,
|
| 1675 |
+
"ãïîôéîõå éæ ðòïíðô": 1673,
|
| 1676 |
+
"ãïîôéîõå éæ ðòïíðô®": 1674,
|
| 1677 |
+
"ãïîôéîõå éæ ðòïíðô®óôáòôó÷éôè": 1675,
|
| 1678 |
+
"ãïîôéîõå éæ ðòïíðô®óôáòôó÷éô訧": 1676,
|
| 1679 |
+
"ãïîôéîõå éæ ðòïíðô®óôáòôó÷éô訧¯": 1677,
|
| 1680 |
+
" §©º ": 1678,
|
| 1681 |
+
" §©º ôòù": 1679,
|
| 1682 |
+
" §©º ôòùº ": 1680,
|
| 1683 |
+
" §©º ôòùº óåôôéîçóÛ§": 1681,
|
| 1684 |
+
"ðòïíðô®óðìéô¨©": 1682,
|
| 1685 |
+
"ðòïíðô®óðìéô¨©Û±Ý© ": 1683,
|
| 1686 |
+
"ðòïíðô®óðìéô¨©Û±Ý© ðòéîô¨æ¢": 1684,
|
| 1687 |
+
"óåô ôï û": 1685,
|
| 1688 |
+
"óåô ôï ûóåôôéîçóÛ§": 1686,
|
| 1689 |
+
"§Ýý¢© ": 1687,
|
| 1690 |
+
"§Ýý¢© åøãåðô ": 1688,
|
| 1691 |
+
"§Ýý¢© åøãåðô ¨": 1689,
|
| 1692 |
+
"§Ýý¢© åøãåðô ¨Öáì": 1690,
|
| 1693 |
+
"§Ýý¢© åøãåðô ¨Öáìõå": 1691,
|
| 1694 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò": 1692,
|
| 1695 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ": 1693,
|
| 1696 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ Éî": 1694,
|
| 1697 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ Éîäåø": 1695,
|
| 1698 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò": 1696,
|
| 1699 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©º ": 1697,
|
| 1700 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©º ðòéîô¨¢": 1698,
|
| 1701 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©º ðòéîô¨¢Éî": 1699,
|
| 1702 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©º ðòéîô¨¢Éîöáìéä": 1700,
|
| 1703 |
+
"§Ýý¢© åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©º ðòéîô¨¢Éîöáìéä ": 1701,
|
| 1704 |
+
"öáìõ墩 ": 1702,
|
| 1705 |
+
"íïäåì®çåîåòáôå": 1703,
|
| 1706 |
+
"§Ý¬ ": 1704,
|
| 1707 |
+
"çåîåòáôåäßôåøô": 1705,
|
| 1708 |
+
"îõíßôïëåîó": 1706,
|
| 1709 |
+
"ôòáîóæïòíåò": 1707,
|
| 1710 |
+
"æòïí ôù": 1708,
|
| 1711 |
+
"æòïí ôùðéîç ": 1709,
|
| 1712 |
+
"æòïí ôùðéîç éíðïòô ": 1710,
|
| 1713 |
+
"éíðïòô ôïòãè": 1711,
|
| 1714 |
+
"ÒÍÓÎïòí": 1712,
|
| 1715 |
+
"å ": 1713,
|
| 1716 |
+
"áìéú": 1714,
|
| 1717 |
+
"º éîô¬ ": 1715,
|
| 1718 |
+
"ôïòãè®ïîå": 1716,
|
| 1719 |
+
"²©®": 1717,
|
| 1720 |
+
"© « ": 1718,
|
| 1721 |
+
"¸±": 1719,
|
| 1722 |
+
"¸±¹": 1720,
|
| 1723 |
+
"º æìïáô ½ ": 1721,
|
| 1724 |
+
" ½ äéí": 1722,
|
| 1725 |
+
"éîö": 1723,
|
| 1726 |
+
"éîöß": 1724,
|
| 1727 |
+
"éîößæòåñ": 1725,
|
| 1728 |
+
"°¬ ": 1726,
|
| 1729 |
+
"åòß": 1727,
|
| 1730 |
+
"¬ äéí½±": 1728,
|
| 1731 |
+
"ãáãèåä": 1729,
|
| 1732 |
+
" ½ ø": 1730,
|
| 1733 |
+
"áðð": 1731,
|
| 1734 |
+
"ßåíâåä": 1732,
|
| 1735 |
+
"© ª ": 1733,
|
| 1736 |
+
"õðå": 1734,
|
| 1737 |
+
"®¢¢¢ äåæ ßßéîéôßߨ óåìæ¬ ": 1735,
|
| 1738 |
+
"äòïðïõôº æìïáô ½ °": 1736,
|
| 1739 |
+
"äòïðïõôº æìïáô ½ °®°": 1737,
|
| 1740 |
+
"ó ª óåìæ®èåáäßäéí": 1738,
|
| 1741 |
+
"¬ âéáó½Æáìóå© óåìæ®÷": 1739,
|
| 1742 |
+
"áôôîß": 1740,
|
| 1743 |
+
"äòïðïõô ½ îî®": 1741,
|
| 1744 |
+
"äòïðïõô ½ îî®Ä": 1742,
|
| 1745 |
+
"äòïðïõô ½ îî®Äòï": 1743,
|
| 1746 |
+
"äòïðïõô ½ îî®Äòïðïõô¨": 1744,
|
| 1747 |
+
"òåóéä": 1745,
|
| 1748 |
+
"º ôïòãè®Ôåîóïò": 1746,
|
| 1749 |
+
"º ôïòãè®Ôåîóïò¬ ": 1747,
|
| 1750 |
+
" ½ óåìæ®÷": 1748,
|
| 1751 |
+
"©®öéå÷¨": 1749,
|
| 1752 |
+
"©®öéå÷¨âóú¬ óåñßìåî": 1750,
|
| 1753 |
+
"©®öéå÷¨âóú¬ óåñßìåî¬ óåìæ®": 1751,
|
| 1754 |
+
"ôòáîóðïó娱¬ ": 1752,
|
| 1755 |
+
" éó îïô Îïîå": 1753,
|
| 1756 |
+
"ö ½ ": 1754,
|
| 1757 |
+
"ë¬ ö": 1755,
|
| 1758 |
+
"áô ": 1756,
|
| 1759 |
+
"óãïòå": 1757,
|
| 1760 |
+
"±©": 1758,
|
| 1761 |
+
"áôôî": 1759,
|
| 1762 |
+
"Æ®ó": 1760,
|
| 1763 |
+
"¬ ðáóôßëåùßöáìõå": 1761,
|
| 1764 |
+
"Æïò": 1762,
|
| 1765 |
+
"Óéîç": 1763,
|
| 1766 |
+
"Óéîçìå ": 1764,
|
| 1767 |
+
"ðòå": 1765,
|
| 1768 |
+
"äéí© óåìæ®": 1766,
|
| 1769 |
+
"ä ½ ": 1767,
|
| 1770 |
+
"´°¹": 1768,
|
| 1771 |
+
"´°¹¶": 1769,
|
| 1772 |
+
"¸¬ ": 1770,
|
| 1773 |
+
"±´": 1771,
|
| 1774 |
+
"±´³": 1772,
|
| 1775 |
+
"±´³³": 1773,
|
| 1776 |
+
"±´³³¶": 1774,
|
| 1777 |
+
"ôéåß÷åéçèô": 1775,
|
| 1778 |
+
"óé": 1776,
|
| 1779 |
+
"ó ¨": 1777,
|
| 1780 |
+
"ãïîæéç ½ ": 1778,
|
| 1781 |
+
" óåìæ®": 1779,
|
| 1782 |
+
"ôïëåîßåíâåääéîç": 1780,
|
| 1783 |
+
"¬ ãïîæéç®": 1781,
|
| 1784 |
+
"éîô ": 1782,
|
| 1785 |
+
"çåôßîõíßðáòáí": 1783,
|
| 1786 |
+
"éîéôéáìéúå": 1784,
|
| 1787 |
+
"åìéæ ": 1785,
|
| 1788 |
+
"îõíåì": 1786,
|
| 1789 |
+
"æìïáô¨§": 1787,
|
| 1790 |
+
"éîæ§": 1788,
|
| 1791 |
+
"óÛéÝ": 1789,
|
| 1792 |
+
" éæ ": 1790,
|
| 1793 |
+
"åìóå ": 1791,
|
| 1794 |
+
"íáøßîå÷ßôïëåîóº éîô ½ ": 1792,
|
| 1795 |
+
"áõ": 1793,
|
| 1796 |
+
"åöáì¨": 1794,
|
| 1797 |
+
" £ ": 1795,
|
| 1798 |
+
"ìïçéôóÛ": 1796,
|
| 1799 |
+
"¡½ ": 1797,
|
| 1800 |
+
"éîç éæ ": 1798,
|
| 1801 |
+
" ¾ ": 1799,
|
| 1802 |
+
"Û®®®": 1800,
|
| 1803 |
+
"ðòïâ": 1801,
|
| 1804 |
+
"±Ý": 1802,
|
| 1805 |
+
"áôôåò¨": 1803,
|
| 1806 |
+
" òåôõòî ": 1804,
|
| 1807 |
+
"© òåôõòî ": 1805,
|
| 1808 |
+
"¨ãïîæéç": 1806,
|
| 1809 |
+
"©éæ ßßîáíåßß ½½ ": 1807,
|
| 1810 |
+
"¢ßßíáéîßß": 1808,
|
| 1811 |
+
"¢ßßíáéîßߢ": 1809,
|
| 1812 |
+
"¢ßßíáéîßߢº ": 1810,
|
| 1813 |
+
"¢ßßíáéîßߢº £ ": 1811,
|
| 1814 |
+
"¢ßßíáéîßߢº £ Ôåóô ": 1812,
|
| 1815 |
+
"óº û": 1813,
|
| 1816 |
+
"ïõôðõôó ½ íïäåì": 1814,
|
| 1817 |
+
"ïõôðõôóÛ§ìïó": 1815,
|
| 1818 |
+
"ïõôðõôóÛ§ìïóó§Ý": 1816,
|
| 1819 |
+
"ý¢©<|endoftext|>": 1817,
|
| 1820 |
+
"òé": 1818,
|
| 1821 |
+
"Äé": 1819,
|
| 1822 |
+
"Äéóôòéâõôå": 1820,
|
| 1823 |
+
"ÆÓ": 1821,
|
| 1824 |
+
"ÆÓÄÐ": 1822,
|
| 1825 |
+
"ÄÄÐ": 1823,
|
| 1826 |
+
"áòçðáò": 1824,
|
| 1827 |
+
"áòçðáòóå": 1825,
|
| 1828 |
+
"æó": 1826,
|
| 1829 |
+
"æóä": 1827,
|
| 1830 |
+
"õôéìó®": 1828,
|
| 1831 |
+
"Ôåøô": 1829,
|
| 1832 |
+
"ÔåøôÆéìå": 1830,
|
| 1833 |
+
"õðßäé": 1831,
|
| 1834 |
+
"õðßäéóôòéâõôå": 1832,
|
| 1835 |
+
"õðßäéóôòéâõôåä¨": 1833,
|
| 1836 |
+
"òáîë¬ ": 1834,
|
| 1837 |
+
"óèõ": 1835,
|
| 1838 |
+
"óèõææ": 1836,
|
| 1839 |
+
"óèõææìå": 1837,
|
| 1840 |
+
"©º ¢¢¢Ãòåáôå ": 1838,
|
| 1841 |
+
"íïäåì®ôòáéî¨": 1839,
|
| 1842 |
+
"éîðõôßéäó§Ý": 1840,
|
| 1843 |
+
"ìáâåìó§Ý": 1841,
|
| 1844 |
+
"ïðôéíéúåò®": 1842,
|
| 1845 |
+
"äáôå": 1843,
|
| 1846 |
+
"éôåí¨": 1844,
|
| 1847 |
+
"öáìßìïáäåò": 1845,
|
| 1848 |
+
"¬ äåöéãå¬ ": 1846,
|
| 1849 |
+
"âåóôß": 1847,
|
| 1850 |
+
"ôòáéîßéôåòáôïò": 1848,
|
| 1851 |
+
"éôåò¨": 1849,
|
| 1852 |
+
"èáó": 1850,
|
| 1853 |
+
"èáóáô": 1851,
|
| 1854 |
+
"èáóáôôò": 1852,
|
| 1855 |
+
"ôéíå®": 1853,
|
| 1856 |
+
"ôéíå®ôéíå": 1854,
|
| 1857 |
+
"íáøßóôåð": 1855,
|
| 1858 |
+
"îåøô¨": 1856,
|
| 1859 |
+
"Ìïç": 1857,
|
| 1860 |
+
"Ìïçç": 1858,
|
| 1861 |
+
"ý¯": 1859,
|
| 1862 |
+
"óáöåßãèåãëðïéîô¨": 1860,
|
| 1863 |
+
"áòçó®ïõôðõôßäéò ¯ ": 1861,
|
| 1864 |
+
"ðô§": 1862,
|
| 1865 |
+
"© ": 1863,
|
| 1866 |
+
"óº âòåáë": 1864,
|
| 1867 |
+
"ãïíðìå": 1865,
|
| 1868 |
+
"éî û": 1866,
|
| 1869 |
+
"äáôáßìïáäåò": 1867,
|
| 1870 |
+
"ôïôáìßìïó": 1868,
|
| 1871 |
+
"óÝ": 1869,
|
| 1872 |
+
"íáéî¨": 1870,
|
| 1873 |
+
"Áò": 1871,
|
| 1874 |
+
"Ôòáéî ": 1872,
|
| 1875 |
+
"áòçó ": 1873,
|
| 1876 |
+
"áòçó ðáòóåò®áääßáòçõíåîô¨§": 1874,
|
| 1877 |
+
"°°°© ðáòóåò®áääßáòçõíåîô¨§": 1875,
|
| 1878 |
+
"íáø": 1876,
|
| 1879 |
+
"²°": 1877,
|
| 1880 |
+
"²°´": 1878,
|
| 1881 |
+
"ìåáòîéîç": 1879,
|
| 1882 |
+
"òáôå": 1880,
|
| 1883 |
+
"òåóõí": 1881,
|
| 1884 |
+
"õóå": 1882,
|
| 1885 |
+
"éóßäéóôòéâõôåä": 1883,
|
| 1886 |
+
" £ ": 1884,
|
| 1887 |
+
"éæ áòçó®òáîë ½½ °º ": 1885,
|
| 1888 |
+
"äéò¨": 1886,
|
| 1889 |
+
"ó½Ôòõå": 1887,
|
| 1890 |
+
"ÇÐ": 1888,
|
| 1891 |
+
"ôïòãè®ãõäá®éóß": 1889,
|
| 1892 |
+
"ôïòãè®ãõäá®éóßáöáéìáâìå": 1890,
|
| 1893 |
+
"ãðõ": 1891,
|
| 1894 |
+
"ãðõ§": 1892,
|
| 1895 |
+
"© £ Ìïáä ": 1893,
|
| 1896 |
+
"ôïëåîéúåò ": 1894,
|
| 1897 |
+
"ïó®ðáôè®": 1895,
|
| 1898 |
+
"ïó®ðáôè®åøéóô": 1896,
|
| 1899 |
+
"ïó®ðáôè®åøéóôó¨": 1897,
|
| 1900 |
+
"Ìïáäéîç ": 1898,
|
| 1901 |
+
"ôïëåîéúåò æòïí ": 1899,
|
| 1902 |
+
"ý¢© ": 1900,
|
| 1903 |
+
"ôïëåîéúåò ½ ÂùôåÌåöåìÂÐÅÔïëåîéúåò": 1901,
|
| 1904 |
+
"ìïáä¨áòçó®": 1902,
|
| 1905 |
+
"áôéîç ": 1903,
|
| 1906 |
+
"æ®": 1904,
|
| 1907 |
+
"®óôòéð¨": 1905,
|
| 1908 |
+
"§©º ": 1906,
|
| 1909 |
+
"äáôᬠ": 1907,
|
| 1910 |
+
"äáôáóåô ½ ×éëéðåäéáÄáôáóåô¨": 1908,
|
| 1911 |
+
"¬ íáøßìåîçôè": 1909,
|
| 1912 |
+
"ó¨íïäåì¬��": 1910,
|
| 1913 |
+
" ½ ôïòãè®ãõäá®": 1911,
|
| 1914 |
+
"®®®¢© ": 1912,
|
| 1915 |
+
"öïãáâ ½ ûý": 1913,
|
| 1916 |
+
"¾§º ": 1914,
|
| 1917 |
+
"ßçåôß": 1915,
|
| 1918 |
+
"ßçåôßóôáô": 1916,
|
| 1919 |
+
"ó ïæ ": 1917,
|
| 1920 |
+
"äåæáõìôäéãô¨": 1918,
|
| 1921 |
+
"äåæáõìôäéãô¨éîô": 1919,
|
| 1922 |
+
"äåæáõìôäéãô¨éîô© ": 1920,
|
| 1923 |
+
"äåæáõìôäéãô¨éîô© æïò ": 1921,
|
| 1924 |
+
" éî öïãáâ": 1922,
|
| 1925 |
+
" ±": 1923,
|
| 1926 |
+
"é « ±Ý": 1924,
|
| 1927 |
+
"©Ý ": 1925,
|
| 1928 |
+
"íåòçåß": 1926,
|
| 1929 |
+
"íåòçåßöïãáâ": 1927,
|
| 1930 |
+
"¬ öïãáâ": 1928,
|
| 1931 |
+
"âéçòáí": 1929,
|
| 1932 |
+
" ½ òå®": 1930,
|
| 1933 |
+
"򧨿": 1931,
|
| 1934 |
+
"¡ÜÓ©§": 1932,
|
| 1935 |
+
"ôïëåîéúå": 1933,
|
| 1936 |
+
"Üð": 1934,
|
| 1937 |
+
"Üðû": 1935,
|
| 1938 |
+
"Üó": 1936,
|
| 1939 |
+
"ôåøôóº ÌéóôÛ": 1937,
|
| 1940 |
+
"ôåøôóº ÌéóôÛóôòÝ": 1938,
|
| 1941 |
+
"öïãáâ ½ ": 1939,
|
| 1942 |
+
"ôåøô éî ôåøô": 1940,
|
| 1943 |
+
"ìï÷": 1941,
|
| 1944 |
+
"¼¯÷": 1942,
|
| 1945 |
+
"¼¯÷¾": 1943,
|
| 1946 |
+
"÷ïòä ½ ": 1944,
|
| 1947 |
+
"æïòíáô": 1945,
|
| 1948 |
+
"Áä": 1946,
|
| 1949 |
+
"ó ôï ": 1947,
|
| 1950 |
+
"©º éæ ": 1948,
|
| 1951 |
+
"îïô éî ": 1949,
|
| 1952 |
+
"îïô éî óåìæ®öïãáâ": 1950,
|
| 1953 |
+
"óåìæ®öïãáâÛ": 1951,
|
| 1954 |
+
"Ý ½ ìåî¨óåìæ®öïãáâ": 1952,
|
| 1955 |
+
"îõíßíåòçåó": 1953,
|
| 1956 |
+
"º âïïì": 1954,
|
| 1957 |
+
"º âïïì ½ Ôòõå": 1955,
|
| 1958 |
+
"Åîãïäå": 1956,
|
| 1959 |
+
" ÉÄ": 1957,
|
| 1960 |
+
"÷ïòäßôïëåî": 1958,
|
| 1961 |
+
"®çåô¨": 1959,
|
| 1962 |
+
"Äåãïäå": 1960,
|
| 1963 |
+
"òåöåòóåß": 1961,
|
| 1964 |
+
"ó ½ ÛÝ æïò ": 1962,
|
| 1965 |
+
"¨óåìæ¬ ðáôèº óôò": 1963,
|
| 1966 |
+
"¨óåìæ¬ ðáôèº óôò©º ¢¢¢": 1964,
|
| 1967 |
+
"æéì客¢¢ ": 1965,
|
| 1968 |
+
"÷éôè ïðåî¨ðáôè": 1966,
|
| 1969 |
+
"â§© áó æº ": 1967,
|
| 1970 |
+
"ðéãëìå®": 1968,
|
| 1971 |
+
"äáôá ½ ": 1969,
|
| 1972 |
+
"ìïá䍿": 1970,
|
| 1973 |
+
" ½ óåìæ®óðåãéáìßôïëåîóÛ§¼": 1971,
|
| 1974 |
+
"¾§Ý óåìæ®": 1972,
|
| 1975 |
+
"æòïí û": 1973,
|
| 1976 |
+
"óåñ éî ": 1974,
|
| 1977 |
+
"óåñ éî åîãïäåä": 1975,
|
| 1978 |
+
"íáøßìåî": 1976,
|
| 1979 |
+
"éîðõôßéäó§º ": 1977,
|
| 1980 |
+
"áôôåîôéïîßíáó맺 ": 1978,
|
| 1981 |
+
"âùôåóßôïß": 1979,
|
| 1982 |
+
"âùôåóßôïßõîéãïäå": 1980,
|
| 1983 |
+
"âùôåìåöåì ": 1981,
|
| 1984 |
+
"õôæ": 1982,
|
| 1985 |
+
"õôæ¸": 1983,
|
| 1986 |
+
"ãïòðõó ": 1984,
|
| 1987 |
+
"é ¼ ": 1985,
|
| 1988 |
+
"é ¼ ìåî¨": 1986,
|
| 1989 |
+
"é ¼ ìåî¨÷ïòä": 1987,
|
| 1990 |
+
" áîä ÷ïòäÛ": 1988,
|
| 1991 |
+
"º îå÷ß÷ïòä": 1989,
|
| 1992 |
+
"º îå÷ß÷ïòä®áððåîä¨": 1990,
|
| 1993 |
+
"© ôïëåîéúåò®": 1991,
|
| 1994 |
+
"óáíðìåßôåøô": 1992,
|
| 1995 |
+
"ó ½ Û ": 1993,
|
| 1996 |
+
"¬ Ý": 1994,
|
| 1997 |
+
"ôïëåîéúåò®äåãïäå¨": 1995,
|
| 1998 |
+
"òåñõåóô": 1996,
|
| 1999 |
+
"Éôåòáâìå": 1997,
|
| 2000 |
+
"ÉôåòáâìåÄáôáóåô": 1998,
|
| 2001 |
+
"íéîßáòôéãìå": 1999
|
| 2002 |
+
}
|