Tokenizer in JS is working
Browse files- README.md +66 -0
- build_models.py +68 -0
- js/example.mjs +77 -0
- js/model.json +0 -0
- js/package.json +15 -0
- js/tsconfig.json +23 -0
- multilingual.py +40 -1
- pyproject.toml +2 -0
- uv.lock +44 -0
README.md
CHANGED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
- ar
|
| 5 |
+
- bg
|
| 6 |
+
- ca
|
| 7 |
+
- cs
|
| 8 |
+
- da
|
| 9 |
+
- de
|
| 10 |
+
- el
|
| 11 |
+
- es
|
| 12 |
+
- et
|
| 13 |
+
- fa
|
| 14 |
+
- fi
|
| 15 |
+
- fr
|
| 16 |
+
- gl
|
| 17 |
+
- gu
|
| 18 |
+
- he
|
| 19 |
+
- hi
|
| 20 |
+
- hu
|
| 21 |
+
- hy
|
| 22 |
+
- id
|
| 23 |
+
- it
|
| 24 |
+
- ja
|
| 25 |
+
- ka
|
| 26 |
+
- ko
|
| 27 |
+
- ku
|
| 28 |
+
- lt
|
| 29 |
+
- lv
|
| 30 |
+
- mk
|
| 31 |
+
- mn
|
| 32 |
+
- mr
|
| 33 |
+
- ms
|
| 34 |
+
- my
|
| 35 |
+
- nb
|
| 36 |
+
- nl
|
| 37 |
+
- pl
|
| 38 |
+
- pt
|
| 39 |
+
- ro
|
| 40 |
+
- ru
|
| 41 |
+
- sk
|
| 42 |
+
- sl
|
| 43 |
+
- sq
|
| 44 |
+
- sr
|
| 45 |
+
- sv
|
| 46 |
+
- th
|
| 47 |
+
- tr
|
| 48 |
+
- uk
|
| 49 |
+
- ur
|
| 50 |
+
- vi
|
| 51 |
+
- zh
|
| 52 |
+
- hr
|
| 53 |
+
license: apache-2.0
|
| 54 |
+
---
|
| 55 |
+
# Static Embeddings
|
| 56 |
+
|
| 57 |
+
This project contains multilingual static embeddings that are appropriate for generating
|
| 58 |
+
quick embeddings on edge devices. They are re-packaged from other projects into
|
| 59 |
+
production-ready assets.
|
| 60 |
+
|
| 61 |
+
## Current model
|
| 62 |
+
|
| 63 |
+
The current model is using [sentence-transformers/static-similarity-mrl-multilingual-v1](https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1)
|
| 64 |
+
as it is multilingual and uses Matryoshka Loss, which allows for the arbitrary truncation
|
| 65 |
+
of the embedding vectors. The length of the vector can be tuned to the use case while still
|
| 66 |
+
retaining the semantic meaning.
|
build_models.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from zstandard import ZstdCompressor
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import io
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from torch.nn import EmbeddingBag
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def save_data(path: Path, tensor: torch.Tensor):
    """Write a tensor to a zstd-compressed ``.npy`` file.

    The tensor is serialized with ``np.save`` into an in-memory buffer, and
    the resulting bytes are streamed through a ``ZstdCompressor`` into
    ``path``.

    Args:
        path: Destination file; its name must end in ``.npy.zst``. Missing
            parent directories are created.
        tensor: The tensor to serialize.

    Raises:
        AssertionError: If ``path`` does not end in ``.npy.zst``.
    """
    assert str(path).endswith(".npy.zst")

    # ``Tensor.numpy()`` refuses tensors that require grad (such as module
    # weights) and tensors that are not on the CPU, so normalize first.
    array = tensor.detach().cpu().numpy()

    buffer = io.BytesIO()
    np.save(buffer, array)

    # ``open`` does not create directories; make sure the target one exists.
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    with (
        open(path, "wb") as outfile,
        ZstdCompressor().stream_writer(outfile) as writer,
    ):
        writer.write(buffer.getvalue())
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Directory that receives the generated ``.npy.zst`` embedding files.
data_path: Path = Path("embeddings")

# Model whose embedding weights are exported by this script.
model_name: str = "sentence-transformers/static-similarity-mrl-multilingual-v1"
# Expected shape of that model's embedding matrix: (vocab_size, dimensions).
vocab_size: int = 105_879
dimensions: int = 1024
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_embeddings():
    """Export the model's embedding matrix at several widths and dtypes.

    Loads ``model_name`` with ``SentenceTransformer``, takes the weight
    matrix of the first module's embedding bag, prints the uncompressed size
    of a few float32/float16 variants, and then writes fp32, fp16 and int8
    files under ``data_path`` for every truncated width.

    Raises:
        AssertionError: If the weight matrix, or one of its truncated
            slices, does not have the expected shape.
    """
    model = SentenceTransformer(model_name)
    embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
    # Detach from autograd and move to the CPU so the weights can be turned
    # into NumPy arrays when they are saved.
    embeddings = embedding_bag.weight.detach().cpu()

    print(embeddings.shape)
    assert embeddings.shape == torch.Size([vocab_size, dimensions])

    # Report the raw (uncompressed) payload size per dtype and width.
    rows = embeddings.shape[0]
    for label, bytes_per_value in (("float32", 4), ("float16", 2)):
        print(label)
        for dim in (1024, 512, 256):
            size_mib = rows * dim * bytes_per_value / 1024 / 1024
            print(f" {dim} dim - {size_mib:,.1f} MiB")

    for dim in (1024, 512, 384, 256, 128):
        # Keep only the leading ``dim`` columns of every row.
        truncated = embeddings[:, :dim]
        assert truncated.shape == torch.Size([vocab_size, dim])

        # Save the truncated slice, not the full matrix, so that each file
        # really has the width its name advertises.
        save_data(data_path / f"static-embeddings.{dim}.fp32.npy.zst", truncated)
        save_data(
            data_path / f"static-embeddings.{dim}.fp16.npy.zst",
            truncated.to(dtype=torch.float16),
        )
        # NOTE(review): this is a plain cast with no scaling, so fractional
        # values are truncated toward zero — confirm whether a scaled
        # quantization is intended for the int8 files.
        save_data(
            data_path / f"static-embeddings.{dim}.int8.npy.zst",
            truncated.to(dtype=torch.int8),
        )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main() -> None:
    """Script entry point: runs ``load_embeddings``."""
    load_embeddings()


# Only run the export when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
js/example.mjs
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { pipeline, AutoTokenizer, AutoModel, TokenizerModel } from '@huggingface/transformers';
|
| 2 |
+
import fs from 'node:fs/promises';
|
| 3 |
+
import { constants } from 'node:fs';
|
| 4 |
+
import path from 'path';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
|
| 7 |
+
// Directory that contains this module; tokenizer.json is stored next to it.
const DIR = path.dirname(fileURLToPath(import.meta.url));

await main();

/**
 * Tokenize a list of multilingual sample sentences and log the tokens of
 * each one.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const url = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json";

  // Called for its side effect only: makes sure tokenizer.json exists locally.
  await ensureTokenizerJson(url);

  // NOTE(review): presumably "./" makes the library pick up the local
  // tokenizer files — confirm how the path is resolved.
  const tokenizer = await AutoTokenizer.from_pretrained("./");

  const examples = [
    "This is an example of encoding",
    "The quick brown fox jumps over the lazy dog.",
    "Curaçao, naïve fiancé, jalapeño, déjà vu.",
    "Привет, как дела?",
    "Бързата кафява лисица прескача мързеливото куче.",
    "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
    "اللغة العربية جميلة وغنية بالتاريخ.",
    "مرحبا بالعالم!",
    "Simplified: 快速的棕色狐狸跳过懒狗。",
    "Traditional: 快速的棕色狐狸跳過懶狗。",
    "素早い茶色の狐が怠け者の犬を飛び越える。",
    "コンピュータープログラミング",
    "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
    "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
    "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
    "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
    "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
    "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
    // Mixed scripts:
    "Hello 世界 مرحبا 🌍",
    "123, αβγ, абв, العربية, 中文, हिन्दी.",
  ];

  for (const sentence of examples) {
    console.log(tokenizer.tokenize(sentence));
  }
}
|
| 46 |
+
|
| 47 |
+
/**
 * Read a UTF-8 encoded file and parse its contents as JSON.
 *
 * @param {string} path - Location of the JSON file to read
 * @returns {Promise<any>} - The parsed JSON value
 */
async function loadJSON(path) {
  const text = await fs.readFile(path, { encoding: 'utf8' });
  return JSON.parse(text);
}
|
| 54 |
+
|
| 55 |
+
/**
 * Load tokenizer.json from next to this module, downloading it first if it
 * does not already exist.
 *
 * @param {string} url - The URL to download tokenizer.json from
 * @returns {Promise<any>} - The parsed contents of tokenizer.json
 * @throws {Error} If the download does not return a successful HTTP status
 * @throws {SyntaxError} If the file contents are not valid JSON
 */
export async function ensureTokenizerJson(url) {
  const tokenizerPath = path.join(DIR, 'tokenizer.json');

  // Only a failed existence check means "download"; errors while reading or
  // parsing an existing file still propagate to the caller.
  let exists = true;
  try {
    await fs.access(tokenizerPath, constants.F_OK);
  } catch {
    exists = false;
  }

  if (exists) {
    console.log('Using', tokenizerPath);
    return loadJSON(tokenizerPath);
  }

  console.log("Downloading", url);
  const response = await fetch(url);
  if (!response.ok) {
    // Without this check an HTTP error page would be saved as tokenizer.json
    // and picked up again on every later run.
    throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
  }

  const data = Buffer.from(await response.arrayBuffer());
  // Parse before writing so that an invalid payload is never stored on disk.
  const parsed = JSON.parse(data.toString('utf8'));
  await fs.writeFile(tokenizerPath, data);

  return parsed;
}
|
js/model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
js/package.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "js",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "",
|
| 5 |
+
"main": "index.js",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"test": "echo \"Error: no test specified\" && exit 1"
|
| 8 |
+
},
|
| 9 |
+
"keywords": [],
|
| 10 |
+
"author": "",
|
| 11 |
+
"license": "ISC",
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"@huggingface/transformers": "^3.7.2"
|
| 14 |
+
}
|
| 15 |
+
}
|
js/tsconfig.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"module": "ESNext",
|
| 4 |
+
"moduleResolution": "nodenext",
|
| 5 |
+
// Set the baseUrl to the root of the project.
|
| 6 |
+
"baseUrl": "src",
|
| 7 |
+
// Make the type checking as strict as possible.
|
| 8 |
+
"strict": true,
|
| 9 |
+
// Allow JS files and type check them; with checkJs enabled no @ts-check comment is needed.
|
| 10 |
+
"allowJs": true,
|
| 11 |
+
"checkJs": true,
|
| 12 |
+
// Only type check, don't emit files.
|
| 13 |
+
"noEmit": true,
|
| 14 |
+
// Allow esnext syntax. Otherwise the default is ES5 only.
|
| 15 |
+
"target": "esnext",
|
| 16 |
+
"lib": ["esnext", "dom"],
|
| 17 |
+
"esModuleInterop": true
|
| 18 |
+
},
|
| 19 |
+
// The files to type check.
|
| 20 |
+
"include": ["example.mjs"],
|
| 21 |
+
// "files": ["src/@types/globals.d.ts"],
|
| 22 |
+
"exclude": []
|
| 23 |
+
}
|
multilingual.py
CHANGED
|
@@ -1,10 +1,49 @@
|
|
| 1 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 2 |
from torch.nn import EmbeddingBag
|
| 3 |
import torch
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
model = SentenceTransformer(
|
| 6 |
-
"sentence-transformers/static-similarity-mrl-multilingual-v1"
|
| 7 |
)
|
|
|
|
|
|
|
| 8 |
embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
|
| 9 |
embeddings = torch.Tensor(embedding_bag.weight)
|
| 10 |
|
|
|
|
| 1 |
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from tokenizers import Encoding, Tokenizer
|
| 3 |
from torch.nn import EmbeddingBag
|
| 4 |
import torch
|
| 5 |
|
| 6 |
+
examples = [
|
| 7 |
+
"This is an example of encoding",
|
| 8 |
+
"The quick brown fox jumps over the lazy dog.",
|
| 9 |
+
"Curaçao, naïve fiancé, jalapeño, déjà vu.",
|
| 10 |
+
"Привет, как дела?",
|
| 11 |
+
"Бързата кафява лисица прескача мързеливото куче.",
|
| 12 |
+
"Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
|
| 13 |
+
"اللغة العربية جميلة وغنية بالتاريخ.",
|
| 14 |
+
"مرحبا بالعالم!",
|
| 15 |
+
"Simplified: 快速的棕色狐狸跳过懒狗。",
|
| 16 |
+
"Traditional: 快速的棕色狐狸跳過懶狗。",
|
| 17 |
+
"素早い茶色の狐が怠け者の犬を飛び越える。",
|
| 18 |
+
"コンピュータープログラミング",
|
| 19 |
+
"빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
|
| 20 |
+
"तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
|
| 21 |
+
"দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
|
| 22 |
+
"வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
|
| 23 |
+
"สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
|
| 24 |
+
"ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
|
| 25 |
+
"Hello 世界 مرحبا 🌍",
|
| 26 |
+
"123, αβγ, абв, العربية, 中文, हिन्दी.",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
# Load the tokenizer definition; the path is relative to the current working
# directory.
tokenizer: Tokenizer = Tokenizer.from_file("js/tokenizer.json")

# Print every sample followed by the tokens it is split into.
for example in examples:
    encoding: Encoding = tokenizer.encode(example)
    print(example)
    print(encoding.tokens)
    print()


print("!!! tokenizer", tokenizer)
# NOTE(review): debugging halt — nothing below this line runs while it is in
# place (and it is skipped entirely under ``python -O``).
assert False
|
| 41 |
+
# https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1
|
| 42 |
model = SentenceTransformer(
|
| 43 |
+
"sentence-transformers/static-similarity-mrl-multilingual-v1", device="cpu"
|
| 44 |
)
|
| 45 |
+
embeddings = model.encode(examples)
|
| 46 |
+
|
| 47 |
embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
|
| 48 |
embeddings = torch.Tensor(embedding_bag.weight)
|
| 49 |
|
pyproject.toml
CHANGED
|
@@ -6,5 +6,7 @@ readme = "README.md"
|
|
| 6 |
requires-python = ">=3.13"
|
| 7 |
dependencies = [
|
| 8 |
"model2vec>=0.6.0",
|
|
|
|
| 9 |
"sentence-transformers>=5.1.0",
|
|
|
|
| 10 |
]
|
|
|
|
| 6 |
requires-python = ">=3.13"
|
| 7 |
dependencies = [
|
| 8 |
"model2vec>=0.6.0",
|
| 9 |
+
"numpy>=2.3.2",
|
| 10 |
"sentence-transformers>=5.1.0",
|
| 11 |
+
"zstandard>=0.24.0",
|
| 12 |
]
|
uv.lock
CHANGED
|
@@ -650,13 +650,17 @@ version = "0.1.0"
|
|
| 650 |
source = { virtual = "." }
|
| 651 |
dependencies = [
|
| 652 |
{ name = "model2vec" },
|
|
|
|
| 653 |
{ name = "sentence-transformers" },
|
|
|
|
| 654 |
]
|
| 655 |
|
| 656 |
[package.metadata]
|
| 657 |
requires-dist = [
|
| 658 |
{ name = "model2vec", specifier = ">=0.6.0" },
|
|
|
|
| 659 |
{ name = "sentence-transformers", specifier = ">=5.1.0" },
|
|
|
|
| 660 |
]
|
| 661 |
|
| 662 |
[[package]]
|
|
@@ -834,3 +838,43 @@ sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599
|
|
| 834 |
wheels = [
|
| 835 |
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
|
| 836 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
source = { virtual = "." }
|
| 651 |
dependencies = [
|
| 652 |
{ name = "model2vec" },
|
| 653 |
+
{ name = "numpy" },
|
| 654 |
{ name = "sentence-transformers" },
|
| 655 |
+
{ name = "zstandard" },
|
| 656 |
]
|
| 657 |
|
| 658 |
[package.metadata]
|
| 659 |
requires-dist = [
|
| 660 |
{ name = "model2vec", specifier = ">=0.6.0" },
|
| 661 |
+
{ name = "numpy", specifier = ">=2.3.2" },
|
| 662 |
{ name = "sentence-transformers", specifier = ">=5.1.0" },
|
| 663 |
+
{ name = "zstandard", specifier = ">=0.24.0" },
|
| 664 |
]
|
| 665 |
|
| 666 |
[[package]]
|
|
|
|
| 838 |
wheels = [
|
| 839 |
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
|
| 840 |
]
|
| 841 |
+
|
| 842 |
+
[[package]]
|
| 843 |
+
name = "zstandard"
|
| 844 |
+
version = "0.24.0"
|
| 845 |
+
source = { registry = "https://pypi.org/simple" }
|
| 846 |
+
sdist = { url = "https://files.pythonhosted.org/packages/09/1b/c20b2ef1d987627765dcd5bf1dadb8ef6564f00a87972635099bb76b7a05/zstandard-0.24.0.tar.gz", hash = "sha256:fe3198b81c00032326342d973e526803f183f97aa9e9a98e3f897ebafe21178f", size = 905681, upload-time = "2025-08-17T18:36:36.352Z" }
|
| 847 |
+
wheels = [
|
| 848 |
+
{ url = "https://files.pythonhosted.org/packages/ec/ef/db949de3bf81ed122b8ee4db6a8d147a136fe070e1015f5a60d8a3966748/zstandard-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e4ebb000c0fe24a6d0f3534b6256844d9dbf042fdf003efe5cf40690cf4e0f3e", size = 795700, upload-time = "2025-08-17T18:22:50.851Z" },
|
| 849 |
+
{ url = "https://files.pythonhosted.org/packages/99/56/fc04395d6f5eabd2fe6d86c0800d198969f3038385cb918bfbe94f2b0c62/zstandard-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:498f88f5109666c19531f0243a90d2fdd2252839cd6c8cc6e9213a3446670fa8", size = 640343, upload-time = "2025-08-17T18:22:51.999Z" },
|
| 850 |
+
{ url = "https://files.pythonhosted.org/packages/9b/0f/0b0e0d55f2f051d5117a0d62f4f9a8741b3647440c0ee1806b7bd47ed5ae/zstandard-0.24.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0a9e95ceb180ccd12a8b3437bac7e8a8a089c9094e39522900a8917745542184", size = 5342571, upload-time = "2025-08-17T18:22:53.734Z" },
|
| 851 |
+
{ url = "https://files.pythonhosted.org/packages/5d/43/d74e49f04fbd62d4b5d89aeb7a29d693fc637c60238f820cd5afe6ca8180/zstandard-0.24.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bcf69e0bcddbf2adcfafc1a7e864edcc204dd8171756d3a8f3340f6f6cc87b7b", size = 5062723, upload-time = "2025-08-17T18:22:55.624Z" },
|
| 852 |
+
{ url = "https://files.pythonhosted.org/packages/8e/97/df14384d4d6a004388e6ed07ded02933b5c7e0833a9150c57d0abc9545b7/zstandard-0.24.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:10e284748a7e7fbe2815ca62a9d6e84497d34cfdd0143fa9e8e208efa808d7c4", size = 5393282, upload-time = "2025-08-17T18:22:57.655Z" },
|
| 853 |
+
{ url = "https://files.pythonhosted.org/packages/7e/09/8f5c520e59a4d41591b30b7568595eda6fd71c08701bb316d15b7ed0613a/zstandard-0.24.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1bda8a85e5b9d5e73af2e61b23609a8cc1598c1b3b2473969912979205a1ff25", size = 5450895, upload-time = "2025-08-17T18:22:59.749Z" },
|
| 854 |
+
{ url = "https://files.pythonhosted.org/packages/d9/3d/02aba892327a67ead8cba160ee835cfa1fc292a9dcb763639e30c07da58b/zstandard-0.24.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b14bc92af065d0534856bf1b30fc48753163ea673da98857ea4932be62079b1", size = 5546353, upload-time = "2025-08-17T18:23:01.457Z" },
|
| 855 |
+
{ url = "https://files.pythonhosted.org/packages/6a/6e/96c52afcde44da6a5313a1f6c356349792079808f12d8b69a7d1d98ef353/zstandard-0.24.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:b4f20417a4f511c656762b001ec827500cbee54d1810253c6ca2df2c0a307a5f", size = 5046404, upload-time = "2025-08-17T18:23:03.418Z" },
|
| 856 |
+
{ url = "https://files.pythonhosted.org/packages/da/b6/eefee6b92d341a7db7cd1b3885d42d30476a093720fb5c181e35b236d695/zstandard-0.24.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:337572a7340e1d92fd7fb5248c8300d0e91071002d92e0b8cabe8d9ae7b58159", size = 5576095, upload-time = "2025-08-17T18:23:05.331Z" },
|
| 857 |
+
{ url = "https://files.pythonhosted.org/packages/a3/29/743de3131f6239ba6611e17199581e6b5e0f03f268924d42468e29468ca0/zstandard-0.24.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:df4be1cf6e8f0f2bbe2a3eabfff163ef592c84a40e1a20a8d7db7f27cfe08fc2", size = 4953448, upload-time = "2025-08-17T18:23:07.225Z" },
|
| 858 |
+
{ url = "https://files.pythonhosted.org/packages/c9/11/bd36ef49fba82e307d69d93b5abbdcdc47d6a0bcbc7ffbbfe0ef74c2fec5/zstandard-0.24.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6885ae4b33aee8835dbdb4249d3dfec09af55e705d74d9b660bfb9da51baaa8b", size = 5267388, upload-time = "2025-08-17T18:23:09.127Z" },
|
| 859 |
+
{ url = "https://files.pythonhosted.org/packages/c0/23/a4cfe1b871d3f1ce1f88f5c68d7e922e94be0043f3ae5ed58c11578d1e21/zstandard-0.24.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:663848a8bac4fdbba27feea2926049fdf7b55ec545d5b9aea096ef21e7f0b079", size = 5433383, upload-time = "2025-08-17T18:23:11.343Z" },
|
| 860 |
+
{ url = "https://files.pythonhosted.org/packages/77/26/f3fb85f00e732cca617d4b9cd1ffa6484f613ea07fad872a8bdc3a0ce753/zstandard-0.24.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:05d27c953f2e0a3ecc8edbe91d6827736acc4c04d0479672e0400ccdb23d818c", size = 5813988, upload-time = "2025-08-17T18:23:13.194Z" },
|
| 861 |
+
{ url = "https://files.pythonhosted.org/packages/3d/8c/d7e3b424b73f3ce66e754595cbcb6d94ff49790c9ac37d50e40e8145cd44/zstandard-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77b8b7b98893eaf47da03d262816f01f251c2aa059c063ed8a45c50eada123a5", size = 5359756, upload-time = "2025-08-17T18:23:15.021Z" },
|
| 862 |
+
{ url = "https://files.pythonhosted.org/packages/90/6c/f1f0e11f1b295138f9da7e7ae22dcd9a1bb96a9544fa3b31507e431288f5/zstandard-0.24.0-cp313-cp313-win32.whl", hash = "sha256:cf7fbb4e54136e9a03c7ed7691843c4df6d2ecc854a2541f840665f4f2bb2edd", size = 435957, upload-time = "2025-08-17T18:23:18.835Z" },
|
| 863 |
+
{ url = "https://files.pythonhosted.org/packages/9f/03/ab8b82ae5eb49eca4d3662705399c44442666cc1ce45f44f2d263bb1ae31/zstandard-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:d64899cc0f33a8f446f1e60bffc21fa88b99f0e8208750d9144ea717610a80ce", size = 505171, upload-time = "2025-08-17T18:23:16.44Z" },
|
| 864 |
+
{ url = "https://files.pythonhosted.org/packages/db/12/89a2ecdea4bc73a934a30b66a7cfac5af352beac94d46cf289e103b65c34/zstandard-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:57be3abb4313e0dd625596376bbb607f40059d801d51c1a1da94d7477e63b255", size = 461596, upload-time = "2025-08-17T18:23:17.603Z" },
|
| 865 |
+
{ url = "https://files.pythonhosted.org/packages/c9/56/f3d2c4d64aacee4aab89e788783636884786b6f8334c819f09bff1aa207b/zstandard-0.24.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b7fa260dd2731afd0dfa47881c30239f422d00faee4b8b341d3e597cface1483", size = 795747, upload-time = "2025-08-17T18:23:19.968Z" },
|
| 866 |
+
{ url = "https://files.pythonhosted.org/packages/32/2d/9d3e5f6627e4cb5e511803788be1feee2f0c3b94594591e92b81db324253/zstandard-0.24.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e05d66239d14a04b4717998b736a25494372b1b2409339b04bf42aa4663bf251", size = 640475, upload-time = "2025-08-17T18:23:21.5Z" },
|
| 867 |
+
{ url = "https://files.pythonhosted.org/packages/be/5d/48e66abf8c146d95330e5385633a8cfdd556fa8bd14856fe721590cbab2b/zstandard-0.24.0-cp314-cp314-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:622e1e04bd8a085994e02313ba06fbcf4f9ed9a488c6a77a8dbc0692abab6a38", size = 5343866, upload-time = "2025-08-17T18:23:23.351Z" },
|
| 868 |
+
{ url = "https://files.pythonhosted.org/packages/95/6c/65fe7ba71220a551e082e4a52790487f1d6bb8dfc2156883e088f975ad6d/zstandard-0.24.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:55872e818598319f065e8192ebefecd6ac05f62a43f055ed71884b0a26218f41", size = 5062719, upload-time = "2025-08-17T18:23:25.192Z" },
|
| 869 |
+
{ url = "https://files.pythonhosted.org/packages/cb/68/15ed0a813ff91be80cc2a610ac42e0fc8d29daa737de247bbf4bab9429a1/zstandard-0.24.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bb2446a55b3a0fd8aa02aa7194bd64740015464a2daaf160d2025204e1d7c282", size = 5393090, upload-time = "2025-08-17T18:23:27.145Z" },
|
| 870 |
+
{ url = "https://files.pythonhosted.org/packages/d4/89/e560427b74fa2da6a12b8f3af8ee29104fe2bb069a25e7d314c35eec7732/zstandard-0.24.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:2825a3951f945fb2613ded0f517d402b1e5a68e87e0ee65f5bd224a8333a9a46", size = 5450383, upload-time = "2025-08-17T18:23:29.044Z" },
|
| 871 |
+
{ url = "https://files.pythonhosted.org/packages/a3/95/0498328cbb1693885509f2fc145402b108b750a87a3af65b7250b10bd896/zstandard-0.24.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:09887301001e7a81a3618156bc1759e48588de24bddfdd5b7a4364da9a8fbc20", size = 5546142, upload-time = "2025-08-17T18:23:31.281Z" },
|
| 872 |
+
{ url = "https://files.pythonhosted.org/packages/8a/8a/64aa15a726594df3bf5d8decfec14fe20cd788c60890f44fcfc74d98c2cc/zstandard-0.24.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:98ca91dc9602cf351497d5600aa66e6d011a38c085a8237b370433fcb53e3409", size = 4953456, upload-time = "2025-08-17T18:23:33.234Z" },
|
| 873 |
+
{ url = "https://files.pythonhosted.org/packages/b0/b6/e94879c5cd6017af57bcba08519ed1228b1ebb15681efd949f4a00199449/zstandard-0.24.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:e69f8e534b4e254f523e2f9d4732cf9c169c327ca1ce0922682aac9a5ee01155", size = 5268287, upload-time = "2025-08-17T18:23:35.145Z" },
|
| 874 |
+
{ url = "https://files.pythonhosted.org/packages/fd/e5/1a3b3a93f953dbe9e77e2a19be146e9cd2af31b67b1419d6cc8e8898d409/zstandard-0.24.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:444633b487a711e34f4bccc46a0c5dfbe1aee82c1a511e58cdc16f6bd66f187c", size = 5433197, upload-time = "2025-08-17T18:23:36.969Z" },
|
| 875 |
+
{ url = "https://files.pythonhosted.org/packages/39/83/b6eb1e1181de994b29804e1e0d2dc677bece4177f588c71653093cb4f6d5/zstandard-0.24.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f7d3fe9e1483171e9183ffdb1fab07c5fef80a9c3840374a38ec2ab869ebae20", size = 5813161, upload-time = "2025-08-17T18:23:38.812Z" },
|
| 876 |
+
{ url = "https://files.pythonhosted.org/packages/f6/d3/2fb4166561591e9d75e8e35c79182aa9456644e2f4536f29e51216d1c513/zstandard-0.24.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:27b6fa72b57824a3f7901fc9cc4ce1c1c834b28f3a43d1d4254c64c8f11149d4", size = 5359831, upload-time = "2025-08-17T18:23:41.162Z" },
|
| 877 |
+
{ url = "https://files.pythonhosted.org/packages/11/94/6a9227315b774f64a67445f62152c69b4e5e49a52a3c7c4dad8520a55e20/zstandard-0.24.0-cp314-cp314-win32.whl", hash = "sha256:fdc7a52a4cdaf7293e10813fd6a3abc0c7753660db12a3b864ab1fb5a0c60c16", size = 444448, upload-time = "2025-08-17T18:23:45.151Z" },
|
| 878 |
+
{ url = "https://files.pythonhosted.org/packages/fc/de/67acaba311013e0798cb96d1a2685cb6edcdfc1cae378b297ea7b02c319f/zstandard-0.24.0-cp314-cp314-win_amd64.whl", hash = "sha256:656ed895b28c7e42dd5b40dfcea3217cfc166b6b7eef88c3da2f5fc62484035b", size = 516075, upload-time = "2025-08-17T18:23:42.8Z" },
|
| 879 |
+
{ url = "https://files.pythonhosted.org/packages/10/ae/45fd8921263cea0228b20aa31bce47cc66016b2aba1afae1c6adcc3dbb1f/zstandard-0.24.0-cp314-cp314-win_arm64.whl", hash = "sha256:0101f835da7de08375f380192ff75135527e46e3f79bef224e3c49cb640fef6a", size = 476847, upload-time = "2025-08-17T18:23:43.892Z" },
|
| 880 |
+
]
|