import os

# Pin GPU enumeration to PCI bus order and expose only physical GPU 2.
# These environment variables MUST be set before torch is imported,
# otherwise CUDA ignores them.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# --- Configuration -------------------------------------------------------
# Embedding model checkpoint to load.
model_id = "Qwen/Qwen3-Embedding-4B"
# Wikipedia language edition to index.
lang_code = "en"
# Destination for the serialized FAISS index.
save_path = (
    "/home/mshahidul/readctrl/data/vector_db/qwen_em/"
    f"{lang_code}_wikipedia_qwen3_index.faiss"
)
# Number of documents encoded per model.encode() call.
batch_size = 8
|
|
| |
| |
# Load the embedding model; bfloat16 weights halve GPU memory use.
model = SentenceTransformer(
    model_id,
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# Stream the Wikipedia dump so the full corpus never has to fit in RAM.
ds = load_dataset(
    "wikimedia/wikipedia",
    f"20231101.{lang_code}",
    split='train',
    streaming=True,
)
|
|
def embed_wikipedia(dataset, model, batch_size):
    """Encode every document in ``dataset`` and build a FAISS L2 index.

    Args:
        dataset: iterable of dicts with a ``'text'`` field (e.g. a
            streaming HuggingFace dataset).
        model: object exposing ``.encode(list[str], show_progress_bar=...)``
            returning a 2-D array of embeddings (e.g. SentenceTransformer).
        batch_size: number of texts encoded per ``.encode`` call.

    Returns:
        A ``faiss.IndexFlatL2`` with one vector per document, or ``None``
        if the dataset yielded no documents.
    """
    index = None
    batch_texts = []
    n_docs = 0
    print("Starting embedding process...")

    def _flush(texts):
        # Encode one batch and add it to the index. The index is created
        # lazily on the first batch, once the embedding dimension is known.
        nonlocal index
        embeddings = np.asarray(
            model.encode(texts, show_progress_bar=False), dtype="float32"
        )
        if index is None:
            index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

    for item in dataset:
        batch_texts.append(item['text'])
        n_docs += 1

        if len(batch_texts) == batch_size:
            _flush(batch_texts)
            batch_texts = []

        if n_docs % 100 == 0:
            print(f"Processed {n_docs} documents...")

    # BUG FIX: the original version silently dropped the final partial
    # batch whenever the corpus size was not a multiple of batch_size.
    if batch_texts:
        _flush(batch_texts)

    return index
|
|
| |
# --- Build and persist the index -----------------------------------------
vector_index = embed_wikipedia(ds, model, batch_size)
if vector_index is None:
    # Fail loudly instead of handing faiss a None index.
    raise RuntimeError("Dataset yielded no documents; nothing to index.")
# Ensure the target directory exists; faiss.write_index does not create it.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
faiss.write_index(vector_index, save_path)
print(f"Index saved to {save_path}")