Spaces:
Runtime error
Runtime error
| from tqdm.autonotebook import tqdm | |
| from qdrant_client import QdrantClient, models | |
| qdrant = QdrantClient(':memory:') # in-memory instance of the vector DB, shared by the functions below; the commented-out block that follows is the remote-server alternative (it would also need `import os`) | |
| # qdrant = QdrantClient( | |
| # url=os.environ['QDRANT_URL'], | |
| # api_key=os.environ['QDRANT_API_KEY'], | |
| # ) | |
def _upload_passage_batch(batch, encoder, collection_name):
    """Encode one batch of passages and upload the resulting records.

    Args:
        batch: List of ``{'payload': entry, 'id': int}`` dicts; each
            ``entry`` must provide the text under the ``'passage'`` key.
        encoder: Object whose ``encode(list_of_str)`` result has ``.tolist()``.
        collection_name: Name of the target collection.
    """
    texts = [item['payload']['passage'] for item in batch]
    vectors = encoder.encode(texts).tolist()
    records = [
        models.Record(
            id=item['id'],
            payload=item['payload'],
            vector=vector,
        )
        for item, vector in zip(batch, vectors)
    ]
    qdrant.upload_records(
        collection_name=collection_name,
        records=records,
        parallel=10,
    )


def build_index(passages, batch_size, start_idx, encoder, collection_name):
    """Create ``collection_name`` and fill it with embeddings of ``passages``.

    If a collection with that name already exists, nothing is changed.
    Otherwise the collection is (re)created and every passage from
    ``passages[start_idx:]`` is encoded and uploaded in batches of at most
    ``batch_size``. Each point's id is its position in ``passages``
    (``start_idx`` + offset within the slice) and its payload is the
    passage entry itself.

    Args:
        passages: Sliceable sequence of entries; each entry must expose the
            text to embed under the ``'passage'`` key.
        batch_size: Maximum number of passages encoded/uploaded per call.
        start_idx: Index of the first passage to upload.
        encoder: Embedding model providing
            ``get_sentence_embedding_dimension()`` and ``encode()``.
        collection_name: Name of the collection to create.

    Returns:
        ``True`` once the index has been built, or ``None`` if the
        collection already existed and was left untouched.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    existing_names = {
        collection.name for collection in qdrant.get_collections().collections
    }
    if collection_name in existing_names:
        print('collection is already there!')
        return

    if batch_size < 1:
        # A non-positive batch size can never fill a batch; fail loudly
        # before an empty collection is created.
        raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')

    qdrant.recreate_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=encoder.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE,
            on_disk=True,  # store the original vectors on disk
        ),
        optimizers_config=models.OptimizersConfigDiff(
            memmap_threshold=10000,
            default_segment_number=5,
            # NOTE(review): per the Qdrant docs a threshold of 0 disables
            # vector indexing -- confirm this is intended.
            indexing_threshold=0,
        ),
        quantization_config=models.BinaryQuantization(
            # keep the binary-quantized vectors in RAM
            binary=models.BinaryQuantizationConfig(always_ram=True),
        ),
    )

    pending = []
    progress = tqdm(
        passages[start_idx:],
        desc=f'Uploading vector embeddings in batch size of {batch_size}',
    )
    for offset, entry in enumerate(progress):
        # Always queue the current entry first so none is skipped when a
        # batch fills up.
        pending.append({
            'payload': entry,
            'id': offset + start_idx,
        })
        if len(pending) >= batch_size:
            _upload_passage_batch(pending, encoder, collection_name)
            pending = []

    # Flush the trailing partial batch, which would otherwise never be sent.
    if pending:
        _upload_passage_batch(pending, encoder, collection_name)

    return True