mys committed on
Commit
02f62c3
1 Parent(s): b45851d

Use conf values from `config.py`

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. create_index.py +4 -18
  3. requirements-indexing.txt +5 -0
README.md CHANGED
@@ -11,4 +11,4 @@ license: apache-2.0
11
  ---
12
 
13
  # hf-spaces-demo
14
- A semantic search demo on HuggingFace Spaces backed by Qdrant Cloud
 
11
  ---
12
 
13
  # hf-spaces-demo
14
+ A semantic image search demo on HuggingFace Spaces backed by Qdrant Cloud
create_index.py CHANGED
@@ -13,16 +13,10 @@ from qdrant_client import QdrantClient
13
  from qdrant_client.models import Distance, Record, VectorParams, OptimizersConfigDiff, Payload
14
 
15
 
16
- LOGGER = logging.getLogger(__name__)
17
 
 
18
 
19
- def truncate(n, decimals=0):
20
- try:
21
- multiplier = 10 ** decimals
22
- return int(n * multiplier) / multiplier
23
- except:
24
- LOGGER.warn(f"Cannot {n} as a number, returning 0.0")
25
- return 0.0
26
 
27
 
28
  def get_vector_size_and_number(img_emb_files):
@@ -43,9 +37,7 @@ def get_embeddings_and_records(img_emb_files, txt_emb_files, metadata_files):
43
  payload_data.drop(columns=["image_path", "hash", "key", "status",
44
  "error_message", "width", "height", "exif", "sha256", "original_width", "original_height"], errors="ignore", inplace=True)
45
  payload_data = payload_data.to_dict(orient='records')
46
- payload_data = [{k: truncate(v, 5) if not isinstance(
47
- v, str) else v for k, v in p.items()} for p in payload_data]
48
-
49
  img_embeddings = np.load(img_file)
50
  txt_embeddings = np.load(txt_file)
51
 
@@ -57,12 +49,6 @@ def get_embeddings_and_records(img_emb_files, txt_emb_files, metadata_files):
57
 
58
  def clip_index(
59
  embeddings_folder,
60
- collection_name,
61
- host="localhost",
62
- api_key=None,
63
- port=6333,
64
- grpc_port=6334,
65
- prefer_grpc=True,
66
  batch_size=64,
67
  parallel=2,
68
  max_retries=5,
@@ -71,7 +57,7 @@ def clip_index(
71
  ):
72
  """indexes clip embeddings using Qdrant"""
73
  client = QdrantClient(
74
- host=host, api_key=api_key, prefer_grpc=prefer_grpc, port=port, grpc_port=grpc_port)
75
 
76
  image_folder = f"{embeddings_folder}/{image_subfolder}"
77
  text_folder = f"{embeddings_folder}/{text_subfolder}"
 
13
  from qdrant_client.models import Distance, Record, VectorParams, OptimizersConfigDiff, Payload
14
 
15
 
16
+ from config import api_key, collection_name, host_url
17
 
18
+ LOGGER = logging.getLogger(__name__)
19
 
 
 
 
 
 
 
 
20
 
21
 
22
  def get_vector_size_and_number(img_emb_files):
 
37
  payload_data.drop(columns=["image_path", "hash", "key", "status",
38
  "error_message", "width", "height", "exif", "sha256", "original_width", "original_height"], errors="ignore", inplace=True)
39
  payload_data = payload_data.to_dict(orient='records')
40
+
 
 
41
  img_embeddings = np.load(img_file)
42
  txt_embeddings = np.load(txt_file)
43
 
 
49
 
50
  def clip_index(
51
  embeddings_folder,
 
 
 
 
 
 
52
  batch_size=64,
53
  parallel=2,
54
  max_retries=5,
 
57
  ):
58
  """indexes clip embeddings using Qdrant"""
59
  client = QdrantClient(
60
+ host=host_url, api_key=api_key, prefer_grpc=True)
61
 
62
  image_folder = f"{embeddings_folder}/{image_subfolder}"
63
  text_folder = f"{embeddings_folder}/{text_subfolder}"
requirements-indexing.txt CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ tqdm
3
+ qdrant-client
4
+ fire
5
+ pandas