# semantic-query-trainer / link-check.py
# jamescalam's picture
# upgrade to contrastive learning and unsplash lite dataset
# 88172be
import pinecone
import requests
from tqdm.auto import tqdm
import logging
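# Run this script to check for broken image links: each record's URLs are
# requested and the name of the working one (if any) is stored in its metadata.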
# Pinecone credentials and target index. Replace the placeholder with a real
# API key before running.
PINECONE_API_KEY = "<<API_KEY_HERE>>"
INDEX = "unsplash-25k-clip"

# The root logger defaults to WARNING, which would silently drop the
# logging.info() progress messages emitted by this script.
logging.basicConfig(level=logging.INFO)

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="us-west1-gcp"
)
index = pinecone.Index(INDEX)

# Fetch the index stats once and read both values from the same response
# instead of calling describe_index_stats() separately for each field.
index_stats = index.describe_index_stats()
dim = index_stats['dimension']
total = int(index_stats['totalVectorCount'])

# All-zero query vector: the queries below select records through their
# metadata filter, so the vector only needs to have the right dimension.
xq = [0.0] * dim
count = 0  # NOTE(review): not referenced anywhere in the visible script
# IDs of every vector whose links were checked, so the temporary
# 'link_check' flag can be reset once the check is finished.
ID_LIST = []
# Seconds to wait for a link to respond before treating it as broken.
REQUEST_TIMEOUT = 10


def _url_is_live(url):
    """Return True if a GET request to *url* answers with HTTP 200.

    Request failures (timeouts, connection errors, malformed URLs) count as
    a broken link instead of aborting the whole run.
    """
    try:
        # stream=True defers downloading the body; only the status is needed.
        res = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return False
    try:
        return res.status_code == 200
    finally:
        # Release the connection, since the body is never consumed.
        res.close()


def _pick_good_url(metadata):
    """Return the name of the first metadata field holding a working URL.

    Tries 'photo_url' (with the download suffix appended) first, then
    'photo_image_url'. Returns "not_found" when neither responds with
    HTTP 200 or when the field is missing/empty.
    """
    photo_url = metadata.get('photo_url')
    if photo_url and _url_is_live(photo_url + "/download?force=true&w=640"):
        return "photo_url"
    photo_image_url = metadata.get('photo_image_url')
    if photo_image_url and _url_is_live(photo_image_url):
        return "photo_image_url"
    return "not_found"


logging.info("Checking links...")
with tqdm(total=total) as pbar:
    while True:
        # Fetch the next batch of vectors that are not yet flagged as
        # checked; the loop ends once the filter matches nothing.
        # NOTE(review): this relies on the metadata update below being
        # visible to the next query — confirm the index's consistency
        # behaviour, otherwise the same IDs could be returned again.
        xc = index.query(
            xq, top_k=100, include_metadata=True,
            filter={"link_check": {"$ne": True}}
        )
        matches = xc['matches']
        if not matches:
            break
        for match in matches:
            good_url = _pick_good_url(match['metadata'])
            # Record the result and flag the vector so that the filter
            # above excludes it from subsequent batches.
            index.update(match['id'], set_metadata={
                'good_url': good_url,
                'link_check': True
            })
            ID_LIST.append(match['id'])
            pbar.update(1)
# Clear the temporary marker: set 'link_check' back to False on every vector
# whose ID was collected in ID_LIST during the check.
logging.info("Refreshing 'link_check' field...")
for vector_id in tqdm(ID_LIST):
    reset_payload = {'link_check': False}
    index.update(vector_id, set_metadata=reset_payload)