"""Check the photo links stored in a Pinecone index for broken URLs.

For every vector whose ``link_check`` metadata flag is not ``True`` the
script probes ``photo_url`` (with a download suffix) and, if that fails,
``photo_image_url``. The name of the first working field -- or
``"not_found"`` -- is written to the vector's ``good_url`` metadata field.
``link_check`` is set to ``True`` while the run is in progress so that
already-processed vectors drop out of the query filter, and is reset to
``False`` for every processed vector at the end.
"""
import logging

import pinecone
import requests
from tqdm.auto import tqdm

PINECONE_API_KEY = "<>"
INDEX = "unsplash-25k-clip"

# Seconds to wait for a host before treating the link as broken. Without a
# timeout a single unresponsive server would stall the whole run.
REQUEST_TIMEOUT = 10
# Number of unchecked vectors fetched per query.
BATCH_SIZE = 100

# The root logger defaults to WARNING, which would silently drop the
# logging.info() progress messages below.
logging.basicConfig(level=logging.INFO)


def _is_reachable(url):
    """Return True if a GET request to *url* answers with HTTP 200.

    Connection errors and timeouts count as "not reachable" instead of
    aborting the run. The response is streamed and closed straight away
    because only the status code is needed, not the image body.
    """
    try:
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as res:
            return res.status_code == 200
    except requests.RequestException:
        return False


def _pick_good_url(metadata):
    """Return the name of the first metadata field holding a working link.

    Tries ``photo_url`` (with the download suffix appended) first, then
    ``photo_image_url``. Returns ``"not_found"`` when neither responds with
    HTTP 200. Raises ``KeyError`` if a probed field is missing.
    """
    if _is_reachable(metadata['photo_url'] + "/download?force=true&w=640"):
        return "photo_url"
    if _is_reachable(metadata['photo_image_url']):
        return "photo_image_url"
    return "not_found"


pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="us-west1-gcp"
)
index = pinecone.Index(INDEX)

# Fetch the index stats once and read both values from the same response.
stats = index.describe_index_stats()
dim = stats['dimension']
total = int(stats['totalVectorCount'])

# All-zero query vector: the query is only a way to page through vectors
# that match the metadata filter; the query values themselves are arbitrary.
xq = [0.0] * dim
# Not used in the visible part of this script; kept in case later code
# relies on it.
count = 0
ID_LIST = []

logging.info("Checking links...")
with tqdm(total=total) as pbar:
    while True:
        xc = index.query(
            xq, top_k=BATCH_SIZE, include_metadata=True,
            filter={"link_check": {"$ne": True}}
        )
        matches = xc['matches']
        if not matches:
            # Nothing left that has not been checked in this run.
            break
        # NOTE(review): if an update is not yet visible to the next query,
        # the same id can come back again and be re-checked -- confirm
        # whether that matters for ID_LIST and the progress bar.
        for match in matches:
            good_url = _pick_good_url(match['metadata'])
            index.update(match['id'], set_metadata={
                'good_url': good_url, 'link_check': True
            })
            ID_LIST.append(match['id'])
            pbar.update(1)

logging.info("Refreshing 'link_check' field...")
# Reset the temporary flag so a later run re-checks every vector.
for _id in tqdm(ID_LIST):
    index.update(_id, set_metadata={
        'link_check': False
    })