# Spaces:
# Runtime error
# Runtime error
import logging
import os

import pinecone
import requests
from tqdm.auto import tqdm
# We run this script to check the image links stored in the index for broken
# URLs. Each vector's metadata gets a 'good_url' field naming the metadata key
# whose URL answered with HTTP 200, or "not_found" when neither did.

# Without explicit configuration the root logger stays at WARNING and the
# logging.info() progress messages below would be silently dropped.
logging.basicConfig(level=logging.INFO)

# Prefer the environment so a real key never has to be written into the file;
# the placeholder remains the fallback when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "<<API_KEY_HERE>>")
INDEX = "unsplash-25k-clip"
# Seconds to wait on a single HTTP request before treating the link as dead.
REQUEST_TIMEOUT = 10


def _url_is_live(url):
    """Return True when a GET request to *url* answers with HTTP 200.

    Timeouts, connection errors and any other ``requests`` failure count as
    a dead link rather than aborting the whole scan.
    """
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        logging.warning("Request to %s failed: %s", url, err)
        return False
    return res.status_code == 200


pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="us-west1-gcp"
)
index = pinecone.Index(INDEX)

# Fetch the stats once and read both values from the same snapshot.
stats = index.describe_index_stats()
dim = stats['dimension']
total = int(stats['totalVectorCount'])

# All-zero query vector: the metadata filter below, not similarity, decides
# which records come back.
xq = [0.0] * dim
ID_LIST = []

logging.info("Checking links...")
with tqdm(total=total) as pbar:
    while True:
        # Ask only for records that have not been marked as checked yet.
        xc = index.query(
            xq, top_k=100, include_metadata=True,
            filter={"link_check": {"$ne": True}}
        )
        matches = xc['matches']
        if not matches:
            # Nothing left unmarked: every record has been visited.
            break
        for match in matches:
            metadata = match['metadata']
            # Try the download endpoint first, then fall back to the direct
            # image URL; the fallback key is only read when it is needed.
            if _url_is_live(
                metadata['photo_url'] + "/download?force=true&w=640"
            ):
                good_url = "photo_url"
            elif _url_is_live(metadata['photo_image_url']):
                good_url = "photo_image_url"
            else:
                good_url = "not_found"
            # 'link_check' marks the record as visited so the filtered query
            # above stops returning it.
            index.update(match['id'], set_metadata={
                'good_url': good_url,
                'link_check': True
            })
            ID_LIST.append(match['id'])
            pbar.update(1)

# Reset the marker on every visited record so that a later run of this
# script starts from a clean state.
logging.info("Refreshing 'link_check' field...")
for _id in tqdm(ID_LIST):
    index.update(_id, set_metadata={
        'link_check': False
    })