Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,708 Bytes
3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f 4d0e134 3e2784f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import asyncio
import logging
import chromadb
import requests
import stamina
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from huggingface_hub import InferenceClient
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map
from prep_viewer_data import prep_data
from utils import get_chroma_client
# Set up logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
EMBEDDING_MODEL_NAME = "davanstrien/dataset-viewer-descriptions-processed-st"
EMBEDDING_MODEL_REVISION = "d09abf1227ac41c6955eb9dd53c21771b0984ade"
INFERENCE_MODEL_URL = (
"https://bm143rfir2on1bkw.us-east-1.aws.endpoints.huggingface.cloud"
)
def initialize_clients():
logger.info("Initializing clients")
chroma_client = get_chroma_client()
inference_client = InferenceClient(
INFERENCE_MODEL_URL,
)
return chroma_client, inference_client
def create_collection(chroma_client):
logger.info("Creating or getting collection")
embedding_function = SentenceTransformerEmbeddingFunction(
model_name=EMBEDDING_MODEL_NAME,
trust_remote_code=True,
revision=EMBEDDING_MODEL_REVISION,
)
logger.info(f"Embedding function: {embedding_function}")
logger.info(f"Embedding model name: {EMBEDDING_MODEL_NAME}")
logger.info(f"Embedding model revision: {EMBEDDING_MODEL_REVISION}")
return chroma_client.create_collection(
name="dataset-viewer-descriptions",
get_or_create=True,
embedding_function=embedding_function,
metadata={"hnsw:space": "cosine"},
)
@stamina.retry(on=requests.HTTPError, attempts=3, wait_initial=10)
def embed_card(text, client):
text = text[:8192]
return client.feature_extraction(text)
def embed_and_upsert_datasets(
dataset_rows_and_ids: list[dict[str, str]],
collection: chromadb.Collection,
inference_client: InferenceClient,
batch_size: int = 10,
):
logger.info(
f"Embedding and upserting {len(dataset_rows_and_ids)} datasets for viewer data"
)
for i in tqdm(range(0, len(dataset_rows_and_ids), batch_size)):
batch = dataset_rows_and_ids[i : i + batch_size]
ids = []
documents = []
for item in batch:
ids.append(item["dataset_id"])
documents.append(f"HUB_DATASET_PREVIEW: {item['formatted_prompt']}")
results = thread_map(
lambda doc: embed_card(doc, inference_client), documents, leave=False
)
logger.info(f"Results: {len(results)}")
collection.upsert(
ids=ids,
embeddings=[embedding.tolist()[0] for embedding in results],
)
logger.debug(f"Processed batch {i//batch_size + 1}")
async def refresh_viewer_data(sample_size=200_000, min_likes=2):
logger.info(
f"Refreshing viewer data with sample_size={sample_size} and min_likes={min_likes}"
)
chroma_client, inference_client = initialize_clients()
collection = create_collection(chroma_client)
logger.info("Collection created successfully")
logger.info("Preparing data")
df = await prep_data(sample_size=sample_size, min_likes=min_likes)
df.write_parquet("viewer_data.parquet")
if df is not None:
logger.info("Data prepared successfully")
logger.info(f"Data: {df}")
dataset_rows_and_ids = df.to_dicts()
logger.info(f"Embedding and upserting {len(dataset_rows_and_ids)} datasets")
embed_and_upsert_datasets(dataset_rows_and_ids, collection, inference_client)
logger.info("Refresh completed successfully")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
asyncio.run(refresh_viewer_data())
|