Spaces:
Sleeping
Sleeping
| import ast | |
| import faiss | |
| import logging | |
| import numpy as np | |
| import pandas as pd | |
| from datasets import load_dataset | |
# Named loggers used by this module: ``app_logger`` receives progress
# messages and ``error_logger`` receives failure reports.
app_logger, error_logger = (
    logging.getLogger(logger_name)
    for logger_name in ("app_logger", "error_logger")
)
class DataLoader:
    """Load the caption and IMDB-ideas datasets and index their embeddings.

    Both datasets are obtained through ``load_dataset`` when the instance is
    created, each pinned to the ``openai-embeddings`` revision. The
    ``load_*`` methods turn a dataset into a DataFrame, an embedding matrix
    and a FAISS index over that matrix.
    """

    def __init__(self):
        self.caption_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-dataset-v1",
            revision="openai-embeddings",
        )
        self.ideas_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-imdb-ideas-v1",
            revision="openai-embeddings",
        )

    def _load_vector_index(self, dataset):
        """Build a FAISS inner-product index from ``dataset["train"]``.

        Args:
            dataset: Mapping-like dataset whose ``"train"`` split exposes
                ``to_pandas()`` and contains an ``embeddings`` column. Each
                cell is either a vector or the string form of a Python
                literal describing one.

        Returns:
            A ``(df, embeddings, index)`` tuple: the split as a DataFrame
            (with string embeddings parsed), the float32 embedding matrix
            after in-place L2 normalisation, and a ``faiss.IndexFlatIP``
            populated with that matrix.
        """

        def _as_vector(cell):
            # Embeddings stored as text are parsed back into Python objects;
            # anything that is already non-string is passed through.
            if isinstance(cell, str):
                return ast.literal_eval(cell)
            return cell

        frame = dataset["train"].to_pandas()
        frame["embeddings"] = frame["embeddings"].apply(_as_vector)

        # One row per record, stored as float32.
        matrix = np.vstack(frame["embeddings"].values).astype("float32")

        # Normalising in place makes the inner product computed by
        # IndexFlatIP equal to cosine similarity.
        faiss.normalize_L2(matrix)
        vector_index = faiss.IndexFlatIP(matrix.shape[1])
        vector_index.add(matrix)

        return frame, matrix, vector_index

    def load_caption(self):
        """Return ``(df, embeddings, index)`` for the caption dataset."""
        return self._load_vector_index(self.caption_dataset)

    def load_imdb_ideas(self):
        """Return ``(df, embeddings, index)`` for the IMDB-ideas dataset."""
        return self._load_vector_index(self.ideas_dataset)
# Import-time setup: fetch the datasets, then build the DataFrames, embedding
# matrices and FAISS indexes that the rest of the application reads from this
# module's namespace.
try:
    data_loader = DataLoader()
    app_logger.info('Dataset loaded from Hugging Face.')
except Exception as e:
    # The exception is passed as a %-style argument. Without a placeholder in
    # the message, logging cannot format the record and the error text is lost.
    error_logger.error('Unable to load dataset: %s', e)
    raise

try:
    caption_df, caption_embeddings, caption_index = data_loader.load_caption()
    ideas_df, ideas_embeddings, ideas_index = data_loader.load_imdb_ideas()
    app_logger.info('Loaded the embeddings.')
except Exception as e:
    # NOTE(review): unlike the dataset load above, this failure is not
    # re-raised in the lines visible here, so the names assigned in the
    # ``try`` body may be left unbound. Confirm whether that is intentional.
    error_logger.error('Unable to load the embeddings: %s', e)