# Commit 5c271a3 (subashpoudel): Implemented logging and try catch
import ast
import faiss
import logging
import numpy as np
import pandas as pd
from datasets import load_dataset
# Named logger used below for informational progress messages.
# NOTE(review): no handlers/levels are configured in this module — presumably
# that happens elsewhere in the application; confirm.
app_logger = logging.getLogger("app_logger")
# Named logger used below for reporting failures during dataset/index loading.
error_logger = logging.getLogger("error_logger")
class DataLoader:
    """Fetch two Hugging Face datasets and build FAISS indexes over them.

    Both datasets are downloaded when the instance is created. The vector
    indexes are only built when ``load_caption`` or ``load_imdb_ideas`` is
    called.
    """

    def __init__(self):
        """Load both datasets at their ``openai-embeddings`` revision."""
        self.caption_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-dataset-v1",
            revision="openai-embeddings",
        )
        self.ideas_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-imdb-ideas-v1",
            revision="openai-embeddings",
        )

    @staticmethod
    def _parse_embedding(value):
        """Return *value* parsed as a Python literal if it is a string.

        Non-string values are passed through unchanged.
        """
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value

    def _load_vector_index(self, dataset):
        """Build an inner-product FAISS index from ``dataset["train"]``.

        Args:
            dataset: Mapping-like dataset object with a ``"train"`` split that
                exposes ``to_pandas()`` and has an ``embeddings`` column.

        Returns:
            A ``(frame, matrix, index)`` tuple: the split as a DataFrame, the
            stacked float32 embedding matrix (L2-normalised in place), and a
            ``faiss.IndexFlatIP`` populated with that matrix.
        """
        frame = dataset["train"].to_pandas()
        # Embeddings may be stored as string literals; decode those, keep the rest.
        frame["embeddings"] = frame["embeddings"].apply(self._parse_embedding)

        stacked = np.vstack(frame["embeddings"].values)
        matrix = stacked.astype("float32")

        # Normalising first makes the inner product equal cosine similarity.
        faiss.normalize_L2(matrix)
        dimension = matrix.shape[1]
        vector_index = faiss.IndexFlatIP(dimension)
        vector_index.add(matrix)
        return frame, matrix, vector_index

    def load_caption(self):
        """Return ``(frame, matrix, index)`` for the caption dataset."""
        return self._load_vector_index(self.caption_dataset)

    def load_imdb_ideas(self):
        """Return ``(frame, matrix, index)`` for the IMDb ideas dataset."""
        return self._load_vector_index(self.ideas_dataset)
# Download the datasets at import time; a failure here is fatal for the
# module, so it is logged and then propagated to the importer.
try:
    data_loader = DataLoader()
    app_logger.info('Dataset loaded from Hugging Face.')
except Exception as e:
    # The message needs a %s placeholder: passing `e` as an extra argument
    # without one makes logging fail to format the record and drop it.
    # `exception` also records the traceback.
    error_logger.exception('Unable to load dataset: %s', e)
    raise
# Build the FAISS indexes for both datasets and expose them as module globals.
try:
    caption_df, caption_embeddings, caption_index = data_loader.load_caption()
    ideas_df, ideas_embeddings, ideas_index = data_loader.load_imdb_ideas()
    app_logger.info('Loaded the embeddings.')
except Exception as e:
    # NOTE(review): unlike the dataset-loading handler, this one does not
    # re-raise, so on failure the names above stay undefined and any later use
    # raises NameError. Confirm whether swallowing the error is intentional.
    # The message needs a %s placeholder: passing `e` as an extra argument
    # without one makes logging fail to format the record and drop it.
    error_logger.exception('Unable to load the embeddings: %s', e)