# advanced-rag / hugging_face.py
# (Hugging Face file-viewer residue: author bstraehle, commit 39d8524
#  "Update hugging_face.py", verified, 1.13 kB)
import pandas as pd
from datasets import load_dataset
def _null_to_none(value):
    """Return *value* with pandas-null entries (None/NaN/NaT) replaced by None.

    Lists are cleaned element-wise (one level deep); any other value is
    treated as a scalar.
    """
    if isinstance(value, list):
        return [None if pd.isnull(item) else item for item in value]
    return None if pd.isnull(value) else value


def get_listings(limit=100):
    """Load Airbnb listings from the Hugging Face Hub as plain dicts.

    Streams the first ``limit`` rows of the ``train`` split of the
    ``MongoDB/airbnb_embeddings`` dataset, replaces pandas-null values with
    ``None``, builds a ``Listing`` from every row and converts it back to a
    dict.

    Args:
        limit: Number of rows to take from the stream. Defaults to 100.

    Returns:
        A list with one dict per listing, or ``None`` if building a
        ``Listing`` raised ``ValidationError`` (the error is printed).
    """
    # NOTE(review): `Listing` and `ValidationError` are not imported in the
    # visible part of this file -- presumably a Pydantic model and pydantic's
    # ValidationError; confirm they are in scope before relying on this.
    dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")
    dataset_df = pd.DataFrame(dataset.take(limit))
    print("Columns:", dataset_df.columns)

    # Replace NaN/NaT placeholders with None before handing rows to Listing.
    records = [
        {key: _null_to_none(value) for key, value in record.items()}
        for record in dataset_df.to_dict(orient="records")
    ]

    # Keep the try body limited to the call that can raise ValidationError.
    try:
        # Convert each dictionary to a Listing instance and back to a dict.
        listings = [Listing(**record).dict() for record in records]
    except ValidationError as e:
        print(e)
        return None

    # Get an overview of a single datapoint (skipped when nothing was loaded).
    if listings:
        print(listings[0].keys())
    return listings