"""Small FastAPI service exposing samples of the streamed RedPajama dataset."""

from itertools import islice
from typing import List

from datasets import load_dataset
from fastapi import FastAPI, Query

app = FastAPI()

# Load the dataset in streaming mode for memory efficiency
dataset = load_dataset("togethercomputer/RedPajama-Data-1T", streaming=True)


@app.get("/")
def greet_json():
    """Return a static welcome message."""
    return {"message": "Welcome to the RedPajama Dataset API"}


@app.get("/get_data/")
def get_data(chunk_size: int = 10):
    """
    Returns a small chunk of the dataset.

    Parameters:
    - chunk_size: The number of examples to return (default: 10).
      Values of zero or less yield an empty list.

    Returns:
    - A dict whose "data" key holds the first `chunk_size` examples of the
      "train" split.
    """
    # A non-positive size previously never satisfied the stop condition, so
    # the request consumed the entire stream. Answer with nothing instead.
    if chunk_size <= 0:
        return {"data": []}

    # islice stops pulling from the stream as soon as enough examples exist.
    return {"data": list(islice(dataset["train"], chunk_size))}  # Adjust split if needed


@app.get("/search_data/")
def search_data(keyword: str, max_results: int = 10):
    """
    Searches the dataset for a specific keyword in the text fields.

    The match is a case-insensitive substring test against the string form
    of each whole example.

    Parameters:
    - keyword: The keyword to search for.
    - max_results: The maximum number of results to return (default: 10).
      Values of zero or less yield an empty list.

    Returns:
    - A dict whose "results" key holds the matching examples, in stream order.
    """
    # Same guard as get_data: a non-positive limit could never be reached and
    # would have scanned (and collected matches from) the whole dataset.
    if max_results <= 0:
        return {"results": []}

    needle = keyword.lower()  # loop-invariant, so lowered once up front
    matches = (
        example
        for example in dataset["train"]  # Adjust split if needed
        if needle in str(example).lower()
    )
    # NOTE(review): if fewer than max_results examples match, this still scans
    # the stream to its end; consider a scan cap if that becomes a problem.
    return {"results": list(islice(matches, max_results))}


@app.get("/data_summary/")
def data_summary():
    """
    Provides a basic summary of the dataset.

    Returns:
    - A dictionary with dataset details (the list of split names).
    """
    # A dict_keys view is not a type the JSON encoder handles as a sequence;
    # materialize it as a plain list so the response can be serialized.
    return {"dataset_splits": list(dataset.keys())}