Spaces:
Runtime error
Runtime error
Canstralian
committed on
Commit
•
b044f34
1
Parent(s):
3526e73
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, Query
|
2 |
+
from datasets import load_dataset
|
3 |
+
from typing import List
|
4 |
+
|
5 |
+
# ASGI application object; the route decorators in this file register on it.
app = FastAPI()

# Open the dataset in streaming mode for memory efficiency: examples are
# iterated on demand instead of being loaded up front.
dataset = load_dataset(
    "togethercomputer/RedPajama-Data-1T",
    streaming=True,
)
|
9 |
+
|
10 |
+
@app.get("/")
def greet_json():
    """Return the welcome payload served at the API root."""
    welcome = {"message": "Welcome to the RedPajama Dataset API"}
    return welcome
|
13 |
+
|
14 |
+
@app.get("/get_data/")
def get_data(chunk_size: int = 10):
    """Return the first ``chunk_size`` examples of the ``train`` split.

    Parameters:
    - chunk_size: The number of examples to return (default: 10). Zero or a
      negative value yields an empty list.

    Returns:
    - A dict ``{"data": [...]}`` holding at most ``chunk_size`` examples, in
      the order the dataset yields them.
    """
    from itertools import islice

    # The previous loop only stopped when the running count *equalled*
    # chunk_size, so chunk_size <= 0 never matched and the handler kept
    # appending examples until the stream was exhausted. Clamping to zero and
    # letting islice bound the iteration guarantees termination.
    limit = max(chunk_size, 0)
    examples = dataset["train"]  # Adjust split if needed
    return {"data": list(islice(examples, limit))}
|
31 |
+
|
32 |
+
@app.get("/search_data/")
def search_data(keyword: str, max_results: int = 10):
    """Scan the ``train`` split for examples that mention ``keyword``.

    Matching is a case-insensitive substring test against ``str(example)``,
    i.e. the string form of the whole example, not only selected fields.

    Parameters:
    - keyword: The keyword to search for.
    - max_results: The maximum number of results to return (default: 10).
      Zero or a negative value yields an empty list.

    Returns:
    - A dict ``{"results": [...]}`` with at most ``max_results`` matching
      examples, in the order the dataset yields them.
    """
    # The previous loop only stopped when len(results) == max_results, which
    # can never be true for max_results <= 0 once a match has been appended;
    # the scan then ran over the whole stream. Bail out before iterating.
    if max_results <= 0:
        return {"results": []}

    # Lowercase the keyword once instead of once per example.
    needle = keyword.lower()

    results = []
    for example in dataset["train"]:  # Adjust split if needed
        if needle in str(example).lower():
            results.append(example)
            if len(results) >= max_results:
                break
    return {"results": results}
|
51 |
+
|
52 |
+
@app.get("/data_summary/")
def data_summary():
    """Provide a basic summary of the dataset.

    Returns:
    - A dict ``{"dataset_splits": [...]}`` listing the keys of the loaded
      dataset object (e.g. the split names).
    """
    # dataset.keys() is a dict view, which FastAPI's JSON encoder does not
    # treat as a sequence; returning it directly fails during response
    # serialization. Materialize it as a plain list.
    return {"dataset_splits": list(dataset.keys())}
|