Spaces:
Runtime error
Runtime error
Commit
·
9a6e4f2
1
Parent(s):
f06e8ae
refactor: Update main.py with scheduled data refresh and background task
Browse files
main.py
CHANGED
@@ -1,10 +1,15 @@
|
|
|
|
|
|
1 |
import json
|
2 |
import logging
|
|
|
3 |
import sqlite3
|
4 |
from contextlib import asynccontextmanager
|
5 |
from typing import List
|
6 |
|
7 |
import numpy as np
|
|
|
|
|
8 |
from cashews import NOT_NONE, cache
|
9 |
from fastapi import FastAPI, HTTPException, Query
|
10 |
from pandas import Timestamp
|
@@ -13,6 +18,8 @@ from starlette.responses import RedirectResponse
|
|
13 |
|
14 |
from data_loader import refresh_data
|
15 |
|
|
|
|
|
16 |
cache.setup("mem://?check_interval=10&size=10000")
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
@@ -72,47 +79,91 @@ def serialize_numpy(obj):
|
|
72 |
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
|
73 |
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
@asynccontextmanager
|
76 |
async def lifespan(app: FastAPI):
|
77 |
setup_database()
|
78 |
-
logger.info("
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
c = conn.cursor()
|
84 |
-
c.executemany(
|
85 |
-
"""
|
86 |
-
INSERT OR REPLACE INTO datasets
|
87 |
-
(hub_id, likes, downloads, tags, created_at, last_modified, license, language, config_name, column_names, features)
|
88 |
-
VALUES (?, ?, ?, json(?), ?, ?, json(?), json(?), ?, json(?), json(?))
|
89 |
-
""",
|
90 |
-
[
|
91 |
-
(
|
92 |
-
data["hub_id"],
|
93 |
-
data.get("likes", 0),
|
94 |
-
data.get("downloads", 0),
|
95 |
-
json.dumps(data.get("tags", []), default=serialize_numpy),
|
96 |
-
int(data["created_at"].timestamp())
|
97 |
-
if isinstance(data["created_at"], Timestamp)
|
98 |
-
else data.get("created_at", 0),
|
99 |
-
int(data["last_modified"].timestamp())
|
100 |
-
if isinstance(data["last_modified"], Timestamp)
|
101 |
-
else data.get("last_modified", 0),
|
102 |
-
json.dumps(data.get("license", []), default=serialize_numpy),
|
103 |
-
json.dumps(data.get("language", []), default=serialize_numpy),
|
104 |
-
data.get("config_name", ""),
|
105 |
-
json.dumps(data.get("column_names", []), default=serialize_numpy),
|
106 |
-
json.dumps(data.get("features", []), default=serialize_numpy),
|
107 |
-
)
|
108 |
-
for data in datasets
|
109 |
-
],
|
110 |
-
)
|
111 |
-
conn.commit()
|
112 |
-
conn.close()
|
113 |
-
logger.info("Data refreshed")
|
114 |
yield
|
115 |
|
|
|
|
|
116 |
|
117 |
app = FastAPI(lifespan=lifespan)
|
118 |
|
|
|
1 |
+
import asyncio
|
2 |
+
import concurrent.futures
|
3 |
import json
|
4 |
import logging
|
5 |
+
import os
|
6 |
import sqlite3
|
7 |
from contextlib import asynccontextmanager
|
8 |
from typing import List
|
9 |
|
10 |
import numpy as np
|
11 |
+
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
12 |
+
from apscheduler.triggers.cron import CronTrigger
|
13 |
from cashews import NOT_NONE, cache
|
14 |
from fastapi import FastAPI, HTTPException, Query
|
15 |
from pandas import Timestamp
|
|
|
18 |
|
19 |
from data_loader import refresh_data
|
20 |
|
21 |
+
# Cron-trigger kwargs for the periodic refresh job (unpacked into CronTrigger).
# NOTE(review): UPDATE_INTERVAL_HOURS is used verbatim as the cron "hour"
# field (default "*/6" = every 6 hours), not as a numeric hour count —
# confirm deployments set it as a cron expression.
UPDATE_SCHEDULE = {"hour": os.getenv("UPDATE_INTERVAL_HOURS", "*/6")}
|
22 |
+
|
23 |
cache.setup("mem://?check_interval=10&size=10000")
|
24 |
logger = logging.getLogger(__name__)
|
25 |
|
|
|
79 |
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
|
80 |
|
81 |
|
82 |
+
def background_refresh_data():
    """Run the blocking ``refresh_data`` and return its result.

    Intended to be executed in a worker thread from ``update_database``.
    Errors are logged and swallowed so a failed refresh never propagates
    into the scheduler; callers must treat a ``None`` return as
    "refresh failed, keep the existing data".

    Returns:
        Whatever ``refresh_data`` returns, or ``None`` on any exception.
    """
    logger.info("Starting background data refresh")
    try:
        return refresh_data()
    except Exception:
        # logger.exception records the full traceback; the previous
        # logger.error(f"...{str(e)}") dropped it, making failures
        # inside refresh_data hard to diagnose.
        logger.exception("Error in background data refresh")
        return None
|
89 |
+
|
90 |
+
|
91 |
+
def _to_epoch(value):
    """Coerce a pandas Timestamp to unix seconds; pass other values through."""
    return int(value.timestamp()) if isinstance(value, Timestamp) else value


async def update_database():
    """Refresh the dataset cache off-thread and upsert rows into SQLite.

    ``refresh_data`` is blocking, so it is dispatched to the event loop's
    default thread-pool executor; the loop stays responsive while it runs.
    If the refresh fails (``None`` result) the existing database contents
    are left untouched.
    """
    logger.info("Starting scheduled data refresh")

    # Run the blocking refresh in the default executor. The previous code
    # submitted to a private ThreadPoolExecutor and then awaited
    # ``future.result`` via a second executor: exiting the inner ``with``
    # block joins the pool synchronously (blocking the event loop), and
    # ``future.cancel()`` on an already-running future is a no-op. One
    # run_in_executor call avoids both problems.
    try:
        datasets = await asyncio.get_running_loop().run_in_executor(
            None, background_refresh_data
        )
    except asyncio.CancelledError:
        logger.info("Data refresh cancelled")
        return

    if datasets is None:
        logger.error("Data refresh failed, skipping database update")
        return

    conn = get_db_connection()
    try:
        c = conn.cursor()
        c.executemany(
            """
            INSERT OR REPLACE INTO datasets
            (hub_id, likes, downloads, tags, created_at, last_modified, license, language, config_name, column_names, features)
            VALUES (?, ?, ?, json(?), ?, ?, json(?), json(?), ?, json(?), json(?))
            """,
            [
                (
                    data["hub_id"],
                    data.get("likes", 0),
                    data.get("downloads", 0),
                    json.dumps(data.get("tags", []), default=serialize_numpy),
                    # .get(..., 0) so a missing key falls back to 0; the old
                    # data["created_at"] lookup raised KeyError first, making
                    # its fallback unreachable.
                    _to_epoch(data.get("created_at", 0)),
                    _to_epoch(data.get("last_modified", 0)),
                    json.dumps(data.get("license", []), default=serialize_numpy),
                    json.dumps(data.get("language", []), default=serialize_numpy),
                    data.get("config_name", ""),
                    json.dumps(data.get("column_names", []), default=serialize_numpy),
                    json.dumps(data.get("features", []), default=serialize_numpy),
                )
                for data in datasets
            ],
        )
        conn.commit()
        logger.info("Scheduled data refresh completed")
    except Exception as e:
        logger.error(f"Error during database update: {str(e)}")
        conn.rollback()
    finally:
        # Close the connection on success, failure, or cancellation.
        conn.close()
|
149 |
+
|
150 |
+
|
151 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: init the DB, run an initial refresh, start the scheduler.

    The cron scheduler keeps the dataset table current while the app runs.
    It is shut down on exit even if the application body raises, so no
    scheduler jobs outlive the server.
    """
    setup_database()
    logger.info("Performing initial data refresh")
    # NOTE(review): startup blocks until the first refresh completes — the
    # app does not begin serving until data is loaded. Confirm this is the
    # intended trade-off versus serving stale/empty data immediately.
    await update_database()

    # Schedule periodic refreshes; cron fields come from UPDATE_SCHEDULE.
    scheduler = AsyncIOScheduler()
    scheduler.add_job(update_database, CronTrigger(**UPDATE_SCHEDULE))
    scheduler.start()
    try:
        yield
    finally:
        # Previously shutdown was skipped when the app body raised,
        # leaving the scheduler running; finally guarantees cleanup.
        scheduler.shutdown()
|
166 |
+
|
167 |
|
168 |
app = FastAPI(lifespan=lifespan)
|
169 |
|