yangdx
commited on
Commit
·
38074de
1
Parent(s):
a940648
Fix race condition for health_check and ensure_workers
Browse files- lightrag/utils.py +44 -57
lightrag/utils.py
CHANGED
|
@@ -289,9 +289,10 @@ def priority_limit_async_func_call(max_size: int, max_queue_size: int = 1000):
|
|
| 289 |
def final_decro(func):
|
| 290 |
queue = asyncio.PriorityQueue(maxsize=max_queue_size)
|
| 291 |
tasks = set()
|
| 292 |
-
|
| 293 |
counter = 0
|
| 294 |
shutdown_event = asyncio.Event()
|
|
|
|
| 295 |
worker_health_check_task = None
|
| 296 |
|
| 297 |
# Track active future objects for cleanup
|
|
@@ -352,76 +353,62 @@ def priority_limit_async_func_call(max_size: int, max_queue_size: int = 1000):
|
|
| 352 |
while not shutdown_event.is_set():
|
| 353 |
await asyncio.sleep(5) # Check every 5 seconds
|
| 354 |
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
except Exception as e:
|
| 371 |
logger.error(f"limit_async: Error in health check: {str(e)}")
|
| 372 |
finally:
|
| 373 |
logger.warning("limit_async: Health check task exiting")
|
| 374 |
|
| 375 |
-
# Ensure worker tasks are started
|
| 376 |
async def ensure_workers():
|
| 377 |
-
"""Ensure worker
|
| 378 |
-
nonlocal tasks, worker_health_check_task
|
| 379 |
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
lock_acquired = await asyncio.wait_for(lock.acquire(), timeout=5.0)
|
| 386 |
-
except asyncio.TimeoutError:
|
| 387 |
-
logger.error(
|
| 388 |
-
"limit_async: Timeout acquiring lock in ensure_workers"
|
| 389 |
-
)
|
| 390 |
-
# Even if acquiring the lock times out, continue trying to create workers
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
if (
|
| 395 |
-
worker_health_check_task is None
|
| 396 |
-
or worker_health_check_task.done()
|
| 397 |
-
):
|
| 398 |
-
worker_health_check_task = asyncio.create_task(health_check())
|
| 399 |
|
| 400 |
-
|
| 401 |
-
|
|
|
|
| 402 |
|
| 403 |
-
|
| 404 |
-
active_tasks_count = len(tasks)
|
| 405 |
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
if workers_needed > 0:
|
| 409 |
-
for _ in range(workers_needed):
|
| 410 |
-
task = asyncio.create_task(worker())
|
| 411 |
-
tasks.add(task)
|
| 412 |
-
task.add_done_callback(tasks.discard)
|
| 413 |
-
finally:
|
| 414 |
-
# Ensure the lock is released
|
| 415 |
-
if lock_acquired:
|
| 416 |
-
lock.release()
|
| 417 |
-
except Exception as e:
|
| 418 |
-
logger.error(f"limit_async: Error in ensure_workers: {str(e)}")
|
| 419 |
-
# Even if an exception occurs, try to create at least one worker
|
| 420 |
-
if not any(not t.done() for t in tasks):
|
| 421 |
task = asyncio.create_task(worker())
|
| 422 |
tasks.add(task)
|
| 423 |
task.add_done_callback(tasks.discard)
|
| 424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
async def shutdown():
|
| 426 |
"""Gracefully shut down all workers and the queue"""
|
| 427 |
logger.info("limit_async: Shutting down priority queue workers")
|
|
@@ -480,7 +467,7 @@ def priority_limit_async_func_call(max_size: int, max_queue_size: int = 1000):
|
|
| 480 |
QueueFullError: If the queue is full and waiting times out
|
| 481 |
Any exception raised by the decorated function
|
| 482 |
"""
|
| 483 |
-
# Ensure
|
| 484 |
await ensure_workers()
|
| 485 |
|
| 486 |
# Create a future for the result
|
|
@@ -488,7 +475,7 @@ def priority_limit_async_func_call(max_size: int, max_queue_size: int = 1000):
|
|
| 488 |
active_futures.add(future)
|
| 489 |
|
| 490 |
nonlocal counter
|
| 491 |
-
async with
|
| 492 |
current_count = counter
|
| 493 |
counter += 1
|
| 494 |
|
|
|
|
| 289 |
def final_decro(func):
|
| 290 |
queue = asyncio.PriorityQueue(maxsize=max_queue_size)
|
| 291 |
tasks = set()
|
| 292 |
+
initialization_lock = asyncio.Lock()
|
| 293 |
counter = 0
|
| 294 |
shutdown_event = asyncio.Event()
|
| 295 |
+
initialized = False # Global initialization flag
|
| 296 |
worker_health_check_task = None
|
| 297 |
|
| 298 |
# Track active future objects for cleanup
|
|
|
|
| 353 |
while not shutdown_event.is_set():
|
| 354 |
await asyncio.sleep(5) # Check every 5 seconds
|
| 355 |
|
| 356 |
+
# No longer acquire lock, directly operate on task set
|
| 357 |
+
# Use a copy of the task set to avoid concurrent modification
|
| 358 |
+
current_tasks = set(tasks)
|
| 359 |
+
done_tasks = {t for t in current_tasks if t.done()}
|
| 360 |
+
tasks.difference_update(done_tasks)
|
| 361 |
|
| 362 |
+
# Calculate active tasks count
|
| 363 |
+
active_tasks_count = len(tasks)
|
| 364 |
+
workers_needed = max_size - active_tasks_count
|
| 365 |
+
|
| 366 |
+
if workers_needed > 0:
|
| 367 |
+
logger.info(
|
| 368 |
+
f"limit_async: Creating {workers_needed} new workers"
|
| 369 |
+
)
|
| 370 |
+
new_tasks = set()
|
| 371 |
+
for _ in range(workers_needed):
|
| 372 |
+
task = asyncio.create_task(worker())
|
| 373 |
+
new_tasks.add(task)
|
| 374 |
+
task.add_done_callback(tasks.discard)
|
| 375 |
+
# Update task set in one operation
|
| 376 |
+
tasks.update(new_tasks)
|
| 377 |
except Exception as e:
|
| 378 |
logger.error(f"limit_async: Error in health check: {str(e)}")
|
| 379 |
finally:
|
| 380 |
logger.warning("limit_async: Health check task exiting")
|
| 381 |
|
|
|
|
| 382 |
async def ensure_workers():
|
| 383 |
+
"""Ensure worker threads and health check system are available
|
|
|
|
| 384 |
|
| 385 |
+
This function checks if the worker system is already initialized.
|
| 386 |
+
If not, it performs a one-time initialization of all worker threads
|
| 387 |
+
and starts the health check system.
|
| 388 |
+
"""
|
| 389 |
+
nonlocal initialized, worker_health_check_task, tasks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
|
| 391 |
+
if initialized:
|
| 392 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
+
async with initialization_lock:
|
| 395 |
+
if initialized:
|
| 396 |
+
return
|
| 397 |
|
| 398 |
+
logger.info("limit_async: Initializing worker system")
|
|
|
|
| 399 |
|
| 400 |
+
# Create initial worker tasks
|
| 401 |
+
for _ in range(max_size):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
task = asyncio.create_task(worker())
|
| 403 |
tasks.add(task)
|
| 404 |
task.add_done_callback(tasks.discard)
|
| 405 |
|
| 406 |
+
# Start health check
|
| 407 |
+
worker_health_check_task = asyncio.create_task(health_check())
|
| 408 |
+
|
| 409 |
+
initialized = True
|
| 410 |
+
logger.info("limit_async: Worker system initialized")
|
| 411 |
+
|
| 412 |
async def shutdown():
|
| 413 |
"""Gracefully shut down all workers and the queue"""
|
| 414 |
logger.info("limit_async: Shutting down priority queue workers")
|
|
|
|
| 467 |
QueueFullError: If the queue is full and waiting times out
|
| 468 |
Any exception raised by the decorated function
|
| 469 |
"""
|
| 470 |
+
# Ensure worker system is initialized
|
| 471 |
await ensure_workers()
|
| 472 |
|
| 473 |
# Create a future for the result
|
|
|
|
| 475 |
active_futures.add(future)
|
| 476 |
|
| 477 |
nonlocal counter
|
| 478 |
+
async with initialization_lock:
|
| 479 |
current_count = counter
|
| 480 |
counter += 1
|
| 481 |
|