Spaces:
Sleeping
Sleeping
Auto commit at 23-2025-08 13:31:17
Browse files
lily_llm_api/app_v2_origin.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lily_llm_api/services/generation_service.py
CHANGED
@@ -358,24 +358,35 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
358 |
if 'vision_grid_thw' in combined_image_metas:
|
359 |
vision_grid = combined_image_metas['vision_grid_thw']
|
360 |
if isinstance(vision_grid, list):
|
361 |
-
# Kanana 모델 요구사항:
|
362 |
if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
|
363 |
-
# [(1, 34, 52)] -> (1, 34, 52) 텐서로 변환
|
364 |
t, h, w = vision_grid[0]
|
365 |
-
# π
|
366 |
-
processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
|
367 |
print(f"[DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
368 |
else:
|
369 |
-
# 다른 형태의 경우
|
370 |
-
processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
|
371 |
print(f"[DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
372 |
else:
|
373 |
-
|
|
|
|
|
|
|
|
|
374 |
|
375 |
-
# 다른
|
376 |
for key, value in combined_image_metas.items():
|
377 |
if key != 'vision_grid_thw':
|
378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
379 |
|
380 |
generate_kwargs = {
|
381 |
'input_ids': input_ids,
|
|
|
358 |
if 'vision_grid_thw' in combined_image_metas:
|
359 |
vision_grid = combined_image_metas['vision_grid_thw']
|
360 |
if isinstance(vision_grid, list):
|
361 |
+
# Kanana 모델 요구사항: 배치 차원을 맞춤
|
362 |
if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
|
363 |
+
# [(1, 34, 52)] -> (1, 1, 34, 52) 텐서로 변환 (배치 차원 추가)
|
364 |
t, h, w = vision_grid[0]
|
365 |
+
# 4차원 텐서로 변환: (batch_size, T, H, W) 형태
|
366 |
+
processed_image_metas['vision_grid_thw'] = torch.tensor([[[t, h, w]]], dtype=torch.long)
|
367 |
print(f"[DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
368 |
else:
|
369 |
+
# 다른 형태의 경우 배치 차원 추가
|
370 |
+
processed_image_metas['vision_grid_thw'] = torch.tensor([vision_grid], dtype=torch.long)
|
371 |
print(f"[DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
372 |
else:
|
373 |
+
# 텐서인 경우 배치 차원 확인 및 추가
|
374 |
+
if len(vision_grid.shape) == 3:
|
375 |
+
processed_image_metas['vision_grid_thw'] = vision_grid.unsqueeze(0)
|
376 |
+
else:
|
377 |
+
processed_image_metas['vision_grid_thw'] = vision_grid
|
378 |
|
379 |
+
# 다른 메타데이터도 배치 차원 맞춤
|
380 |
for key, value in combined_image_metas.items():
|
381 |
if key != 'vision_grid_thw':
|
382 |
+
if isinstance(value, list):
|
383 |
+
# 리스트인 경우 배치 차원 추가
|
384 |
+
processed_image_metas[key] = [value]
|
385 |
+
elif isinstance(value, torch.Tensor) and len(value.shape) == 2:
|
386 |
+
# 2차원 텐서인 경우 배치 차원 추가
|
387 |
+
processed_image_metas[key] = value.unsqueeze(0)
|
388 |
+
else:
|
389 |
+
processed_image_metas[key] = value
|
390 |
|
391 |
generate_kwargs = {
|
392 |
'input_ids': input_ids,
|