gbrabbit committed on
Commit
cbf7778
Β·
1 Parent(s): 84635f1

Auto commit at 23-2025-08 13:31:17

Browse files
lily_llm_api/app_v2_origin.py ADDED
The diff for this file is too large to render. See raw diff
 
lily_llm_api/services/generation_service.py CHANGED
@@ -358,24 +358,35 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
358
  if 'vision_grid_thw' in combined_image_metas:
359
  vision_grid = combined_image_metas['vision_grid_thw']
360
  if isinstance(vision_grid, list):
361
- # πŸ”„ Kanana λͺ¨λΈ μš”κ΅¬μ‚¬ν•­: (T, H, W) ν˜•νƒœμ˜ 3차원 ν…μ„œ
362
  if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
363
- # [(1, 34, 52)] -> (1, 34, 52) ν…μ„œλ‘œ λ³€ν™˜
364
  t, h, w = vision_grid[0]
365
- # πŸ”„ 3차원 ν…μ„œλ‘œ λ³€ν™˜: (1, H, W) ν˜•νƒœ
366
- processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
367
  print(f"πŸ” [DEBUG] vision_grid_thw ν…μ„œ λ³€ν™˜: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
368
  else:
369
- # πŸ”„ λ‹€λ₯Έ ν˜•νƒœμ˜ 경우 원본 μœ μ§€
370
- processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
371
  print(f"πŸ” [DEBUG] vision_grid_thw ν…μ„œ λ³€ν™˜ (κΈ°λ³Έ): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
372
  else:
373
- processed_image_metas['vision_grid_thw'] = vision_grid
 
 
 
 
374
 
375
- # πŸ”„ λ‹€λ₯Έ λ©”νƒ€λ°μ΄ν„°λŠ” κ·ΈλŒ€λ‘œ μœ μ§€
376
  for key, value in combined_image_metas.items():
377
  if key != 'vision_grid_thw':
378
- processed_image_metas[key] = value
 
 
 
 
 
 
 
379
 
380
  generate_kwargs = {
381
  'input_ids': input_ids,
 
358
  if 'vision_grid_thw' in combined_image_metas:
359
  vision_grid = combined_image_metas['vision_grid_thw']
360
  if isinstance(vision_grid, list):
361
+ # πŸ”„ Kanana λͺ¨λΈ μš”κ΅¬μ‚¬ν•­: 배치 차원을 맞좀
362
  if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
363
+ # [(1, 34, 52)] -> (1, 1, 34, 52) ν…μ„œλ‘œ λ³€ν™˜ (배치 차원 μΆ”κ°€)
364
  t, h, w = vision_grid[0]
365
+ # πŸ”„ 4차원 ν…μ„œλ‘œ λ³€ν™˜: (batch_size, T, H, W) ν˜•νƒœ
366
+ processed_image_metas['vision_grid_thw'] = torch.tensor([[[t, h, w]]], dtype=torch.long)
367
  print(f"πŸ” [DEBUG] vision_grid_thw ν…μ„œ λ³€ν™˜: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
368
  else:
369
+ # πŸ”„ λ‹€λ₯Έ ν˜•νƒœμ˜ 경우 배치 차원 μΆ”κ°€
370
+ processed_image_metas['vision_grid_thw'] = torch.tensor([vision_grid], dtype=torch.long)
371
  print(f"πŸ” [DEBUG] vision_grid_thw ν…μ„œ λ³€ν™˜ (κΈ°λ³Έ): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
372
  else:
373
+ # ν…μ„œμΈ 경우 배치 차원 확인 및 μΆ”κ°€
374
+ if len(vision_grid.shape) == 3:
375
+ processed_image_metas['vision_grid_thw'] = vision_grid.unsqueeze(0)
376
+ else:
377
+ processed_image_metas['vision_grid_thw'] = vision_grid
378
 
379
+ # πŸ”„ λ‹€λ₯Έ 메타데이터도 배치 차원 맞좀
380
  for key, value in combined_image_metas.items():
381
  if key != 'vision_grid_thw':
382
+ if isinstance(value, list):
383
+ # 리슀트인 경우 배치 차원 μΆ”κ°€
384
+ processed_image_metas[key] = [value]
385
+ elif isinstance(value, torch.Tensor) and len(value.shape) == 2:
386
+ # 2차원 ν…μ„œμΈ 경우 배치 차원 μΆ”κ°€
387
+ processed_image_metas[key] = value.unsqueeze(0)
388
+ else:
389
+ processed_image_metas[key] = value
390
 
391
  generate_kwargs = {
392
  'input_ids': input_ids,