update
README.md
CHANGED
```diff
@@ -44,7 +44,6 @@ We provide the following models:
 | Eagle2-1B | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-1B)|
 | Eagle2-2B | [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | Siglip | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-2B)|
 | Eagle2-9B | [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | Siglip+ConvNext | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-9B)|
-| Eagle2-32B | [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | Siglip+ConvNext | 16K| [🤗 link](https://huggingface.co/NVIDIA/Eagle2-32B)|
 
 ## Benchmark Results
 | Benchmark | MiniCPM-Llama3-V-2_5 | InternVL-Chat-V1-5 | InternVL2-8B |QwenVL2-7B| Eagle2-9B|
```
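For reference, a minimal loading sketch for any checkpoint in the table above. The repo IDs come from the 🤗 links; `trust_remote_code=True` is needed because these repos ship the custom modeling files changed in this commit. This is a sketch only; see each model card for the exact processor and chat API.

```python
from transformers import AutoModel

# Any repo ID from the table above works the same way.
model = AutoModel.from_pretrained(
    "NVIDIA/Eagle2-1B",
    trust_remote_code=True,  # loads the custom Eagle2 modeling code from the repo
)
```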
configuration_eagle_chat.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 # --------------------------------------------------------
 # Eagle2
 # Copyright (c) 2025 NVIDIA
-# Licensed under The
+# Licensed under The Apache License [see LICENSE for details]
 # --------------------------------------------------------
 
 import copy
@@ -36,6 +36,7 @@ class Eagle2ChatConfig(PretrainedConfig):
         mlp_checkpoint=True,
         pre_feature_reduction=False,
         keep_aspect_ratio=False,
+        vocab_size=-1,
         **kwargs):
         super().__init__(**kwargs)
 
@@ -73,6 +74,7 @@ class Eagle2ChatConfig(PretrainedConfig):
         self.mlp_checkpoint = mlp_checkpoint
         self.pre_feature_reduction = pre_feature_reduction
         self.keep_aspect_ratio = keep_aspect_ratio
+        self.vocab_size = self.llm_config.vocab_size
         logger.info(f'keep_aspect_ratio: {self.keep_aspect_ratio}')
         logger.info(f'vision_select_layer: {self.select_layer}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
```
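The second and third hunks work together: `vocab_size=-1` is accepted as a keyword argument, but the stored attribute is mirrored from the wrapped LLM config, so the composite config cannot disagree with the language model. A minimal sketch of what that buys a caller, assuming the hub checkpoint resolves to this config class via `trust_remote_code`:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("NVIDIA/Eagle2-1B", trust_remote_code=True)

# After this change the LLM vocabulary size is exposed at the top level,
# and it is guaranteed to match the nested LLM config.
assert cfg.vocab_size == cfg.llm_config.vocab_size
```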
configuration_multi_backbone_channel_concatentation_model.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 # --------------------------------------------------------
 # Eagle2
 # Copyright (c) 2025 NVIDIA
-# Licensed under The
+# Licensed under The Apache License [see LICENSE for details]
 # --------------------------------------------------------
 
 import os
```
modeling_eagle_chat.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 # --------------------------------------------------------
 # Eagle2
 # Copyright (c) 2025 NVIDIA
-# Licensed under The
+# Licensed under The Apache License [see LICENSE for details]
 # --------------------------------------------------------
 
 import warnings
```
modeling_siglip.py
CHANGED
```diff
@@ -1,3 +1,11 @@
+# --------------------------------------------------------
+# Eagle2
+# Copyright (c) 2025 NVIDIA
+# Licensed under The Apache License [see LICENSE for details]
+# Support flash-attention in SigLIP
+# --------------------------------------------------------
+
+
 # coding=utf-8
 # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
 #
@@ -374,6 +382,10 @@ class SiglipAttention(nn.Module):
         """Input shape: Batch x Time x Channel"""
         if self.use_flash_attn:
             return self._flash_attn(hidden_states)
+        else:
+            return self._vanilla_attn(hidden_states, attention_mask, output_attentions)
+
+    def _vanilla_attn(self, hidden_states, attention_mask=None, output_attentions=False):
         batch_size, q_len, _ = hidden_states.size()
 
         query_states = self.q_proj(hidden_states)
```
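The second hunk makes the non-flash path explicit: instead of falling through to inline attention code, the forward now dispatches to `_vanilla_attn`, mirroring the existing `_flash_attn` branch. The body of `_vanilla_attn` is not shown in this hunk; as a sketch, a SigLIP-style eager attention (attribute names assumed from the upstream Hugging Face `SiglipAttention`, not taken from this commit) looks like:

```python
import torch

def _vanilla_attn(self, hidden_states, attention_mask=None, output_attentions=False):
    # Standard eager multi-head self-attention over (batch, seq, embed_dim).
    bsz, q_len, _ = hidden_states.size()
    shape = (bsz, q_len, self.num_heads, self.head_dim)
    q = self.q_proj(hidden_states).view(*shape).transpose(1, 2)
    k = self.k_proj(hidden_states).view(*shape).transpose(1, 2)
    v = self.v_proj(hidden_states).view(*shape).transpose(1, 2)

    attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale  # (B, H, L, L)
    if attention_mask is not None:
        attn = attn + attention_mask
    attn = torch.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)

    out = torch.matmul(attn, v).transpose(1, 2).reshape(bsz, q_len, -1)
    out = self.out_proj(out)
    return out, (attn if output_attentions else None)
```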
multi_backbone_channel_concatenation_encoder.py
CHANGED
```diff
@@ -1,13 +1,15 @@
+# --------------------------------------------------------
+# Eagle2
+# Copyright (c) 2025 NVIDIA
+# Licensed under The Apache License [see LICENSE for details]
+# --------------------------------------------------------
+
 import torch, os
 import torch.nn as nn
 from torch.utils.checkpoint import checkpoint
 
 from .siglip_vision_tower import SiglipVisionTower
 
-# from .hr_clip_encoder import HRCLIPVisionTower
-# from .eva_vit import EVAVITVisionTower
-# from .SAM.modeling_sam import SAMVisionTower
-# from .pix2struct_large import Pix2StructLargeVisionTower
 import torch.nn.functional as F
 from torch.nn.init import trunc_normal_
 from copy import deepcopy
```
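Beyond the license header, this hunk drops dead commented-out imports for backbones the encoder no longer references. The encoder itself is what backs the "Siglip+ConvNext" entries in the README table: multiple vision backbones run in parallel and their feature maps are fused by concatenation along the channel dimension. A minimal sketch of that idea; the shapes and the bilinear resize are illustrative assumptions, not this file's exact code:

```python
import torch
import torch.nn.functional as F

def concat_backbone_channels(siglip_feat, convnext_feat):
    # siglip_feat: (B, C1, H, W); convnext_feat: (B, C2, H', W').
    # Resize so the spatial grids match, then stack along channels.
    if convnext_feat.shape[-2:] != siglip_feat.shape[-2:]:
        convnext_feat = F.interpolate(
            convnext_feat, size=siglip_feat.shape[-2:],
            mode="bilinear", align_corners=False,
        )
    return torch.cat([siglip_feat, convnext_feat], dim=1)  # (B, C1 + C2, H, W)
```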
multi_backbone_channel_concatentation_model.py
CHANGED
```diff
@@ -1,3 +1,9 @@
+# --------------------------------------------------------
+# Eagle2
+# Copyright (c) 2025 NVIDIA
+# Licensed under The Apache License [see LICENSE for details]
+# --------------------------------------------------------
+
 import torch.nn as nn
 
 from transformers.modeling_outputs import BaseModelOutputWithPooling
```