from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

from .configuration_nemotron_h import NemotronHConfig
from .configuration_radio import RADIOConfig

logger = logging.get_logger(__name__)


class NemotronH_Nano_VL_V2_Config(PretrainedConfig):
    """Composite configuration for NemotronH Nano VL V2.

    Combines a RADIO vision-encoder config with a NemotronH language-model
    config, plus the projector and image/video options that join them.
    """

    model_type = "NemotronH_Nano_VL_V2"
    is_composition = True
    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version="v1",
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        attn_implementation="flash_attention_2",
        video_pruning_rate: float = 0.0,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Sub-configs may arrive as plain dicts (e.g. parsed from config.json);
        # fall back to library defaults when they are omitted.
        if vision_config is not None:
            self.vision_config = RADIOConfig(**vision_config)
        else:
            self.vision_config = RADIOConfig()

        if llm_config is not None:
            self.llm_config = NemotronHConfig(**llm_config)
        else:
            self.llm_config = NemotronHConfig()

        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.ps_version = ps_version
        self.image_tag_type = image_tag_type
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
        self.video_pruning_rate = video_pruning_rate

        # Propagate the attention implementation to both sub-configs; the
        # vision tower enables flash attention only when a flash_attention
        # variant is requested.
        self._attn_implementation = attn_implementation
        self.vision_config.use_flash_attn = (
            self._attn_implementation is not None
            and "flash_attention" in self._attn_implementation
        )
        self.llm_config._attn_implementation = self._attn_implementation
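
# Usage sketch (illustrative, not part of the original module): builds the
# composed config with default sub-configs and shows how the attention
# implementation propagates to both of them. Assumes RADIOConfig and
# NemotronHConfig are constructible with no arguments, as the fallback
# branches above already rely on.
if __name__ == "__main__":
    cfg = NemotronH_Nano_VL_V2_Config(attn_implementation="eager")
    # "flash_attention" is not a substring of "eager", so flash attention stays off.
    print(cfg.vision_config.use_flash_attn)      # False
    print(cfg.llm_config._attn_implementation)   # "eager"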