# Source file: mPLUG-Owl3-2B-241014_a14065.../configuration_mplugowl3.py
# (48 lines, 1.5 KiB, Python)

# coding=utf-8
""" mPLUGOwl3 model configuration"""
import os
from typing import Union
from transformers.utils import logging
from .configuration_hyper_qwen2 import HyperQwen2Config
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
# Module-level logger from transformers' logging utilities, named after this module.
logger = logging.get_logger(__name__)
class mPLUGOwl3Config(HyperQwen2Config):
    """Configuration for mPLUGOwl3: a ``HyperQwen2Config`` plus a SigLIP vision config.

    On top of whatever ``HyperQwen2Config`` stores, an instance carries:

    * ``vision_config`` -- a ``SiglipVisionConfig`` instance.
    * ``image_size`` / ``patch_size`` -- copied from ``vision_config``.
    * ``use_cache`` -- the value passed to the constructor.

    Args:
        use_cache: Stored on the instance as ``use_cache``.
        vision_config: One of

            * ``None`` -- build a ``SiglipVisionConfig`` from
              ``default_vision_config``;
            * ``dict`` -- keyword arguments for ``SiglipVisionConfig``;
            * ``SiglipVisionConfig`` -- used as-is (not copied).
        **kwargs: Forwarded unchanged to ``HyperQwen2Config.__init__``.

    Raises:
        TypeError: If ``vision_config`` is not ``None``, a ``dict`` or a
            ``SiglipVisionConfig``.
    """

    model_type = "mplugowl3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Keyword arguments used to build the vision config when the caller
    # supplies none.
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache: bool = True,
        vision_config: Union[dict, SiglipVisionConfig, None] = None,
        **kwargs,
    ):
        # NOTE(review): use_cache is assigned here and not forwarded to
        # super().__init__(); if the parent constructor assigns its own
        # use_cache default, this value would be overwritten -- confirm
        # against HyperQwen2Config.
        self.use_cache = use_cache

        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        else:
            # Without this branch an unsupported value left self.vision_config
            # unset and the attribute reads below failed with an opaque
            # AttributeError; fail early with an actionable message instead.
            raise TypeError(
                "vision_config must be None, a dict, or a SiglipVisionConfig, "
                f"got {type(vision_config).__name__}"
            )

        # Mirror the vision geometry onto this config.
        self.image_size = self.vision_config.image_size
        self.patch_size = self.vision_config.patch_size

        super().__init__(**kwargs)