first commit

2025-01-21 00:18:16 +08:00 · 2025-01-21 00:18:16 +08:00 · a3e7a7b7e6
parent 4dbfea4b67
commit a3e7a7b7e6
17 changed files with 152310 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,284 @@
 ---
 license: apache-2.0
 language:
 - en
 - it
 - fr
 - de
 - es
 base_model:
 - MrLight/dse-qwen2-2b-mrl-v1
 tags:
 - transformers
 - sentence-transformers
 - Qwen2-VL
 datasets:
 - llamaindex/vdr-multilingual-train
 ---
 # vdr-2b-multi-v1
-vdr-2b-multi-v1
+![](cover.png)
 vdr-2b-multi-v1 is a multilingual embedding model designed for visual document retrieval across multiple languages and domains. It encodes document page screenshots into dense single-vector representations, which makes it possible to search and query visually rich multilingual documents without the need for any OCR, data extraction pipelines, chunking...
 - **Trained on 🇮🇹 Italian, 🇪🇸 Spanish, 🇬🇧 English, 🇫🇷 French and 🇩🇪 German:** together they form a new large, open-source, multilingual training dataset of 500k high-quality samples.
 - **Cross-lingual Retrieval**: substantially better in real-world scenarios. For example, this allows searching German documents with Italian queries.
 - **Matryoshka Representation Learning**: You can reduce the vector size 3x and still keep 98% of the embedding quality.
 # Usage
 The model uses bf16 tensors and allocates ~4.4GB of VRAM when loaded. You can easily run inference and generate embeddings using 768 image patches and a batch size of 16 even on a cheap NVIDIA T4 GPU. The table below reports the GPU memory footprint (GB) for different batch sizes when using HuggingFace Transformers with a maximum of 768 image patches.
 | Batch Size | GPU Memory (GB) |
 |------------|-----------------|
 |          4 |             6.9 |
 |          8 |             8.8 |
 |         16 |            11.5 |
 |         32 |            19.7 |
 You can generate embeddings with this model in many different ways:
 <details open>
 <summary>
 via LlamaIndex
 </summary>
 ```bash
 pip install -U llama-index-embeddings-huggingface
 ```
 ```python
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 # Load the embedding model through the LlamaIndex HuggingFace wrapper.
 model = HuggingFaceEmbedding(
    model_name="llamaindex/vdr-2b-multi-v1",
    device="cpu",  # "mps" for Mac, "cuda" for NVIDIA GPUs
    trust_remote_code=True,
 )
 # One embedding for an image (passed here as the string "image.png") and one for a text query.
 image_embedding = model.get_image_embedding("image.png")
 query_embedding = model.get_query_embedding("some query")
 ```
 </details>
 <details>
 <summary>
 via HuggingFace Transformers
 </summary>
 ```python
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from PIL import Image
 import torch
 import math
 # More pixels -> better embeddings -> more VRAM -> slower inference.
 # From my experience, 768 image patches is the sweet spot for compute-efficient embeddings.
 # The pixel budget is written as (number of patches) * 28 * 28.
 max_pixels = 768 * 28 * 28
 min_pixels = 1 * 28 * 28
 # Load the embedding model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    # These are the recommended kwargs for the model, but change them as needed
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0"
 ).eval()
 processor = AutoProcessor.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    min_pixels=min_pixels,
    max_pixels=max_pixels
 )
 # Pad on the left: the encoders below read the embedding from the last
 # position of each sequence, so that position must hold a real token.
 model.padding_side = "left"
 processor.tokenizer.padding_side = "left"
 # Prompt templates. The query template contains a %s placeholder for the query text.
 document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
 query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
 ```
 **Encode queries**
 ```python
 def encode_queries(queries: list[str], dimension: int) -> torch.Tensor:
    """
    Encode a list of queries into a tensor of embeddings.
    Each query is inserted into ``query_prompt`` and paired with a blank
    placeholder image, because the prompt template always contains an image slot.
    Args:
        queries: A list of strings, each representing a query.
        dimension: The desired dimension of the output embeddings.
    Returns:
        A tensor of shape (num_queries, dimension) containing the
        L2-normalized query embeddings.
    """
    dummy_image = Image.new('RGB', (56, 56))
    inputs = processor(
        text=[query_prompt % x for x in queries],
        images=[dummy_image for _ in queries],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to(model.device)  # follow the model instead of hard-coding 'cuda:0'
    # cache_position indexes token positions, so it spans the padded sequence length
    # (not the number of queries).
    cache_position = torch.arange(0, inputs['input_ids'].shape[1])
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)
    with torch.no_grad():
        # `model` is the module-level model loaded above; a plain function has no `self`.
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )
    # Last-token pooling on the final hidden layer (valid because padding is on the left).
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
 ```
 **Encode documents**
 ```python
 def round_by_factor(number: float, factor: int) -> int:
    """Round ``number`` to a multiple of ``factor`` using Python's built-in ``round``."""
    multiples = round(number / factor)
    return multiples * factor
 def ceil_by_factor(number: float, factor: int) -> int:
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    multiples = math.ceil(number / factor)
    return multiples * factor
 def floor_by_factor(number: float, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    multiples = math.floor(number / factor)
    return multiples * factor
 def smart_resize(height: int, width: int) -> tuple[int, int]:
    """
    Pick a target size whose sides are multiples of 28 and whose area
    lies between ``min_pixels`` and ``max_pixels`` where possible.
    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
    Returns:
        ``(width, height)`` -- in that order, as passed to ``Image.resize`` by ``resize``.
    """
    h_bar = max(28, round_by_factor(height, 28))
    w_bar = max(28, round_by_factor(width, 28))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can yield 0 for the short side of a very elongated image,
        # and a 0-pixel side cannot be resized to; keep at least one 28px patch.
        h_bar = max(28, floor_by_factor(height / beta, 28))
        w_bar = max(28, floor_by_factor(width / beta, 28))
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, 28)
        w_bar = ceil_by_factor(width * beta, 28)
    return w_bar, h_bar
 def resize(image: Image.Image):
    """Return ``image`` resized to the size chosen by ``smart_resize``."""
    target_width_height = smart_resize(height=image.height, width=image.width)
    return image.resize(target_width_height)
 def encode_documents(documents: list[Image.Image], dimension: int) -> torch.Tensor:
    """
    Encode a list of images into a tensor of embeddings.
    Each image is resized with ``resize`` and paired with ``document_prompt``.
    Args:
        documents: A list of PIL Image objects.
        dimension: The desired dimension of the output embeddings.
    Returns:
        A tensor of shape (num_documents, dimension) containing the
        L2-normalized image embeddings.
    """
    inputs = processor(
        text=[document_prompt] * len(documents),
        images=[resize(x) for x in documents],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to(model.device)  # follow the model instead of hard-coding 'cuda:0'
    # cache_position indexes token positions, so it spans the padded sequence length.
    # (The previous version referenced an undefined `queries` variable here.)
    cache_position = torch.arange(0, inputs['input_ids'].shape[1])
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)
    with torch.no_grad():
        # `model` is the module-level model loaded above; a plain function has no `self`.
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )
    # Last-token pooling on the final hidden layer (valid because padding is on the left).
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
 ```
 </details>
 <details>
 <summary>
 via SentenceTransformers
 </summary>
 ```python
 import torch  # needed for torch.bfloat16 below
 from sentence_transformers import SentenceTransformer
 model = SentenceTransformer(
    model_name_or_path="llamaindex/vdr-2b-multi-v1",
    device="cuda",
    trust_remote_code=True,
    # These are the recommended kwargs for the model, but change them as needed if you don't have CUDA
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "device_map": "cuda:0",
        "attn_implementation": "flash_attention_2"
    },
 )
 # A string naming an existing image file (or an http(s) image URL) is embedded as a
 # document page; any other string is embedded as a text query.
 embeddings = model.encode("image.png")
 ```
 </details>
 # Training
 The model is based on [MrLight/dse-qwen2-2b-mrl-v1](https://huggingface.co/MrLight/dse-qwen2-2b-mrl-v1) and it was trained on the new [vdr-multilingual-train](https://huggingface.co/datasets/llamaindex/vdr-multilingual-train) dataset that consists of 500k high-quality, multilingual query-image pairs. It was trained for 1 epoch using the [DSE approach](https://arxiv.org/abs/2406.11251), with a batch size of 128 and hard-mined negatives.
 # Results
 ![](ndcgtop.png)
 The model has been evaluated on the Vidore benchmark and on custom-built evaluation sets that allow testing its multilingual capabilities on text-only, visual-only and mixed page screenshots. The evaluation dataset is publicly available [here on HuggingFace](https://huggingface.co/datasets/llamaindex/vdr-multilingual-test).
 All evaluations are performed by calculating **NDCG@5** scores using **1536-dimensional** vectors and an image resolution that can be represented with a **maximum of 768 tokens**.
 |                     | Avg      | Italian (text) | Italian (visual) | Italian (mix) |
 |---------------------|----------|----------------|------------------|---------------|
 | dse-qwen2-2b-mrl-v1 |     95.1 |           95.1 |               94 |          96.2 |
 | vdr-2b-multi-v1     | **97.0** |       **96.4** |         **96.3** |      **98.4** |
 |                     |  **+2%** |                |                  |               |
 |                     | Avg       | French (text) | French (visual) | French (mix) |
 |---------------------|-----------|---------------|-----------------|--------------|
 | dse-qwen2-2b-mrl-v1 |      93.5 |          94.7 |            90.8 |         95.1 |
 | vdr-2b-multi-v1     |  **95.6** |      **95.6** |        **93.3** |     **97.9** |
 |                     | **+2.2%** |               |                 |              |
 |                     | Avg       | Spanish (text) | Spanish (visual) | Spanish (mix) |
 |---------------------|-----------|----------------|------------------|---------------|
 | dse-qwen2-2b-mrl-v1 |      96.7 |           97.2 |             94.7 |          98.2 |
 | vdr-2b-multi-v1     |  **98.1** |       **98.3** |         **96.9** |      **99.1** |
 |                     | **+1.4%** |                |                  |               |
 |                     | Avg       | German (text) | German (visual) | German (mix) |
 |---------------------|-----------|---------------|-----------------|--------------|
 | dse-qwen2-2b-mrl-v1 |      93.0 |          93.4 |              90 |         95.5 |
 | vdr-2b-multi-v1     |  **96.2** |      **94.8** |        **95.7** |     **98.1** |
 |                     | **+3.4%** |               |                 |              |
 |                     | Avg       | English (text) | English (visual) | English (mix) |
 |---------------------|-----------|----------------|------------------|---------------|
 | dse-qwen2-2b-mrl-v1 | 98.0      | **98.3**       | 98.5             | 97.1          |
 | vdr-2b-multi-v1     | **98.1**  | 97.9           | **99.1**         | **97.3**      |
 |                     | **+0.1%** |                |                  |               |
 |                     |  **Avg** | **shiftproject** | **government** | **healthcare** | **energy** | **ai**     | **docvqa** | **arxivqa** | **tatdqa** | **infovqa** | **tabfquad** |
 |--------------------:|---------:|-----------------:|---------------:|---------------:|-----------:|-----------:|-----------:|------------:|-----------:|------------:|-------------:|
 | dse-qwen2-2b-mrl-v1 |     83.6 |             79.8 |       **95.7** |       **96.9** |     **92** |   98.2     |       56.3 |    **85.2** |   **53.9** |    **87.5** |         90.3 |
 |     vdr-2b-multi-v1 | **84.0** |         **82.4** |           95.5 |           96.5 |       91.2 |   **98.5** |   **58.5** |        84.7 |       53.6 |        87.1 |     **92.2** |
--- a/added_tokens.json
+++ b/added_tokens.json
@ -0,0 +1,16 @@
 {
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
 }
--- a/chat_template.json
+++ b/chat_template.json
@ -0,0 +1,3 @@
 {
  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
 }
--- a/config.json
+++ b/config.json
@ -0,0 +1,48 @@
 {
  "_name_or_path": "MrLight/dse-qwen2-2b-mrl-v1",
  "architectures": [
    "Qwen2VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2_vl",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "use_sliding_window": false,
  "video_token_id": 151656,
  "vision_config": {
    "hidden_size": 1536,
    "in_chans": 3,
    "model_type": "qwen2_vl",
    "spatial_patch_size": 14
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 151936
 }
--- a/config_sentence_transformers.json
+++ b/config_sentence_transformers.json
@ -0,0 +1,13 @@
 {
    "__version__": {
      "sentence_transformers": "3.3.0",
      "transformers": "4.46.2",
      "pytorch": "2.2.2"
    },
    "prompts":{
      "image": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>",
      "query": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
    },
    "default_prompt_name": null,
    "similarity_fn_name": "cosine"
  }
--- a/cover.png
+++ b/cover.png
--- a/custom_st.py
+++ b/custom_st.py
@ -0,0 +1,314 @@
 import base64
 import json
 import os
 import math
 from io import BytesIO
 from typing import Any, Dict, List, Literal, Optional, Union
 from urllib.parse import urlparse
 import requests
 import torch
 from PIL import Image
 from torch import nn
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 class Transformer(nn.Module):
    save_in_root: bool = True
    def __init__(
        self,
        model_name_or_path: str = 'llamaindex/vdr-2b-multi-v1',
        processor_name_or_path: Optional[str] = None,
        max_pixels: int = 768 * 28 * 28,
        min_pixels: int = 1 * 28 * 28,
        dimension: int = 2048,
        max_seq_length: Optional[int] = None,
        model_args: Optional[Dict[str, Any]] = None,
        processor_args: Optional[Dict[str, Any]] = None,
        tokenizer_args: Optional[Dict[str, Any]] = None,
        config_args: Optional[Dict[str, Any]] = None,
        cache_dir: Optional[str] = None,
        backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
        **kwargs,
    ) -> None:
        """Load the Qwen2-VL model and its processor and store the embedding settings.
        Args:
            model_name_or_path: Model identifier or path given to ``from_pretrained``.
            processor_name_or_path: Processor identifier or path; falls back to
                ``model_name_or_path`` when not given.
            max_pixels: Upper pixel budget, passed to the processor and used by ``_smart_resize``.
            min_pixels: Lower pixel budget, passed to the processor and used by ``_smart_resize``.
            dimension: Number of leading embedding components kept by ``forward``.
            max_seq_length: Maximum sequence length; when ``None`` it is inferred from the
                model config and the tokenizer if both expose a limit.
            model_args: Extra keyword arguments for the model's ``from_pretrained``.
                Entries in ``**kwargs`` take precedence over same-named entries here.
            processor_args: Extra keyword arguments for the processor's ``from_pretrained``.
                ``min_pixels``, ``max_pixels`` and ``cache_dir`` always override same-named entries.
            tokenizer_args: Accepted but not used by this implementation.
            config_args: Accepted but not used by this implementation.
            cache_dir: Cache directory forwarded to both ``from_pretrained`` calls.
            backend: Only ``'torch'`` is supported.
            **kwargs: Additional keyword arguments forwarded to the model's ``from_pretrained``.
        Raises:
            ValueError: If ``backend`` is not ``'torch'``.
        """
        super().__init__()
        if backend != 'torch':
            raise ValueError(
                f'Backend \'{backend}\' is not supported, please use \'torch\' instead'
            )
        self.dimension = dimension
        self.max_pixels = max_pixels
        self.min_pixels = min_pixels
        self.max_seq_length = max_seq_length
        # Build fresh dicts so the caller's model_args / processor_args are never
        # mutated (they may be shared or reused by the caller).
        model_kwargs = {**(model_args or {}), **kwargs}
        processor_kwargs = {
            **(processor_args or {}),
            'min_pixels': min_pixels,
            'max_pixels': max_pixels,
            'cache_dir': cache_dir,
        }
        # Initialize model
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path,
            cache_dir=cache_dir,
            **model_kwargs
        ).eval()
        # Initialize processor
        self.processor = AutoProcessor.from_pretrained(
            processor_name_or_path or model_name_or_path,
            **processor_kwargs
        )
        # Left padding: forward() reads the embedding from the last position of
        # each sequence, so that position must hold a real token.
        self.model.padding_side = "left"
        self.processor.tokenizer.padding_side = "left"
        # Store prompts
        self.document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
        self.query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
        # Try to infer max_seq_length if not provided
        if self.max_seq_length is None:
            if (
                hasattr(self.model, 'config')
                and hasattr(self.model.config, 'max_position_embeddings')
                and hasattr(self.processor.tokenizer, 'model_max_length')
            ):
                self.max_seq_length = min(
                    self.model.config.max_position_embeddings,
                    self.processor.tokenizer.model_max_length,
                )
    def _smart_resize(self, height: int, width: int) -> tuple[int, int]:
        h_bar = max(28, self._round_by_factor(height, 28))
        w_bar = max(28, self._round_by_factor(width, 28))
        if h_bar * w_bar > self.max_pixels:
            beta = math.sqrt((height * width) / self.max_pixels)
            h_bar = self._floor_by_factor(height / beta, 28)
            w_bar = self._floor_by_factor(width / beta, 28)
        elif h_bar * w_bar < self.min_pixels:
            beta = math.sqrt(self.min_pixels / (height * width))
            h_bar = self._ceil_by_factor(height * beta, 28)
            w_bar = self._ceil_by_factor(width * beta, 28)
        return w_bar, h_bar
    @staticmethod
    def _round_by_factor(number: float, factor: int) -> int:
        return round(number / factor) * factor
    @staticmethod
    def _ceil_by_factor(number: float, factor: int) -> int:
        return math.ceil(number / factor) * factor
    @staticmethod
    def _floor_by_factor(number: float, factor: int) -> int:
        return math.floor(number / factor) * factor
    def _resize_image(self, image: Image.Image) -> Image.Image:
        """Return ``image`` resized to the size chosen by ``_smart_resize``."""
        target_width_height = self._smart_resize(height=image.height, width=image.width)
        return image.resize(target_width_height)
    @staticmethod
    def _decode_data_image(data_image_str: str) -> Image.Image:
        header, data = data_image_str.split(',', 1)
        image_data = base64.b64decode(data)
        return Image.open(BytesIO(image_data))
    @staticmethod
    def _is_valid_url(url: str) -> bool:
        try:
            result = urlparse(url)
            # Check if scheme and netloc are present and scheme is http/https
            return all([result.scheme in ('http', 'https'), result.netloc])
        except Exception:
            return False
    @staticmethod
    def _is_safe_path(path: str) -> bool:
        try:
            # Convert to absolute path and normalize
            abs_path = os.path.abspath(os.path.normpath(path))
            # Check if file exists and is a regular file (not a directory or special file)
            return os.path.isfile(abs_path)
        except Exception:
            return False
    @staticmethod
    def _load_image_from_url(url: str) -> Image.Image:
        """Download an image over HTTP(S) with a timeout, a content-type check and a size cap.
        Args:
            url: Address to fetch.
        Returns:
            The image opened from the downloaded bytes.
        Raises:
            ValueError: If the request fails, the response's content type does not
                start with ``image/``, more than 10MB is received, or the data
                cannot be opened as an image. The original error is chained as
                ``__cause__``.
        """
        max_size = 10 * 1024 * 1024  # 10MB
        try:
            # With stream=True the connection is only released once the body is
            # fully consumed or the response is closed. The context manager closes
            # it on every exit path, including the early "File too large" abort.
            with requests.get(
                url,
                stream=True,
                timeout=10,
                headers={'User-Agent': 'Mozilla/5.0'},
            ) as response:
                response.raise_for_status()
                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    raise ValueError(f"Invalid content type: {content_type}")
                # Read in chunks so an oversized body is rejected without buffering it all.
                content = BytesIO()
                size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    size += len(chunk)
                    if size > max_size:
                        raise ValueError("File too large")
                    content.write(chunk)
            content.seek(0)
            return Image.open(content)
        except Exception as e:
            raise ValueError(f"Failed to load image from URL: {str(e)}") from e
    @staticmethod
    def _load_image_from_path(image_path: str) -> Image.Image:
        try:
            # Convert to absolute path and normalize
            abs_path = os.path.abspath(os.path.normpath(image_path))
            # Check file size before loading
            file_size = os.path.getsize(abs_path)
            max_size = 10 * 1024 * 1024  # 10MB
            if file_size > max_size:
                raise ValueError("File too large")
            with Image.open(abs_path) as img:
                # Make a copy to ensure file handle is closed
                return img.copy()
        except Exception as e:
            raise ValueError(f"Failed to load image from path: {str(e)}")
    @staticmethod
    def _load_image_from_bytes(image_bytes: bytes) -> Image.Image:
        try:
            # Check size
            if len(image_bytes) > 10 * 1024 * 1024:  # 10MB
                raise ValueError("Image data too large")
            return Image.open(BytesIO(image_bytes))
        except Exception as e:
            raise ValueError(f"Failed to load image from bytes: {str(e)}")
    def _process_input(self, texts: List[Union[str, Image.Image, bytes]]) -> tuple[List[str], List[Image.Image]]:
        """Turn raw samples into two parallel lists: prompts and images.
        Every sample yields exactly one prompt and one image, in input order:
        - ``str`` that is an http(s) URL or an existing file: loaded as an image and
          paired with the document prompt. If loading or resizing fails, the string
          is embedded as a text query instead (best effort).
        - any other ``str``: a text query, paired with a blank placeholder image.
        - ``PIL.Image.Image``: resized and paired with the document prompt.
        - ``bytes``: decoded as an image; on failure the blank placeholder image
          is used with the document prompt.
        Args:
            texts: The samples to prepare.
        Returns:
            ``(prompts, images)``, both with ``len(texts)`` entries.
        Raises:
            TypeError: If a sample is not a ``str``, ``PIL.Image.Image`` or ``bytes``.
                Skipping it silently would misalign the outputs with the inputs.
        """
        processed_texts = []
        processed_images = []
        dummy_image = Image.new('RGB', (56, 56))
        for sample in texts:
            if isinstance(sample, str):
                # Decide how (and whether) the string can be loaded as an image.
                if self._is_valid_url(sample):
                    loader = self._load_image_from_url
                elif self._is_safe_path(sample):
                    loader = self._load_image_from_path
                else:
                    loader = None
                image = None
                if loader is not None:
                    try:
                        # Load AND resize before appending anything, so a failure
                        # cannot leave the two lists with different lengths.
                        image = self._resize_image(loader(sample))
                    except Exception:
                        # Best effort: fall back to treating the string as a text query.
                        image = None
                if image is None:
                    processed_texts.append(self.query_prompt % sample)
                    processed_images.append(dummy_image)
                else:
                    processed_texts.append(self.document_prompt)
                    processed_images.append(image)
            elif isinstance(sample, Image.Image):
                processed_texts.append(self.document_prompt)
                processed_images.append(self._resize_image(sample))
            elif isinstance(sample, bytes):
                try:
                    image = self._resize_image(self._load_image_from_bytes(sample))
                except Exception:
                    # If bytes can't be converted to an image, use the placeholder.
                    image = dummy_image
                processed_texts.append(self.document_prompt)
                processed_images.append(image)
            else:
                raise TypeError(
                    f"Unsupported input type {type(sample).__name__!r}; "
                    "expected str, PIL.Image.Image or bytes"
                )
        return processed_texts, processed_images
    def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Run the model and store a normalized embedding under ``'sentence_embedding'``.
        The embedding is the final-layer hidden state at the last position of each
        sequence, truncated to the first ``self.dimension`` components and L2-normalized.
        Args:
            features: Model inputs; must contain ``'input_ids'`` (its second
                dimension is used as the sequence length).
        Returns:
            The same ``features`` dict with ``'sentence_embedding'`` added.
        """
        cache_position = torch.arange(0, features['input_ids'].shape[1])
        inputs = self.model.prepare_inputs_for_generation(
            **features, cache_position=cache_position, use_cache=False
        )
        # ensure inputs are on the same device as the model
        device = next(self.model.parameters()).device
        # Only tensor entries are kept, which also drops non-tensor values such as
        # use_cache; it is therefore passed again explicitly below.
        inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
        with torch.no_grad():
            output = self.model(
                **inputs,
                # A single forward pass needs no KV cache; without this flag the
                # model would fall back to its config default.
                use_cache=False,
                return_dict=True,
                output_hidden_states=True
            )
        # Last-token pooling (padding is on the left, so position -1 is a real token).
        embeddings = output.hidden_states[-1][:, -1]
        features['sentence_embedding'] = torch.nn.functional.normalize(
            embeddings[:, :self.dimension], p=2, dim=-1
        )
        return features
    def tokenize(self, texts: List[Union[str, Image.Image, bytes]], padding: str = 'longest') -> Dict[str, torch.Tensor]:
        """Prepare samples with ``_process_input`` and run them through the processor.
        Args:
            texts: Samples accepted by ``_process_input``.
            padding: Padding strategy forwarded to the processor.
        Returns:
            The processor output, requested as PyTorch tensors.
        """
        prompts, images = self._process_input(texts)
        processor_inputs = dict(
            text=prompts,
            images=images,
            videos=None,
            padding=padding,
            return_tensors='pt',
        )
        return self.processor(**processor_inputs)
    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        """Write the model, the processor and this module's settings to ``output_path``.
        The settings go to ``sentence_bert_config.json`` inside ``output_path``.
        """
        self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
        self.processor.save_pretrained(output_path)
        # Collect the settings in the order they are written to the config file.
        settings = {'model_name_or_path': output_path}
        for attribute in ('max_pixels', 'min_pixels', 'dimension', 'max_seq_length'):
            settings[attribute] = getattr(self, attribute)
        settings_file = os.path.join(output_path, 'sentence_bert_config.json')
        with open(settings_file, 'w') as f:
            json.dump(settings, f)
    @staticmethod
    def load(input_path: str) -> 'Transformer':
        """Load a saved model from the given path.
        Settings are read from ``sentence_bert_config.json`` inside ``input_path``
        when that file exists. The model itself is always loaded from ``input_path``.
        Args:
            input_path: Location of the saved model.
        Returns:
            A new ``Transformer`` built from the stored settings.
        """
        config_path = os.path.join(input_path, 'sentence_bert_config.json')
        config = {}
        if os.path.exists(config_path):
            with open(config_path) as f:
                config = json.load(f)
        # save() records the directory it wrote to, which is stale once the folder
        # has been moved or copied; always load from where we were asked to.
        config['model_name_or_path'] = input_path
        return Transformer(**config)
--- a/generation_config.json
+++ b/generation_config.json
@ -0,0 +1,14 @@
 {
  "attn_implementation": "flash_attention_2",
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.01,
  "top_k": 1,
  "top_p": 0.001,
  "transformers_version": "4.47.1"
 }
--- a/merges.txt
+++ b/merges.txt
--- a/model.safetensors
+++ b/model.safetensors
--- a/modules.json
+++ b/modules.json
@ -0,0 +1,18 @@
 [
    {
        "idx": 0,
        "name": "transformer",
        "path": "",
        "type": "custom_st.Transformer",
        "model_name_or_path": "llamaindex/vdr-2b-multi-v1",
        "dimension": 2048,
        "max_pixels": 602112,
        "min_pixels": 784
    },
    {
        "idx": 1,
        "name": "normalizer",
        "path": "1_Normalize",
        "type": "sentence_transformers.models.Normalize"
    }
 ]
--- a/ndcgtop.png
+++ b/ndcgtop.png
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@ -0,0 +1,29 @@
 {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_pixels": 602112,
  "merge_size": 2,
  "min_pixels": 784,
  "patch_size": 14,
  "processor_class": "Qwen2VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "max_pixels": 12845056,
    "min_pixels": 3136
  },
  "temporal_patch_size": 2
 }
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@ -0,0 +1,31 @@
 {
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@ -0,0 +1,147 @@
 {
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "max_pixels": 602112,
  "min_pixels": 784,
  "model_max_length": 32768,
  "pad_token": "<|endoftext|>",
  "padding_side": "left",
  "processor_class": "Qwen2VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
 }
--- a/vocab.json
+++ b/vocab.json