first commit

This commit is contained in:
xxl 2025-01-21 00:18:16 +08:00
parent 4dbfea4b67
commit a3e7a7b7e6
17 changed files with 152310 additions and 1 deletions

283
README.md
@@ -1,3 +1,284 @@
---
license: apache-2.0
language:
- en
- it
- fr
- de
- es
base_model:
- MrLight/dse-qwen2-2b-mrl-v1
tags:
- transformers
- sentence-transformers
- Qwen2-VL
datasets:
- llamaindex/vdr-multilingual-train
---
# vdr-2b-multi-v1
![](cover.png)
vdr-2b-multi-v1 is a multilingual embedding model designed for visual document retrieval across multiple languages and domains. It encodes document page screenshots into dense single-vector representations, which makes it possible to search and query visually rich multilingual documents without OCR, data-extraction pipelines, or chunking.
- **Trained on 🇮🇹 Italian, 🇪🇸 Spanish, 🇬🇧 English, 🇫🇷 French and 🇩🇪 German:** together they form a new large, open-source, multilingual training dataset of 500k high-quality samples.
- **Cross-lingual Retrieval**: substantially better performance in real-world scenarios. For example, you can search German documents with Italian queries.
- **Matryoshka Representation Learning**: you can reduce the vector size by 3x and still keep 98% of the embedding quality (see the truncation sketch below).
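As a minimal sketch of what Matryoshka-style truncation looks like in practice (the 512-dimension target and the random tensors below are only illustrative assumptions), you simply keep the first k components and re-normalize:
```python
import torch
import torch.nn.functional as F

def truncate_embeddings(embeddings: torch.Tensor, dim: int = 512) -> torch.Tensor:
    """Keep the first `dim` components and L2-normalize again."""
    return F.normalize(embeddings[:, :dim], p=2, dim=-1)

# Illustrative only: shrink 1536-dim vectors to 512 dims (3x smaller) before indexing.
full = F.normalize(torch.randn(4, 1536), p=2, dim=-1)
small = truncate_embeddings(full, dim=512)
print(small.shape)  # torch.Size([4, 512])
```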
# Usage
The model uses bf16 tensors and allocates ~4.4GB of VRAM when loaded. Even on an inexpensive NVIDIA T4 GPU, you can easily run inference and generate embeddings with 768 image patches and a batch size of 16. The table below reports the memory footprint (GB) at different batch sizes, using HuggingFace Transformers and a maximum of 768 image patches.
| Batch Size | GPU Memory (GB) |
|------------|-----------------|
| 4 | 6.9 |
| 8 | 8.8 |
| 16 | 11.5 |
| 32 | 19.7 |
You can generate embeddings with this model in many different ways:
<details open>
<summary>
via LlamaIndex
</summary>
```bash
pip install -U llama-index-embeddings-huggingface
```
```python
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
model = HuggingFaceEmbedding(
model_name="llamaindex/vdr-2b-multi-v1",
device="cpu", # "mps" for mac, "cuda" for nvidia GPUs
trust_remote_code=True,
)
image_embedding = model.get_image_embedding("image.png")
query_embedding = model.get_query_embedding("some query")
```
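Once you have both vectors, a plain cosine similarity gives the retrieval score. This short follow-up only assumes the `image_embedding` and `query_embedding` variables from the snippet above:
```python
import numpy as np

img = np.asarray(image_embedding)
qry = np.asarray(query_embedding)

# Cosine similarity between the query and the page screenshot.
score = float(np.dot(qry, img) / (np.linalg.norm(qry) * np.linalg.norm(img)))
print(f"query <-> image similarity: {score:.4f}")
```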
</details>
<details>
<summary>
via HuggingFace Transformers
</summary>
```python
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image
import torch
import math
# more pixels -> better embeddings -> more VRAM -> slower inference
# From my experience, 768 image patches is the right spot for compute efficient embeddings.
max_pixels = 768 * 28 * 28
min_pixels = 1 * 28 * 28
# Load the embedding model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    # These are the recommended kwargs for the model, but change them as needed
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0"
).eval()

processor = AutoProcessor.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    min_pixels=min_pixels,
    max_pixels=max_pixels
)
model.padding_side = "left"
processor.tokenizer.padding_side = "left"
document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
```
**Encode queries**
```python
def encode_queries(queries: list[str], dimension: int) -> torch.Tensor:
    """
    Encode a list of queries into a tensor of embeddings.

    Args:
        queries: A list of strings, each representing a query.
        dimension: The desired dimension of the output embeddings.

    Returns:
        A tensor of shape (num_queries, dimension) containing the encoded queries.
    """
    dummy_image = Image.new('RGB', (56, 56))
    inputs = processor(
        text=[query_prompt % x for x in queries],
        images=[dummy_image for _ in queries],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')

    cache_position = torch.arange(0, len(queries))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
```
**Encode documents**
```python
def round_by_factor(number: float, factor: int) -> int:
    return round(number / factor) * factor

def ceil_by_factor(number: float, factor: int) -> int:
    return math.ceil(number / factor) * factor

def floor_by_factor(number: float, factor: int) -> int:
    return math.floor(number / factor) * factor

def smart_resize(height: int, width: int) -> tuple[int, int]:
    h_bar = max(28, round_by_factor(height, 28))
    w_bar = max(28, round_by_factor(width, 28))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, 28)
        w_bar = floor_by_factor(width / beta, 28)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, 28)
        w_bar = ceil_by_factor(width * beta, 28)
    return w_bar, h_bar

def resize(image: Image.Image):
    new_size = smart_resize(image.height, image.width)
    return image.resize(new_size)

def encode_documents(documents: list[Image.Image], dimension: int):
    """
    Encode a list of images into a tensor of embeddings.

    Args:
        documents: A list of PIL Image objects.
        dimension: The desired dimension of the output embeddings.

    Returns:
        A tensor of shape (num_documents, dimension) containing the encoded images.
    """
    inputs = processor(
        text=[document_prompt] * len(documents),
        images=[resize(x) for x in documents],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')

    cache_position = torch.arange(0, len(documents))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
```
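A minimal end-to-end sketch using the two functions above (the file name and the 1536-dimension choice are placeholders):
```python
query_embs = encode_queries(["total energy consumption by sector"], dimension=1536)
doc_embs = encode_documents([Image.open("page.png")], dimension=1536)

# Embeddings are L2-normalized, so the dot product equals the cosine similarity.
scores = query_embs @ doc_embs.T
print(scores)
```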
</details>
<details>
<summary>
via SentenceTransformers
</summary>
```python
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    model_name_or_path="llamaindex/vdr-2b-multi-v1",
    device="cuda",
    trust_remote_code=True,
    # These are the recommended kwargs for the model, but change them as needed if you don't have CUDA
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "device_map": "cuda:0",
        "attn_implementation": "flash_attention_2"
    },
)

embeddings = model.encode("image.png")
```
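With the custom Sentence Transformers module, plain strings that are not file paths or URLs are embedded as queries, while file paths, URLs, and PIL images are embedded as documents (see `custom_st.py` below). A small usage sketch, with placeholder file names:
```python
query_embeddings = model.encode(["what is the total energy consumption?"])
doc_embeddings = model.encode(["page_1.png", "page_2.png"])

# Cosine similarity, the similarity function configured for this model.
print(model.similarity(query_embeddings, doc_embeddings))
```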
</details>
# Training
The model is based on [MrLight/dse-qwen2-2b-mrl-v1](https://huggingface.co/MrLight/dse-qwen2-2b-mrl-v1) and was trained on the new [vdr-multilingual-train](https://huggingface.co/datasets/llamaindex/vdr-multilingual-train) dataset, which consists of 500k high-quality, multilingual query-image pairs. It was trained for 1 epoch using the [DSE approach](https://arxiv.org/abs/2406.11251), with a batch size of 128 and hard-mined negatives.
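As intuition for the objective, DSE-style training optimizes an InfoNCE contrastive loss that pulls each query embedding toward its paired page embedding and pushes it away from the other pages in the batch. The sketch below uses only in-batch negatives and an arbitrary temperature, which are simplifying assumptions; the actual training also used hard-mined negatives:
```python
import torch
import torch.nn.functional as F

def info_nce_loss(query_emb: torch.Tensor, doc_emb: torch.Tensor, temperature: float = 0.02) -> torch.Tensor:
    """In-batch contrastive loss: row i of the queries matches row i of the documents."""
    q = F.normalize(query_emb, p=2, dim=-1)
    d = F.normalize(doc_emb, p=2, dim=-1)
    logits = (q @ d.T) / temperature                          # (batch, batch) similarity matrix
    targets = torch.arange(q.size(0), device=logits.device)   # positives sit on the diagonal
    return F.cross_entropy(logits, targets)
```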
# Results
![](ndcgtop.png)
The model has been evaluated on the Vidore benchmark and on custom-built evaluation sets that allow testing its multilingual capabilities on text-only, visual-only and mixed page screenshots. The evaluation dataset is publicly available [here on HuggingFace](https://huggingface.co/datasets/llamaindex/vdr-multilingual-test).
All evaluations are performed by calculating **NDCG@5** scores using **1536-dimensional** vectors and an image resolution that can be represented with a **maximum of 768 tokens**.
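For reference, NDCG@5 rewards placing the relevant page as close to the top of the ranking as possible. A minimal sketch over binary relevance labels (the reported numbers come from the benchmark's own tooling, not this snippet):
```python
import math

def ndcg_at_5(ranked_relevance: list[int], num_relevant: int) -> float:
    """ranked_relevance: binary relevance of the returned pages, in rank order."""
    dcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ranked_relevance[:5]))
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(min(num_relevant, 5)))
    return dcg / idcg if idcg > 0 else 0.0

print(ndcg_at_5([0, 1, 0, 0, 0], num_relevant=1))  # relevant page at rank 2 -> ~0.63
```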
| | Avg | Italian (text) | Italian (visual) | Italian (mix) |
|---------------------|----------|----------------|------------------|---------------|
| dse-qwen2-2b-mrl-v1 | 95.1 | 95.1 | 94 | 96.2 |
| vdr-2b-multi-v1 | **97.0** | **96.4** | **96.3** | **98.4** |
| | **+2%** | | | |
| | Avg | French (text) | French (visual) | French (mix) |
|---------------------|-----------|---------------|-----------------|--------------|
| dse-qwen2-2b-mrl-v1 | 93.5 | 94.7 | 90.8 | 95.1 |
| vdr-2b-multi-v1 | **95.6** | **95.6** | **93.3** | **97.9** |
| | **+2.2%** | | | |
| | Avg | Spanish (text) | Spanish (visual) | Spanish (mix) |
|---------------------|-----------|----------------|------------------|---------------|
| dse-qwen2-2b-mrl-v1 | 96.7 | 97.2 | 94.7 | 98.2 |
| vdr-2b-multi-v1 | **98.1** | **98.3** | **96.9** | **99.1** |
| | **+1.4%** | | | |
| | Avg | German (text) | German (visual) | German (mix) |
|---------------------|-----------|---------------|-----------------|--------------|
| dse-qwen2-2b-mrl-v1 | 93.0 | 93.4 | 90 | 95.5 |
| vdr-2b-multi-v1 | **96.2** | **94.8** | **95.7** | **98.1** |
| | **+3.4%** | | | |
| | Avg | English (text) | English (visual) | English (mix) |
|---------------------|-----------|----------------|------------------|---------------|
| dse-qwen2-2b-mrl-v1 | 98.0 | **98.3** | 98.5 | 97.1 |
| vdr-2b-multi-v1 | **98.1** | 97.9 | **99.1** | **97.3** |
| | **+0.1%** | | | |
| | **Avg** | **shiftproject** | **government** | **healthcare** | **energy** | **ai** | **docvqa** | **arxivqa** | **tatdqa** | **infovqa** | **tabfquad** |
|--------------------:|---------:|-----------------:|---------------:|---------------:|-----------:|-----------:|-----------:|------------:|-----------:|------------:|-------------:|
| dse-qwen2-2b-mrl-v1 | 83.6 | 79.8 | **95.7** | **96.9** | **92** | 98.2 | 56.3 | **85.2** | **53.9** | **87.5** | 90.3 |
| vdr-2b-multi-v1 | **84.0** | **82.4** | 95.5 | 96.5 | 91.2 | **98.5** | **58.5** | 84.7 | 53.6 | 87.1 | **92.2** |

16
added_tokens.json Normal file
@@ -0,0 +1,16 @@
{
"<|box_end|>": 151649,
"<|box_start|>": 151648,
"<|endoftext|>": 151643,
"<|im_end|>": 151645,
"<|im_start|>": 151644,
"<|image_pad|>": 151655,
"<|object_ref_end|>": 151647,
"<|object_ref_start|>": 151646,
"<|quad_end|>": 151651,
"<|quad_start|>": 151650,
"<|video_pad|>": 151656,
"<|vision_end|>": 151653,
"<|vision_pad|>": 151654,
"<|vision_start|>": 151652
}

3
chat_template.json Normal file
@@ -0,0 +1,3 @@
{
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
}

48
config.json Normal file
@@ -0,0 +1,48 @@
{
"_name_or_path": "MrLight/dse-qwen2-2b-mrl-v1",
"architectures": [
"Qwen2VLForConditionalGeneration"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 1536,
"image_token_id": 151655,
"initializer_range": 0.02,
"intermediate_size": 8960,
"max_position_embeddings": 32768,
"max_window_layers": 28,
"model_type": "qwen2_vl",
"num_attention_heads": 12,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_section": [
16,
24,
24
],
"rope_type": "default",
"type": "default"
},
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.47.1",
"use_cache": true,
"use_sliding_window": false,
"video_token_id": 151656,
"vision_config": {
"hidden_size": 1536,
"in_chans": 3,
"model_type": "qwen2_vl",
"spatial_patch_size": 14
},
"vision_end_token_id": 151653,
"vision_start_token_id": 151652,
"vision_token_id": 151654,
"vocab_size": 151936
}

13
config_sentence_transformers.json Normal file
@@ -0,0 +1,13 @@
{
"__version__": {
"sentence_transformers": "3.3.0",
"transformers": "4.46.2",
"pytorch": "2.2.2"
},
"prompts":{
"image": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>",
"query": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
},
"default_prompt_name": null,
"similarity_fn_name": "cosine"
}

BIN
cover.png Normal file

Binary file not shown. Size: 205 KiB

314
custom_st.py Normal file
@@ -0,0 +1,314 @@
import base64
import json
import os
import math
from io import BytesIO
from typing import Any, Dict, List, Literal, Optional, Union
from urllib.parse import urlparse
import requests
import torch
from PIL import Image
from torch import nn
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
class Transformer(nn.Module):
save_in_root: bool = True
def __init__(
self,
model_name_or_path: str = 'llamaindex/vdr-2b-multi-v1',
processor_name_or_path: Optional[str] = None,
max_pixels: int = 768 * 28 * 28,
min_pixels: int = 1 * 28 * 28,
dimension: int = 2048,
max_seq_length: Optional[int] = None,
model_args: Optional[Dict[str, Any]] = None,
processor_args: Optional[Dict[str, Any]] = None,
tokenizer_args: Optional[Dict[str, Any]] = None,
config_args: Optional[Dict[str, Any]] = None,
cache_dir: Optional[str] = None,
backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
**kwargs,
) -> None:
super(Transformer, self).__init__()
if backend != 'torch':
raise ValueError(
f'Backend \'{backend}\' is not supported, please use \'torch\' instead'
)
self.dimension = dimension
self.max_pixels = max_pixels
self.min_pixels = min_pixels
self.max_seq_length = max_seq_length
# Handle args
model_kwargs = model_args or {}
model_kwargs.update(kwargs)
processor_kwargs = processor_args or {}
processor_kwargs.update({
'min_pixels': min_pixels,
'max_pixels': max_pixels,
'cache_dir': cache_dir
})
# Initialize model
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
cache_dir=cache_dir,
**model_kwargs
).eval()
# Initialize processor
self.processor = AutoProcessor.from_pretrained(
processor_name_or_path or model_name_or_path,
**processor_kwargs
)
# Set padding sides
self.model.padding_side = "left"
self.processor.tokenizer.padding_side = "left"
# Store prompts
self.document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
self.query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
# Try to infer max_seq_length if not provided
if self.max_seq_length is None:
if (
hasattr(self.model, 'config')
and hasattr(self.model.config, 'max_position_embeddings')
and hasattr(self.processor.tokenizer, 'model_max_length')
):
self.max_seq_length = min(
self.model.config.max_position_embeddings,
self.processor.tokenizer.model_max_length,
)
def _smart_resize(self, height: int, width: int) -> tuple[int, int]:
h_bar = max(28, self._round_by_factor(height, 28))
w_bar = max(28, self._round_by_factor(width, 28))
if h_bar * w_bar > self.max_pixels:
beta = math.sqrt((height * width) / self.max_pixels)
h_bar = self._floor_by_factor(height / beta, 28)
w_bar = self._floor_by_factor(width / beta, 28)
elif h_bar * w_bar < self.min_pixels:
beta = math.sqrt(self.min_pixels / (height * width))
h_bar = self._ceil_by_factor(height * beta, 28)
w_bar = self._ceil_by_factor(width * beta, 28)
return w_bar, h_bar
@staticmethod
def _round_by_factor(number: float, factor: int) -> int:
return round(number / factor) * factor
@staticmethod
def _ceil_by_factor(number: float, factor: int) -> int:
return math.ceil(number / factor) * factor
@staticmethod
def _floor_by_factor(number: float, factor: int) -> int:
return math.floor(number / factor) * factor
def _resize_image(self, image: Image.Image) -> Image.Image:
new_size = self._smart_resize(image.height, image.width)
return image.resize(new_size)
@staticmethod
def _decode_data_image(data_image_str: str) -> Image.Image:
header, data = data_image_str.split(',', 1)
image_data = base64.b64decode(data)
return Image.open(BytesIO(image_data))
@staticmethod
def _is_valid_url(url: str) -> bool:
try:
result = urlparse(url)
# Check if scheme and netloc are present and scheme is http/https
return all([result.scheme in ('http', 'https'), result.netloc])
except Exception:
return False
@staticmethod
def _is_safe_path(path: str) -> bool:
try:
# Convert to absolute path and normalize
abs_path = os.path.abspath(os.path.normpath(path))
# Check if file exists and is a regular file (not a directory or special file)
return os.path.isfile(abs_path)
except Exception:
return False
@staticmethod
def _load_image_from_url(url: str) -> Image.Image:
try:
response = requests.get(
url,
stream=True,
timeout=10, # Add timeout
headers={'User-Agent': 'Mozilla/5.0'} # Add user agent
)
response.raise_for_status()
# Check content type
content_type = response.headers.get('content-type', '')
if not content_type.startswith('image/'):
raise ValueError(f"Invalid content type: {content_type}")
# Limit file size (e.g., 10MB)
content = BytesIO()
size = 0
max_size = 10 * 1024 * 1024 # 10MB
for chunk in response.iter_content(chunk_size=8192):
size += len(chunk)
if size > max_size:
raise ValueError("File too large")
content.write(chunk)
content.seek(0)
return Image.open(content)
except Exception as e:
raise ValueError(f"Failed to load image from URL: {str(e)}")
@staticmethod
def _load_image_from_path(image_path: str) -> Image.Image:
try:
# Convert to absolute path and normalize
abs_path = os.path.abspath(os.path.normpath(image_path))
# Check file size before loading
file_size = os.path.getsize(abs_path)
max_size = 10 * 1024 * 1024 # 10MB
if file_size > max_size:
raise ValueError("File too large")
with Image.open(abs_path) as img:
# Make a copy to ensure file handle is closed
return img.copy()
except Exception as e:
raise ValueError(f"Failed to load image from path: {str(e)}")
@staticmethod
def _load_image_from_bytes(image_bytes: bytes) -> Image.Image:
try:
# Check size
if len(image_bytes) > 10 * 1024 * 1024: # 10MB
raise ValueError("Image data too large")
return Image.open(BytesIO(image_bytes))
except Exception as e:
raise ValueError(f"Failed to load image from bytes: {str(e)}")
def _process_input(self, texts: List[Union[str, Image.Image, bytes]]) -> tuple[List[str], List[Image.Image]]:
processed_texts = []
processed_images = []
dummy_image = Image.new('RGB', (56, 56))
for sample in texts:
if isinstance(sample, str):
# Check if the string is a valid URL
if self._is_valid_url(sample):
try:
img = self._load_image_from_url(sample)
processed_texts.append(self.document_prompt)
processed_images.append(self._resize_image(img))
except Exception as e:
# If URL loading fails, treat as regular text
processed_texts.append(self.query_prompt % sample)
processed_images.append(dummy_image)
# Check if the string is a valid file path
elif self._is_safe_path(sample):
try:
img = self._load_image_from_path(sample)
processed_texts.append(self.document_prompt)
processed_images.append(self._resize_image(img))
except Exception as e:
# If image loading fails, treat as regular text
processed_texts.append(self.query_prompt % sample)
processed_images.append(dummy_image)
else:
# Regular text query
processed_texts.append(self.query_prompt % sample)
processed_images.append(dummy_image)
elif isinstance(sample, Image.Image):
processed_texts.append(self.document_prompt)
processed_images.append(self._resize_image(sample))
elif isinstance(sample, bytes):
try:
img = self._load_image_from_bytes(sample)
processed_texts.append(self.document_prompt)
processed_images.append(self._resize_image(img))
except Exception as e:
# If bytes can't be converted to image, use dummy
processed_texts.append(self.document_prompt)
processed_images.append(dummy_image)
return processed_texts, processed_images
def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
cache_position = torch.arange(0, features['input_ids'].shape[1])
inputs = self.model.prepare_inputs_for_generation(
**features, cache_position=cache_position, use_cache=False
)
# ensure inputs are on the same device as the model
device = next(self.model.parameters()).device
inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
with torch.no_grad():
output = self.model(
**inputs,
return_dict=True,
output_hidden_states=True
)
embeddings = output.hidden_states[-1][:, -1]
features['sentence_embedding'] = torch.nn.functional.normalize(
embeddings[:, :self.dimension], p=2, dim=-1
)
return features
def tokenize(self, texts: List[Union[str, Image.Image, bytes]], padding: str = 'longest') -> Dict[str, torch.Tensor]:
processed_texts, processed_images = self._process_input(texts)
return self.processor(
text=processed_texts,
images=processed_images,
videos=None,
padding=padding,
return_tensors='pt'
)
def save(self, output_path: str, safe_serialization: bool = True) -> None:
"""Save the model, tokenizer and processor to the given path."""
self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
self.processor.save_pretrained(output_path)
# Save the configuration
config = {
'model_name_or_path': output_path,
'max_pixels': self.max_pixels,
'min_pixels': self.min_pixels,
'dimension': self.dimension,
'max_seq_length': self.max_seq_length,
}
config_path = os.path.join(output_path, 'sentence_bert_config.json')
with open(config_path, 'w') as f:
json.dump(config, f)
@staticmethod
def load(input_path: str) -> 'Transformer':
"""Load a saved model from the given path."""
# Load configuration
config_path = os.path.join(input_path, 'sentence_bert_config.json')
if os.path.exists(config_path):
with open(config_path) as f:
config = json.load(f)
else:
config = {'model_name_or_path': input_path}
return Transformer(**config)

14
generation_config.json Normal file
@@ -0,0 +1,14 @@
{
"attn_implementation": "flash_attention_2",
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"temperature": 0.01,
"top_k": 1,
"top_p": 0.001,
"transformers_version": "4.47.1"
}

151388
merges.txt Normal file

File diff suppressed because it is too large.

BIN
model.safetensors (Stored with Git LFS) Normal file

Binary file not shown.

18
modules.json Normal file
@@ -0,0 +1,18 @@
[
{
"idx": 0,
"name": "transformer",
"path": "",
"type": "custom_st.Transformer",
"model_name_or_path": "llamaindex/vdr-2b-multi-v1",
"dimension": 2048,
"max_pixels": 602112,
"min_pixels": 784
},
{
"idx": 1,
"name": "normalizer",
"path": "1_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

BIN
ndcgtop.png Normal file

Binary file not shown. Size: 46 KiB

29
preprocessor_config.json Normal file
@@ -0,0 +1,29 @@
{
"do_convert_rgb": true,
"do_normalize": true,
"do_rescale": true,
"do_resize": true,
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_processor_type": "Qwen2VLImageProcessor",
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"max_pixels": 602112,
"merge_size": 2,
"min_pixels": 784,
"patch_size": 14,
"processor_class": "Qwen2VLProcessor",
"resample": 3,
"rescale_factor": 0.00392156862745098,
"size": {
"max_pixels": 12845056,
"min_pixels": 3136
},
"temporal_patch_size": 2
}

31
special_tokens_map.json Normal file
@@ -0,0 +1,31 @@
{
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

BIN
tokenizer.json (Stored with Git LFS) Normal file

Binary file not shown.

147
tokenizer_config.json Normal file
@@ -0,0 +1,147 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"errors": "replace",
"extra_special_tokens": {},
"max_pixels": 602112,
"min_pixels": 784,
"model_max_length": 32768,
"pad_token": "<|endoftext|>",
"padding_side": "left",
"processor_class": "Qwen2VLProcessor",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

1
vocab.json Normal file

File diff suppressed because one or more lines are too long