commit message
This commit is contained in:
parent
9985771c63
commit
0b359fa0cb
|
@ -1,35 +1,32 @@
|
|||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
*.db* filter=lfs diff=lfs merge=lfs -text
|
||||
*.ark* filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
||||
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
||||
|
|
203
README.md
203
README.md
|
@ -1,3 +1,202 @@
|
|||
# ViT_image_classification
|
||||
---
|
||||
tasks:
|
||||
- image-classification
|
||||
model-type:
|
||||
- Transformer
|
||||
domain:
|
||||
- cv
|
||||
frameworks:
|
||||
- pytorch
|
||||
backbone:
|
||||
- ViT-Base
|
||||
metrics:
|
||||
- accuracy
|
||||
finetune-support: True
|
||||
customized-quickstart: True
|
||||
integrating: False
|
||||
license: Apache License 2.0
|
||||
tags:
|
||||
- Alibaba
|
||||
- Image classification
|
||||
- Transformer
|
||||
- Dailylife tags
|
||||
datasets:
|
||||
evaluation:
|
||||
- online dataset
|
||||
train:
|
||||
- tany0699/dailytags
|
||||
test:
|
||||
- tany0699/dailytags
|
||||
|
||||
ViT是2020年Google团队提出的将Transformer应用在图像分类的模型,虽然不是第一篇将Transformer应用在视觉任务的论文,但是因为其模型“简单”且效果好,可扩展性强(scalable,模型越大效果越好),成为了Transformer在CV领域应用的里程碑著作,也引爆了后续相关研究。
|
||||
widgets:
|
||||
- task: image-classification
|
||||
inputs:
|
||||
- type: image
|
||||
examples:
|
||||
- name: 1
|
||||
inputs:
|
||||
- name: image
|
||||
data: git://resources/test.jpg
|
||||
---
|
||||
|
||||
# 日常物体识别模型介绍
|
||||
自建1300类常见物体标签体系,覆盖常见的日用品,动物,植物,家具,设备,食物等物体,标签从海量中文互联网社区语料进行提取,保留了出现频率较高的常见物体名称。模型结构采用最新的ViT-Base结构。
|
||||
创空间快速可视化展示: [ViT图像分类-中文-日常物品](https://modelscope.cn/studios/tany0699/cv_vit-base_image-classification_Dailylife-labels/summary)
|
||||
|
||||
本系列还有如下模型,欢迎试用:
|
||||
- [ViT图像分类-通用](https://modelscope.cn/models/damo/cv_vit-base_image-classification_ImageNet-labels/summary)
|
||||
- [NextViT实时图像分类-中文-日常物品](https://modelscope.cn/models/damo/cv_nextvit-small_image-classification_Dailylife-labels/summary)
|
||||
- [ConvNeXt图像分类-中文-垃圾分类](https://modelscope.cn/models/damo/cv_convnext-base_image-classification_garbage/summary)
|
||||
- [BEiTv2图像分类-通用-base](https://modelscope.cn/models/damo/cv_beitv2-base_image-classification_patch16_224_pt1k_ft22k_in1k/summary)
|
||||
- [BEiTv2图像分类-通用-large](https://modelscope.cn/models/damo/cv_beitv2-large_image-classification_patch16_224_pt1k_ft22k_in1k/summary)
|
||||
|
||||
## 模型描述
|
||||
|
||||
采用Transformer经典的[ViT-Base](https://github.com/google-research/vision_transformer)结构, 并采用了DeiT的知识蒸馏方式进行训练。
|
||||
<img src="./resources/overview.jpg" alt="overview"/>
|
||||
|
||||
## 期望模型使用方式以及适用范围
|
||||
|
||||
本模型适用范围较广,覆盖大部分日常生活常见的物品类目,包括日用品,动物,植物,家具,设备,食物等。也可作为下游任务的预训练backbone。
|
||||
|
||||
### 如何使用
|
||||
|
||||
在ModelScope框架上,提供输入图片,即可通过简单的Pipeline调用来使用。
|
||||
|
||||
#### 代码范例
|
||||
```python
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
img_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/bird.JPEG'
|
||||
image_classification = pipeline(Tasks.image_classification,
|
||||
model='damo/cv_vit-base_image-classification_Dailylife-labels')
|
||||
result = image_classification(img_path)
|
||||
print(result)
|
||||
```
|
||||
|
||||
### 模型局限性以及可能的偏差
|
||||
|
||||
- 支持1300类常见物体识别
|
||||
|
||||
|
||||
## 训练数据介绍
|
||||
|
||||
- 140万包含常见物体的图像集
|
||||
|
||||
|
||||
## 模型训练流程
|
||||
|
||||
- 主要训练参数参考[DeiT论文](https://arxiv.org/abs/2012.12877)的设置,仅weight decay在复现时设置为0.1;模型训练未使用pretrained参数进行初始化。
|
||||
|
||||
### 预处理
|
||||
|
||||
测试时主要的预处理如下:
|
||||
- Resize:先将原始图片的短边缩放至256
|
||||
- CenterCrop:裁切为224x224
|
||||
- Normalize:图像归一化,减均值除以标准差
|
||||
|
||||
## 数据评估及结果
|
||||
|
||||
模型在自建测试集进行测试,结果如下:
|
||||
|
||||
| Model | top-1 acc | top-5 acc | #params | Remark |
|
||||
|:--------:|:-------:|:--------:|:-------:|--------------|
|
||||
| ViT-base | 74.3 | 95.3 | 86M | modelscope |
|
||||
|
||||
|
||||
## 模型训练
|
||||
使用托管在modelscope DatasetHub上的小型数据集[mini_imagenet100](https://modelscope.cn/datasets/tany0699/mini_imagenet100/summary)进行finetune训练的示例代码:
|
||||
|
||||
```python
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.trainers import build_trainer
|
||||
import tempfile
|
||||
|
||||
model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
|
||||
|
||||
# 加载数据
|
||||
ms_train_dataset = MsDataset.load(
|
||||
'mini_imagenet100', namespace='tany0699',
|
||||
subset_name='default', split='train') # 加载训练集
|
||||
|
||||
ms_val_dataset = MsDataset.load(
|
||||
'mini_imagenet100', namespace='tany0699',
|
||||
subset_name='default', split='validation') # 加载验证集
|
||||
|
||||
tmp_dir = tempfile.mkdtemp()  # persistent temporary work dir; TemporaryDirectory().name is deleted as soon as the unreferenced object is finalized
|
||||
|
||||
# 修改配置文件
|
||||
def cfg_modify_fn(cfg):
|
||||
cfg.train.dataloader.batch_size_per_gpu = 32 # batch大小
|
||||
cfg.train.dataloader.workers_per_gpu = 2 # 每个gpu的worker数目
|
||||
cfg.train.max_epochs = 1 # 最大训练epoch数
|
||||
cfg.model.mm_model.head.num_classes = 100 # 分类数
|
||||
cfg.model.mm_model.train_cfg.augments[0].num_classes = 100 # 分类数
|
||||
cfg.model.mm_model.train_cfg.augments[1].num_classes = 100 # 分类数
|
||||
cfg.train.optimizer.lr = 1e-4 # 学习率
|
||||
cfg.train.lr_config.warmup_iters = 1 # 预热次数
|
||||
cfg.train.evaluation.metric_options = {'topk': (1, 5)} # 训练时的评估指标
|
||||
cfg.evaluation.metric_options = {'topk': (1, 5)} # 评估时的评估指标
|
||||
return cfg
|
||||
|
||||
# 构建训练器
|
||||
kwargs = dict(
|
||||
model=model_id, # 模型id
|
||||
work_dir=tmp_dir, # 工作目录
|
||||
train_dataset=ms_train_dataset, # 训练集
|
||||
eval_dataset=ms_val_dataset, # 验证集
|
||||
cfg_modify_fn=cfg_modify_fn, # 用于修改训练配置文件的回调函数
|
||||
model_revision='v1.0.2'
|
||||
)
|
||||
trainer = build_trainer(name=Trainers.image_classification, default_args=kwargs)
|
||||
|
||||
# 进行训练
|
||||
trainer.train()
|
||||
|
||||
# 进行评估
|
||||
result = trainer.evaluate()
|
||||
print('result:', result)
|
||||
```
|
||||
训练说明见示例代码中的注释,更详细的训练说明和用法见官方的[训练文档](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train)。训练过程产生的log和模型权重文件保存在work_dir工作目录中,并以前缀为'best_'的文件保存了在验证集上最优精度的权重。evaluate()默认使用精度最高的模型权重进行评估。
|
||||
|
||||
|
||||
## 模型评估
|
||||
使用训练好的模型对需要评估的数据集进行精度评估示例代码如下:
|
||||
|
||||
```python
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.trainers import build_trainer
|
||||
import tempfile
|
||||
|
||||
model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
|
||||
|
||||
# 加载用于评估的数据集
|
||||
ms_val_dataset = MsDataset.load(
|
||||
'dailytags', namespace='tany0699',
|
||||
subset_name='default', split='validation')
|
||||
|
||||
tmp_dir = tempfile.mkdtemp()  # persistent temporary work dir; TemporaryDirectory().name is deleted as soon as the unreferenced object is finalized
|
||||
|
||||
# 构建训练器
|
||||
kwargs = dict(
|
||||
model=model_id, # 模型id
|
||||
work_dir=tmp_dir, # 工作目录
|
||||
train_dataset=None,
|
||||
eval_dataset=ms_val_dataset, # 评估的数据集
|
||||
model_revision='v1.0.2'
|
||||
)
|
||||
trainer = build_trainer(name=Trainers.image_classification, default_args=kwargs)
|
||||
|
||||
# 开始评估
|
||||
result = trainer.evaluate()
|
||||
print('result:', result)
|
||||
```
|
||||
评估过程默认使用模型中自带的预训练权重进行评估。
|
||||
|
||||
#### Clone with HTTP
|
||||
```bash
|
||||
git clone https://www.modelscope.cn/damo/cv_vit-base_image-classification_Dailylife-labels.git
|
||||
```
|
|
@ -0,0 +1,354 @@
|
|||
rand_increasing_policies = [
|
||||
dict(type='AutoContrast'),
|
||||
dict(type='Equalize'),
|
||||
dict(type='Invert'),
|
||||
dict(type='Rotate', magnitude_key='angle', magnitude_range=(0, 30)),
|
||||
dict(type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)),
|
||||
dict(type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)),
|
||||
dict(
|
||||
type='SolarizeAdd',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 110)),
|
||||
dict(
|
||||
type='ColorTransform',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Brightness', magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='vertical'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='vertical')
|
||||
]
|
||||
dataset_type = 'ImageNet'
|
||||
img_norm_cfg = dict(
|
||||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='RandomResizedCrop',
|
||||
size=224,
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
||||
dict(
|
||||
type='RandAugment',
|
||||
policies=[
|
||||
dict(type='AutoContrast'),
|
||||
dict(type='Equalize'),
|
||||
dict(type='Invert'),
|
||||
dict(
|
||||
type='Rotate', magnitude_key='angle', magnitude_range=(0, 30)),
|
||||
dict(
|
||||
type='Posterize', magnitude_key='bits',
|
||||
magnitude_range=(4, 0)),
|
||||
dict(
|
||||
type='Solarize', magnitude_key='thr',
|
||||
magnitude_range=(256, 0)),
|
||||
dict(
|
||||
type='SolarizeAdd',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 110)),
|
||||
dict(
|
||||
type='ColorTransform',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Contrast',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Brightness',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Sharpness',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='vertical'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='vertical')
|
||||
],
|
||||
num_policies=2,
|
||||
total_level=10,
|
||||
magnitude_level=9,
|
||||
magnitude_std=0.5,
|
||||
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
|
||||
dict(
|
||||
type='RandomErasing',
|
||||
erase_prob=0.25,
|
||||
mode='rand',
|
||||
min_area_ratio=0.02,
|
||||
max_area_ratio=0.3333333333333333,
|
||||
fill_color=[103.53, 116.28, 123.675],
|
||||
fill_std=[57.375, 57.12, 58.395]),
|
||||
dict(
|
||||
type='Normalize',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
to_rgb=True),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='ToTensor', keys=['gt_label']),
|
||||
dict(type='Collect', keys=['img', 'gt_label'])
|
||||
]
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='Resize',
|
||||
size=(256, -1),
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='CenterCrop', crop_size=224),
|
||||
dict(
|
||||
type='Normalize',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
to_rgb=True),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
]
|
||||
data = dict(
|
||||
samples_per_gpu=32,
|
||||
workers_per_gpu=16,
|
||||
train=dict(
|
||||
type='ImageNet',
|
||||
data_prefix='/data/oss_bucket_0/',
|
||||
pipeline=[
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='RandomResizedCrop',
|
||||
size=224,
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
||||
dict(
|
||||
type='RandAugment',
|
||||
policies=[
|
||||
dict(type='AutoContrast'),
|
||||
dict(type='Equalize'),
|
||||
dict(type='Invert'),
|
||||
dict(
|
||||
type='Rotate',
|
||||
magnitude_key='angle',
|
||||
magnitude_range=(0, 30)),
|
||||
dict(
|
||||
type='Posterize',
|
||||
magnitude_key='bits',
|
||||
magnitude_range=(4, 0)),
|
||||
dict(
|
||||
type='Solarize',
|
||||
magnitude_key='thr',
|
||||
magnitude_range=(256, 0)),
|
||||
dict(
|
||||
type='SolarizeAdd',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 110)),
|
||||
dict(
|
||||
type='ColorTransform',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Contrast',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Brightness',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Sharpness',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.9)),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Shear',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.3),
|
||||
direction='vertical'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='horizontal'),
|
||||
dict(
|
||||
type='Translate',
|
||||
magnitude_key='magnitude',
|
||||
magnitude_range=(0, 0.45),
|
||||
direction='vertical')
|
||||
],
|
||||
num_policies=2,
|
||||
total_level=10,
|
||||
magnitude_level=9,
|
||||
magnitude_std=0.5,
|
||||
hparams=dict(pad_val=[104, 116, 124],
|
||||
interpolation='bicubic')),
|
||||
dict(
|
||||
type='RandomErasing',
|
||||
erase_prob=0.25,
|
||||
mode='rand',
|
||||
min_area_ratio=0.02,
|
||||
max_area_ratio=0.3333333333333333,
|
||||
fill_color=[103.53, 116.28, 123.675],
|
||||
fill_std=[57.375, 57.12, 58.395]),
|
||||
dict(
|
||||
type='Normalize',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
to_rgb=True),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='ToTensor', keys=['gt_label']),
|
||||
dict(type='Collect', keys=['img', 'gt_label'])
|
||||
],
|
||||
ann_file='/data/oss_bucket_0/virgo_data/dailytags/train_mmcls.txt',
|
||||
classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt'),
|
||||
val=dict(
|
||||
type='ImageNet',
|
||||
data_prefix='/data/oss_bucket_0/',
|
||||
ann_file='/data/oss_bucket_0/virgo_data/dailytags/val_mmcls.txt',
|
||||
pipeline=[
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='Resize',
|
||||
size=(256, -1),
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='CenterCrop', crop_size=224),
|
||||
dict(
|
||||
type='Normalize',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
to_rgb=True),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
],
|
||||
classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt'),
|
||||
test=dict(
|
||||
type='ImageNet',
|
||||
data_prefix='/data/oss_bucket_0/',
|
||||
ann_file='/data/oss_bucket_0/virgo_data/dailytags/val_mmcls.txt',
|
||||
pipeline=[
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='Resize',
|
||||
size=(256, -1),
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='CenterCrop', crop_size=224),
|
||||
dict(
|
||||
type='Normalize',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
to_rgb=True),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
],
|
||||
classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt'))
|
||||
evaluation = dict(interval=2, metric='accuracy', save_best='auto')
|
||||
paramwise_cfg = dict(
|
||||
norm_decay_mult=0.0,
|
||||
bias_decay_mult=0.0,
|
||||
custom_keys=dict({
|
||||
'.absolute_pos_embed': dict(decay_mult=0.0),
|
||||
'.relative_position_bias_table': dict(decay_mult=0.0),
|
||||
'.cls_token': dict(decay_mult=0.0),
|
||||
'.pos_embed': dict(decay_mult=0.0)
|
||||
}))
|
||||
optimizer = dict(
|
||||
type='AdamW',
|
||||
lr=0.001,
|
||||
weight_decay=0.1,
|
||||
eps=1e-08,
|
||||
betas=(0.9, 0.999),
|
||||
paramwise_cfg=dict(
|
||||
norm_decay_mult=0.0,
|
||||
bias_decay_mult=0.0,
|
||||
custom_keys=dict({
|
||||
'.absolute_pos_embed': dict(decay_mult=0.0),
|
||||
'.relative_position_bias_table': dict(decay_mult=0.0),
|
||||
'.cls_token': dict(decay_mult=0.0),
|
||||
'.pos_embed': dict(decay_mult=0.0)
|
||||
})))
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=5.0))
|
||||
lr_config = dict(
|
||||
policy='CosineAnnealing',
|
||||
by_epoch=False,
|
||||
min_lr_ratio=0.01,
|
||||
warmup='linear',
|
||||
warmup_ratio=0.001,
|
||||
warmup_iters=20,
|
||||
warmup_by_epoch=True)
|
||||
runner = dict(type='EpochBasedRunner', max_epochs=300)
|
||||
checkpoint_config = dict(interval=1, max_keep_ckpts=20, create_symlink=False)
|
||||
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
|
||||
dist_params = dict(backend='nccl')
|
||||
log_level = 'INFO'
|
||||
load_from = None
|
||||
resume_from = '/data/oss_bucket_1/emian/VisualContentRecognition/mmlab/work_dirs/dailytags_deit-base_pt-16xb64_in1k_gzn2_pai_wd0.1/epoch_296.pth'
|
||||
workflow = [('train', 1)]
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='VisionTransformer',
|
||||
arch='deit-base',
|
||||
img_size=224,
|
||||
patch_size=16,
|
||||
drop_path_rate=0.1),
|
||||
neck=None,
|
||||
head=dict(
|
||||
type='VisionTransformerClsHead',
|
||||
num_classes=1296,
|
||||
in_channels=768,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original')),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02),
|
||||
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1296, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1296, prob=0.5)
|
||||
]))
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-05, priority='ABOVE_NORMAL')]
|
||||
work_dir = '/data/oss_bucket_1/emian/VisualContentRecognition/mmlab/work_dirs/dailytags_deit-base_pt-16xb64_in1k_gzn2_pai_wd0.1/'
|
||||
gpu_ids = range(0, 32)
|
|
@ -0,0 +1,332 @@
|
|||
{
|
||||
"framework":"pytorch",
|
||||
"task":"image-classification",
|
||||
"pipeline":{
|
||||
"type":"vit-base_image-classification_Dailylife-labels"
|
||||
},
|
||||
"model":{
|
||||
"type": "ClassificationModel",
|
||||
"mm_model": {
|
||||
"type": "ImageClassifier",
|
||||
"pretrained": null,
|
||||
"backbone": {
|
||||
"type": "VisionTransformer",
|
||||
"arch": "deit-base",
|
||||
"img_size": 224,
|
||||
"patch_size": 16,
|
||||
"drop_path_rate": 0.1
|
||||
},
|
||||
"neck": null,
|
||||
"head": {
|
||||
"type": "VisionTransformerClsHead",
|
||||
"num_classes": 1296,
|
||||
"in_channels": 768,
|
||||
"loss": {
|
||||
"type": "LabelSmoothLoss",
|
||||
"label_smooth_val": 0.1,
|
||||
"mode": "original"
|
||||
}
|
||||
},
|
||||
"init_cfg": [
|
||||
{
|
||||
"type": "TruncNormal",
|
||||
"layer": "Linear",
|
||||
"std": 0.02
|
||||
},
|
||||
{
|
||||
"type": "Constant",
|
||||
"layer": "LayerNorm",
|
||||
"val": 1.0,
|
||||
"bias": 0.0
|
||||
}
|
||||
],
|
||||
"train_cfg":
|
||||
{
|
||||
"augments": [
|
||||
{
|
||||
"type": "BatchMixup",
|
||||
"alpha": 0.8,
|
||||
"num_classes": 1296,
|
||||
"prob": 0.5
|
||||
},
|
||||
{
|
||||
"type": "BatchCutMix",
|
||||
"alpha": 1.0,
|
||||
"num_classes": 1296,
|
||||
"prob": 0.5
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"dataset": {
|
||||
"classes": null
|
||||
},
|
||||
|
||||
"preprocessor": {
|
||||
"type": "image-classification-bypass-preprocessor",
|
||||
"train": [
|
||||
{
|
||||
"type": "LoadImageFromFile"
|
||||
},
|
||||
{
|
||||
"type": "RandomResizedCrop",
|
||||
"size": 224,
|
||||
"backend": "pillow",
|
||||
"interpolation": "bicubic"
|
||||
},
|
||||
{
|
||||
"type": "RandomFlip",
|
||||
"flip_prob": 0.5,
|
||||
"direction": "horizontal"
|
||||
},
|
||||
{
|
||||
"type": "RandAugment",
|
||||
"policies": [
|
||||
{
|
||||
"type": "AutoContrast"
|
||||
},
|
||||
{
|
||||
"type": "Equalize"
|
||||
},
|
||||
{
|
||||
"type": "Invert"
|
||||
},
|
||||
{
|
||||
"type": "Rotate",
|
||||
"magnitude_key": "angle",
|
||||
"magnitude_range": [0, 30]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Posterize",
|
||||
"magnitude_key": "bits",
|
||||
"magnitude_range": [4, 0]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Solarize",
|
||||
"magnitude_key": "thr",
|
||||
"magnitude_range": [256, 0]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "SolarizeAdd",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 110]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "ColorTransform",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.9]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Contrast",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.9]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Brightness",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.9]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Sharpness",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.9]
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Shear",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.3],
|
||||
"direction": "horizontal"
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Shear",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.3],
|
||||
"direction": "vertical"
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Translate",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.45],
|
||||
"direction": "horizontal"
|
||||
|
||||
},
|
||||
{
|
||||
"type": "Translate",
|
||||
"magnitude_key": "magnitude",
|
||||
"magnitude_range": [0, 0.45],
|
||||
"direction": "vertical"
|
||||
|
||||
}
|
||||
],
|
||||
"num_policies": 2,
|
||||
"total_level": 10,
|
||||
"magnitude_level": 9,
|
||||
"magnitude_std": 0.5,
|
||||
"hparams": {
|
||||
"pad_val": [104, 116, 124],
|
||||
"interpolation": "bicubic"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "RandomErasing",
|
||||
"erase_prob": 0.25,
|
||||
"mode": "rand",
|
||||
"min_area_ratio": 0.02,
|
||||
"max_area_ratio": 0.3333333333333333,
|
||||
"fill_color": [103.53, 116.28, 123.675],
|
||||
"fill_std": [57.375, 57.12, 58.395]
|
||||
},
|
||||
{
|
||||
"type": "Normalize",
|
||||
"mean": [123.675, 116.28, 103.53],
|
||||
"std": [58.395, 57.12, 57.375],
|
||||
"to_rgb": true
|
||||
},
|
||||
{
|
||||
"type": "ImageToTensor",
|
||||
"keys": ["img"]
|
||||
},
|
||||
{
|
||||
"type": "ToTensor",
|
||||
"keys": ["gt_label"]
|
||||
},
|
||||
{
|
||||
"type": "Collect",
|
||||
"keys": ["img", "gt_label"]
|
||||
}
|
||||
],
|
||||
|
||||
"val": [
|
||||
{
|
||||
"type": "LoadImageFromFile"
|
||||
},
|
||||
{
|
||||
"type": "Resize",
|
||||
"size": [256, -1],
|
||||
"backend": "pillow",
|
||||
"interpolation": "bicubic"
|
||||
},
|
||||
{
|
||||
"type": "CenterCrop",
|
||||
"crop_size": 224
|
||||
},
|
||||
{
|
||||
"type": "Normalize",
|
||||
"mean": [123.675, 116.28, 103.53],
|
||||
"std": [58.395, 57.12, 57.375],
|
||||
"to_rgb": true
|
||||
},
|
||||
{
|
||||
"type": "ImageToTensor",
|
||||
"keys": ["img"]
|
||||
},
|
||||
{
|
||||
"type": "Collect",
|
||||
"keys": ["img"]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"train": {
|
||||
"dataloader": {
|
||||
"batch_size_per_gpu": 32,
|
||||
"workers_per_gpu": 4
|
||||
},
|
||||
"max_epochs": 1,
|
||||
"runner": {
|
||||
"type": "EpochBasedRunner",
|
||||
"max_epochs": 300
|
||||
},
|
||||
"evaluation": {
|
||||
"interval": 1,
|
||||
"metric": "accuracy",
|
||||
"save_best": "auto"
|
||||
},
|
||||
"checkpoint_config": {
|
||||
"interval": 1,
|
||||
"max_keep_ckpts": 20,
|
||||
"create_symlink": true
|
||||
},
|
||||
"log_config": {
|
||||
"interval": 100,
|
||||
"hooks": [
|
||||
{
|
||||
"type": "TextLoggerHook"
|
||||
}
|
||||
]
|
||||
},
|
||||
"custom_hooks": [
|
||||
{
|
||||
"type": "EMAHook",
|
||||
"momentum": 4e-05,
|
||||
"priority": "ABOVE_NORMAL"
|
||||
}
|
||||
],
|
||||
"workflow": [
|
||||
["train", 1]
|
||||
],
|
||||
"work_dir": "./work_dir",
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"lr": 0.001,
|
||||
"weight_decay": 0.1,
|
||||
"eps": 1e-08,
|
||||
"betas": [0.9, 0.999],
|
||||
"paramwise_cfg": {
|
||||
"norm_decay_mult": 0.0,
|
||||
"bias_decay_mult": 0.0,
|
||||
"custom_keys": {
|
||||
".absolute_pos_embed": {
|
||||
"decay_mult": 0.0
|
||||
},
|
||||
".relative_position_bias_table": {
|
||||
"decay_mult": 0.0
|
||||
},
|
||||
".cls_token": {
|
||||
"decay_mult": 0.0
|
||||
},
|
||||
".pos_embed": {
|
||||
"decay_mult": 0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"optimizer_config": {
|
||||
"grad_clip": {
|
||||
"max_norm": 5.0
|
||||
}
|
||||
},
|
||||
"lr_config": {
|
||||
"policy": "CosineAnnealing",
|
||||
"by_epoch": false,
|
||||
"min_lr_ratio": 0.01,
|
||||
"warmup": "linear",
|
||||
"warmup_ratio": 0.001,
|
||||
"warmup_iters": 20,
|
||||
"warmup_by_epoch": true
|
||||
}
|
||||
},
|
||||
|
||||
"evaluation": {
|
||||
"dataloader": {
|
||||
"batch_size_per_gpu": 32,
|
||||
"workers_per_gpu": 4
|
||||
},
|
||||
"metrics": ["accuracy"],
|
||||
"metric_options": {
|
||||
"topk": [1, 5]
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 323 KiB |
Binary file not shown.
After Width: | Height: | Size: 68 KiB |
Loading…
Reference in New Issue