diff --git a/.gitattributes b/.gitattributes
index a6344aa..97c8e80 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,32 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 8b1bca6..f3a6632 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,202 @@
-# ViT_image_classification
+---
+tasks:
+- image-classification
+model-type:
+- Transformer
+domain:
+- cv
+frameworks:
+- pytorch
+backbone:
+- ViT-Base
+metrics:
+- accuracy
+finetune-support: True
+customized-quickstart: True
+integrating: False
+license: Apache License 2.0
+tags:
+- Alibaba
+- Image classification
+- Transformer
+- Dailylife tags
+datasets:
+  evaluation:
+  - online dataset
+  train:
+  - tany0699/dailytags
+  test:
+  - tany0699/dailytags
-ViT, proposed by a Google team in 2020, applies the Transformer to image classification. It was not the first paper to bring Transformers to vision tasks, but its "simple", effective, and highly scalable design (the larger the model, the better the results) made it a milestone for Transformers in CV and set off a wave of follow-up research.
+widgets:
+  - task: image-classification
+    inputs:
+      - type: image
+    examples:
+      - name: 1
+        inputs:
+          - name: image
+            data: git://resources/test.jpg
+---
+
+# Daily Object Recognition Model
+The model uses a self-built taxonomy of 1,300 common object labels covering everyday items, animals, plants, furniture, appliances, food, and more. The labels were extracted from large-scale Chinese internet community corpora, keeping the names of the most frequently occurring common objects. The architecture is ViT-Base.
+Quick interactive demo on ModelScope Studio: [ViT Image Classification - Chinese - Daily Objects](https://modelscope.cn/studios/tany0699/cv_vit-base_image-classification_Dailylife-labels/summary)
+
+Other models in this series are also available:
+- [ViT Image Classification - General](https://modelscope.cn/models/damo/cv_vit-base_image-classification_ImageNet-labels/summary)
+- [NextViT Real-Time Image Classification - Chinese - Daily Objects](https://modelscope.cn/models/damo/cv_nextvit-small_image-classification_Dailylife-labels/summary)
+- [ConvNeXt Image Classification - Chinese - Garbage Classification](https://modelscope.cn/models/damo/cv_convnext-base_image-classification_garbage/summary)
+- [BEiTv2 Image Classification - General - base](https://modelscope.cn/models/damo/cv_beitv2-base_image-classification_patch16_224_pt1k_ft22k_in1k/summary)
+- [BEiTv2 Image Classification - General - large](https://modelscope.cn/models/damo/cv_beitv2-large_image-classification_patch16_224_pt1k_ft22k_in1k/summary)
+
+## Model description
+
+The model adopts the classic [ViT-Base](https://github.com/google-research/vision_transformer) Transformer architecture and is trained with DeiT-style knowledge distillation.
+![overview](resources/overview.jpg)
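+
+For orientation, the following is a minimal, illustrative PyTorch sketch of the core ViT idea (patch embedding, a [CLS] token, and a Transformer encoder). It is not the shipped implementation, which is mmclassification's `VisionTransformer` with `arch='deit-base'` (see `config.py` in this repository); `TinyViT` and its exact layer choices are assumptions made for the sketch only.
+
+```python
+import torch
+import torch.nn as nn
+
+class TinyViT(nn.Module):
+    """Illustrative ViT-style classifier; not the actual model code."""
+
+    def __init__(self, img_size=224, patch_size=16, dim=768, depth=12,
+                 heads=12, num_classes=1296):
+        super().__init__()
+        num_patches = (img_size // patch_size) ** 2  # 14 * 14 = 196
+        # Patch embedding: a strided conv cuts the image into 16x16 patches
+        # and projects each patch to a `dim`-dimensional token.
+        self.patch_embed = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size)
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, dim))
+        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, batch_first=True)
+        self.encoder = nn.TransformerEncoder(layer, num_layers=depth)
+        self.head = nn.Linear(dim, num_classes)  # classifier on the [CLS] token
+
+    def forward(self, x):                     # x: (B, 3, 224, 224)
+        x = self.patch_embed(x)               # (B, dim, 14, 14)
+        x = x.flatten(2).transpose(1, 2)      # (B, 196, dim): sequence of patch tokens
+        cls = self.cls_token.expand(x.size(0), -1, -1)
+        x = torch.cat([cls, x], dim=1) + self.pos_embed
+        x = self.encoder(x)
+        return self.head(x[:, 0])             # logits read off the [CLS] token
+
+print(TinyViT()(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1296])
+```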
+## Intended use and scope
+
+The model has broad coverage, spanning most object categories seen in daily life, including everyday items, animals, plants, furniture, appliances, and food. It can also serve as a pretrained backbone for downstream tasks.
+
+### How to use
+
+Within the ModelScope framework, provide an input image and the model can be used through a simple pipeline call.
+
+#### Code example
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+img_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/bird.JPEG'
+image_classification = pipeline(Tasks.image_classification,
+                                model='damo/cv_vit-base_image-classification_Dailylife-labels')
+result = image_classification(img_path)
+print(result)
+```
+
+### Limitations and possible biases
+
+- Recognition is limited to the 1,300 supported categories of common objects
+
+
+## Training data
+
+- 1.4 million images containing common objects
+
+
+## Training procedure
+
+- The main training hyperparameters follow the settings of the [DeiT paper](https://arxiv.org/abs/2012.12877), except that weight decay was set to 0.1 in our reproduction; the model was trained without initialization from pretrained weights.
+
+### Preprocessing
+
+The main test-time preprocessing steps are as follows (see the sketch after this list):
+- Resize: scale the short side of the input image to 256
+- CenterCrop: center-crop to 224x224
+- Normalize: subtract the channel means and divide by the standard deviations
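+
+A rough torchvision equivalent of these steps, shown only for illustration (the pipeline itself runs the mmcls transforms defined in `configuration.json`; the mean/std values below are that config's 0-255 values rescaled to [0, 1]):
+
+```python
+from PIL import Image
+import torchvision.transforms as T
+
+# Approximate test-time preprocessing: short side -> 256, center crop 224, normalize.
+preprocess = T.Compose([
+    T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
+    T.CenterCrop(224),
+    T.ToTensor(),  # converts to a float tensor in [0, 1]
+    T.Normalize(mean=[123.675 / 255, 116.28 / 255, 103.53 / 255],
+                std=[58.395 / 255, 57.12 / 255, 57.375 / 255]),
+])
+tensor = preprocess(Image.open('test.jpg').convert('RGB'))  # shape (3, 224, 224)
+```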
+
+## Evaluation results
+
+Results on our self-built test set:
+
+| Model | top-1 acc (%) | top-5 acc (%) | #params | Remark |
+|:--------:|:-------:|:--------:|:-------:|--------------|
+| ViT-base | 74.3 | 95.3 | 86M | modelscope |
+
+
+## Model training
+Example code for fine-tuning on [mini_imagenet100](https://modelscope.cn/datasets/tany0699/mini_imagenet100/summary), a small dataset hosted on the ModelScope DatasetHub:
+
+```python
+from modelscope.msdatasets import MsDataset
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+import tempfile
+
+model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
+
+# Load the data
+ms_train_dataset = MsDataset.load(
+    'mini_imagenet100', namespace='tany0699',
+    subset_name='default', split='train')  # training split
+
+ms_val_dataset = MsDataset.load(
+    'mini_imagenet100', namespace='tany0699',
+    subset_name='default', split='validation')  # validation split
+
+tmp_dir = tempfile.TemporaryDirectory().name  # use a temporary directory as the work directory
+
+# Modify the training configuration
+def cfg_modify_fn(cfg):
+    cfg.train.dataloader.batch_size_per_gpu = 32  # batch size
+    cfg.train.dataloader.workers_per_gpu = 2  # dataloader workers per GPU
+    cfg.train.max_epochs = 1  # maximum number of training epochs
+    cfg.model.mm_model.head.num_classes = 100  # number of classes
+    cfg.model.mm_model.train_cfg.augments[0].num_classes = 100  # number of classes
+    cfg.model.mm_model.train_cfg.augments[1].num_classes = 100  # number of classes
+    cfg.train.optimizer.lr = 1e-4  # learning rate
+    cfg.train.lr_config.warmup_iters = 1  # warmup iterations
+    cfg.train.evaluation.metric_options = {'topk': (1, 5)}  # metrics computed during training
+    cfg.evaluation.metric_options = {'topk': (1, 5)}  # metrics computed during evaluation
+    return cfg
+
+# Build the trainer
+kwargs = dict(
+    model=model_id,  # model id
+    work_dir=tmp_dir,  # work directory
+    train_dataset=ms_train_dataset,  # training set
+    eval_dataset=ms_val_dataset,  # validation set
+    cfg_modify_fn=cfg_modify_fn,  # callback used to modify the training configuration
+    model_revision='v1.0.2'
+    )
+trainer = build_trainer(name=Trainers.image_classification, default_args=kwargs)
+
+# Train
+trainer.train()
+
+# Evaluate
+result = trainer.evaluate()
+print('result:', result)
+```
+The training setup is explained in the code comments above; for more detail, see the official [training documentation](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train). Logs and model weights produced during training are stored in the work_dir, and the weights that achieve the best validation accuracy are saved with the prefix 'best_'. evaluate() uses the most accurate weights by default.
+
+
+## Model evaluation
+Example code for evaluating the trained model on a dataset:
+
+```python
+from modelscope.msdatasets import MsDataset
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+import tempfile
+
+model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
+
+# Load the dataset used for evaluation
+ms_val_dataset = MsDataset.load(
+    'dailytags', namespace='tany0699',
+    subset_name='default', split='validation')
+
+tmp_dir = tempfile.TemporaryDirectory().name  # use a temporary directory as the work directory
+
+# Build the trainer
+kwargs = dict(
+    model=model_id,  # model id
+    work_dir=tmp_dir,  # work directory
+    train_dataset=None,
+    eval_dataset=ms_val_dataset,  # dataset to evaluate on
+    model_revision='v1.0.2'
+    )
+trainer = build_trainer(name=Trainers.image_classification, default_args=kwargs)
+
+# Run the evaluation
+result = trainer.evaluate()
+print('result:', result)
+```
+By default, evaluation uses the pretrained weights bundled with the model.
+
+#### Clone with HTTP
+```bash
+ git clone https://www.modelscope.cn/damo/cv_vit-base_image-classification_Dailylife-labels.git
+```
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..ef2266e
--- /dev/null
+++ b/config.py
@@ -0,0 +1,354 @@
+rand_increasing_policies = [
+    dict(type='AutoContrast'),
+    dict(type='Equalize'),
+    dict(type='Invert'),
+    dict(type='Rotate', magnitude_key='angle', magnitude_range=(0, 30)),
+    dict(type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)),
+    dict(type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)),
+    dict(
+        type='SolarizeAdd',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 110)),
+    dict(
+        type='ColorTransform',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.9)),
+    dict(type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
+    dict(
+        type='Brightness', magnitude_key='magnitude',
+        magnitude_range=(0, 0.9)),
+    dict(
+        type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)),
+    dict(
+        type='Shear',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.3),
+        direction='horizontal'),
+    dict(
+        type='Shear',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.3),
+        direction='vertical'),
+    dict(
+        type='Translate',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.45),
+        direction='horizontal'),
+    dict(
+        type='Translate',
+        magnitude_key='magnitude',
+        magnitude_range=(0, 0.45),
+        direction='vertical')
+]
+dataset_type = 'ImageNet'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        size=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
+    dict(
+        type='RandAugment',
+        policies=[
+            dict(type='AutoContrast'),
+            dict(type='Equalize'),
+            dict(type='Invert'),
+            dict(
+                type='Rotate', magnitude_key='angle', magnitude_range=(0, 30)),
+            dict(
+                type='Posterize', magnitude_key='bits',
+                magnitude_range=(4, 0)),
+            dict(
+                type='Solarize', magnitude_key='thr',
+                magnitude_range=(256, 0)),
+            dict(
+                type='SolarizeAdd',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 110)),
+            dict(
+                type='ColorTransform',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.9)),
+            dict(
+                type='Contrast',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.9)),
+            dict(
+                type='Brightness',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.9)),
+            dict(
+                type='Sharpness',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.9)),
+            dict(
+                type='Shear',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.3),
+                direction='horizontal'),
+            dict(
+                type='Shear',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.3),
+                direction='vertical'),
+            dict(
+                type='Translate',
+                magnitude_key='magnitude',
+                magnitude_range=(0, 0.45),
+ direction='horizontal'), + dict( + type='Translate', + magnitude_key='magnitude', + magnitude_range=(0, 0.45), + direction='vertical') + ], + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=0.3333333333333333, + fill_color=[103.53, 116.28, 123.675], + fill_std=[57.375, 57.12, 58.395]), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(256, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=224), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + samples_per_gpu=32, + workers_per_gpu=16, + train=dict( + type='ImageNet', + data_prefix='/data/oss_bucket_0/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=224, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies=[ + dict(type='AutoContrast'), + dict(type='Equalize'), + dict(type='Invert'), + dict( + type='Rotate', + magnitude_key='angle', + magnitude_range=(0, 30)), + dict( + type='Posterize', + magnitude_key='bits', + magnitude_range=(4, 0)), + dict( + type='Solarize', + magnitude_key='thr', + magnitude_range=(256, 0)), + dict( + type='SolarizeAdd', + magnitude_key='magnitude', + magnitude_range=(0, 110)), + dict( + type='ColorTransform', + magnitude_key='magnitude', + magnitude_range=(0, 0.9)), + dict( + type='Contrast', + magnitude_key='magnitude', + magnitude_range=(0, 0.9)), + dict( + type='Brightness', + magnitude_key='magnitude', + magnitude_range=(0, 0.9)), + dict( + type='Sharpness', + magnitude_key='magnitude', + magnitude_range=(0, 0.9)), + dict( + type='Shear', + magnitude_key='magnitude', + magnitude_range=(0, 0.3), + direction='horizontal'), + dict( + type='Shear', + magnitude_key='magnitude', + magnitude_range=(0, 0.3), + direction='vertical'), + dict( + type='Translate', + magnitude_key='magnitude', + magnitude_range=(0, 0.45), + direction='horizontal'), + dict( + type='Translate', + magnitude_key='magnitude', + magnitude_range=(0, 0.45), + direction='vertical') + ], + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict(pad_val=[104, 116, 124], + interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=0.3333333333333333, + fill_color=[103.53, 116.28, 123.675], + fill_std=[57.375, 57.12, 58.395]), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) + ], + ann_file='/data/oss_bucket_0/virgo_data/dailytags/train_mmcls.txt', + classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt'), + val=dict( + type='ImageNet', + data_prefix='/data/oss_bucket_0/', + 
ann_file='/data/oss_bucket_0/virgo_data/dailytags/val_mmcls.txt', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(256, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=224), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ], + classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt'), + test=dict( + type='ImageNet', + data_prefix='/data/oss_bucket_0/', + ann_file='/data/oss_bucket_0/virgo_data/dailytags/val_mmcls.txt', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(256, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=224), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ], + classes='/data/oss_bucket_0/virgo_data/dailytags/classname.txt')) +evaluation = dict(interval=2, metric='accuracy', save_best='auto') +paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys=dict({ + '.absolute_pos_embed': dict(decay_mult=0.0), + '.relative_position_bias_table': dict(decay_mult=0.0), + '.cls_token': dict(decay_mult=0.0), + '.pos_embed': dict(decay_mult=0.0) + })) +optimizer = dict( + type='AdamW', + lr=0.001, + weight_decay=0.1, + eps=1e-08, + betas=(0.9, 0.999), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys=dict({ + '.absolute_pos_embed': dict(decay_mult=0.0), + '.relative_position_bias_table': dict(decay_mult=0.0), + '.cls_token': dict(decay_mult=0.0), + '.pos_embed': dict(decay_mult=0.0) + }))) +optimizer_config = dict(grad_clip=dict(max_norm=5.0)) +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, + min_lr_ratio=0.01, + warmup='linear', + warmup_ratio=0.001, + warmup_iters=20, + warmup_by_epoch=True) +runner = dict(type='EpochBasedRunner', max_epochs=300) +checkpoint_config = dict(interval=1, max_keep_ckpts=20, create_symlink=False) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = '/data/oss_bucket_1/emian/VisualContentRecognition/mmlab/work_dirs/dailytags_deit-base_pt-16xb64_in1k_gzn2_pai_wd0.1/epoch_296.pth' +workflow = [('train', 1)] +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='deit-base', + img_size=224, + patch_size=16, + drop_path_rate=0.1), + neck=None, + head=dict( + type='VisionTransformerClsHead', + num_classes=1296, + in_channels=768, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original')), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02), + dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0) + ], + train_cfg=dict(augments=[ + dict(type='BatchMixup', alpha=0.8, num_classes=1296, prob=0.5), + dict(type='BatchCutMix', alpha=1.0, num_classes=1296, prob=0.5) + ])) +custom_hooks = [dict(type='EMAHook', momentum=4e-05, priority='ABOVE_NORMAL')] +work_dir = '/data/oss_bucket_1/emian/VisualContentRecognition/mmlab/work_dirs/dailytags_deit-base_pt-16xb64_in1k_gzn2_pai_wd0.1/' +gpu_ids = range(0, 32) diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..9bf6009 --- /dev/null +++ b/configuration.json @@ -0,0 +1,332 @@ +{ + "framework":"pytorch", + 
"task":"image-classification", + "pipeline":{ + "type":"vit-base_image-classification_Dailylife-labels" + }, + "model":{ + "type": "ClassificationModel", + "mm_model": { + "type": "ImageClassifier", + "pretrained": null, + "backbone": { + "type": "VisionTransformer", + "arch": "deit-base", + "img_size": 224, + "patch_size": 16, + "drop_path_rate": 0.1 + }, + "neck": null, + "head": { + "type": "VisionTransformerClsHead", + "num_classes": 1296, + "in_channels": 768, + "loss": { + "type": "LabelSmoothLoss", + "label_smooth_val": 0.1, + "mode": "original" + } + }, + "init_cfg": [ + { + "type": "TruncNormal", + "layer": "Linear", + "std": 0.02 + }, + { + "type": "Constant", + "layer": "LayerNorm", + "val": 1.0, + "bias": 0.0 + } + ], + "train_cfg": + { + "augments": [ + { + "type": "BatchMixup", + "alpha": 0.8, + "num_classes": 1296, + "prob": 0.5 + }, + { + "type": "BatchCutMix", + "alpha": 1.0, + "num_classes": 1296, + "prob": 0.5 + } + ] + } + } + }, + + "dataset": { + "classes": null + }, + + "preprocessor": { + "type": "image-classification-bypass-preprocessor", + "train": [ + { + "type": "LoadImageFromFile" + }, + { + "type": "RandomResizedCrop", + "size": 224, + "backend": "pillow", + "interpolation": "bicubic" + }, + { + "type": "RandomFlip", + "flip_prob": 0.5, + "direction": "horizontal" + }, + { + "type": "RandAugment", + "policies": [ + { + "type": "AutoContrast" + }, + { + "type": "Equalize" + }, + { + "type": "Invert" + }, + { + "type": "Rotate", + "magnitude_key": "angle", + "magnitude_range": [0, 30] + + }, + { + "type": "Posterize", + "magnitude_key": "bits", + "magnitude_range": [4, 0] + + }, + { + "type": "Solarize", + "magnitude_key": "thr", + "magnitude_range": [256, 0] + + }, + { + "type": "SolarizeAdd", + "magnitude_key": "magnitude", + "magnitude_range": [0, 110] + + }, + { + "type": "ColorTransform", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.9] + + }, + { + "type": "Contrast", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.9] + + }, + { + "type": "Brightness", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.9] + + }, + { + "type": "Sharpness", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.9] + + }, + { + "type": "Shear", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.3], + "direction": "horizontal" + + }, + { + "type": "Shear", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.3], + "direction": "vertical" + + }, + { + "type": "Translate", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.45], + "direction": "horizontal" + + }, + { + "type": "Translate", + "magnitude_key": "magnitude", + "magnitude_range": [0, 0.45], + "direction": "vertical" + + } + ], + "num_policies": 2, + "total_level": 10, + "magnitude_level": 9, + "magnitude_std": 0.5, + "hparams": { + "pad_val": [104, 116, 124], + "interpolation": "bicubic" + } + }, + { + "type": "RandomErasing", + "erase_prob": 0.25, + "mode": "rand", + "min_area_ratio": 0.02, + "max_area_ratio": 0.3333333333333333, + "fill_color": [103.53, 116.28, 123.675], + "fill_std": [57.375, 57.12, 58.395] + }, + { + "type": "Normalize", + "mean": [123.675, 116.28, 103.53], + "std": [58.395, 57.12, 57.375], + "to_rgb": true + }, + { + "type": "ImageToTensor", + "keys": ["img"] + }, + { + "type": "ToTensor", + "keys": ["gt_label"] + }, + { + "type": "Collect", + "keys": ["img", "gt_label"] + } + ], + + "val": [ + { + "type": "LoadImageFromFile" + }, + { + "type": "Resize", + "size": [256, -1], + "backend": "pillow", + "interpolation": "bicubic" + }, + { + 
"type": "CenterCrop", + "crop_size": 224 + }, + { + "type": "Normalize", + "mean": [123.675, 116.28, 103.53], + "std": [58.395, 57.12, 57.375], + "to_rgb": true + }, + { + "type": "ImageToTensor", + "keys": ["img"] + }, + { + "type": "Collect", + "keys": ["img"] + } + ] + }, + + "train": { + "dataloader": { + "batch_size_per_gpu": 32, + "workers_per_gpu": 4 + }, + "max_epochs": 1, + "runner": { + "type": "EpochBasedRunner", + "max_epochs": 300 + }, + "evaluation": { + "interval": 1, + "metric": "accuracy", + "save_best": "auto" + }, + "checkpoint_config": { + "interval": 1, + "max_keep_ckpts": 20, + "create_symlink": true + }, + "log_config": { + "interval": 100, + "hooks": [ + { + "type": "TextLoggerHook" + } + ] + }, + "custom_hooks": [ + { + "type": "EMAHook", + "momentum": 4e-05, + "priority": "ABOVE_NORMAL" + } + ], + "workflow": [ + ["train", 1] + ], + "work_dir": "./work_dir", + "optimizer": { + "type": "AdamW", + "lr": 0.001, + "weight_decay": 0.1, + "eps": 1e-08, + "betas": [0.9, 0.999], + "paramwise_cfg": { + "norm_decay_mult": 0.0, + "bias_decay_mult": 0.0, + "custom_keys": { + ".absolute_pos_embed": { + "decay_mult": 0.0 + }, + ".relative_position_bias_table": { + "decay_mult": 0.0 + }, + ".cls_token": { + "decay_mult": 0.0 + }, + ".pos_embed": { + "decay_mult": 0.0 + } + } + } + }, + "optimizer_config": { + "grad_clip": { + "max_norm": 5.0 + } + }, + "lr_config": { + "policy": "CosineAnnealing", + "by_epoch": false, + "min_lr_ratio": 0.01, + "warmup": "linear", + "warmup_ratio": 0.001, + "warmup_iters": 20, + "warmup_by_epoch": true + } + }, + + "evaluation": { + "dataloader": { + "batch_size_per_gpu": 32, + "workers_per_gpu": 4 + }, + "metrics": ["accuracy"], + "metric_options": { + "topk": [1, 5] + } + } +} diff --git a/pytorch_model.pt b/pytorch_model.pt new file mode 100644 index 0000000..fc9e9e0 --- /dev/null +++ b/pytorch_model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1732b9c9c2daad2b55b013d8a4e75a5af9b0bffe1c0843561849194f50a2bf7f +size 347270855 diff --git a/resources/.gitkeep b/resources/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/resources/overview.jpg b/resources/overview.jpg new file mode 100644 index 0000000..9ce6f2a Binary files /dev/null and b/resources/overview.jpg differ diff --git a/resources/test.jpg b/resources/test.jpg new file mode 100644 index 0000000..84e8f7b Binary files /dev/null and b/resources/test.jpg differ