commit 20f29158d290ddcc1d2d30832a6b000691cee529 Author: ailab Date: Thu Jul 11 15:32:07 2024 +0800 first commit diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4a6f8b4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,28 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +model.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..e96ad2f --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +--- +license: apache-2.0 +tags: +- object-detection +- vision +datasets: +- coco +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg + example_title: Savanna +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg + example_title: Football Match +- src: 
https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg + example_title: Airport +--- + +# YOLOS (tiny-sized) model + +YOLOS model fine-tuned on COCO 2017 object detection (118k annotated images). It was introduced in the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Fang et al. and first released in [this repository](https://github.com/hustvl/YOLOS). + +Disclaimer: The team releasing YOLOS did not write a model card for this model, so this model card has been written by the Hugging Face team. + +## Model description + +YOLOS is a Vision Transformer (ViT) trained using the DETR loss. Despite its simplicity, a base-sized YOLOS model is able to achieve 42 AP on COCO validation 2017 (similar to DETR and more complex frameworks such as Faster R-CNN). + +The model is trained using a "bipartite matching loss": one compares the predicted classes + bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N (so if an image only contains 4 objects, the remaining 96 annotations will just have "no object" as class and "no bounding box" as bounding box). The Hungarian matching algorithm is used to create an optimal one-to-one mapping between each of the N queries and each of the N annotations. Next, standard cross-entropy (for the classes) and a linear combination of the L1 and generalized IoU loss (for the bounding boxes) are used to optimize the parameters of the model. + +## Intended uses & limitations + +You can use the raw model for object detection. See the [model hub](https://huggingface.co/models?search=hustvl/yolos) to look for all available YOLOS models. 
+ +### How to use + +Here is how to use this model: + +```python +from transformers import YolosImageProcessor, YolosForObjectDetection +from PIL import Image +import torch +import requests + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny') +image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny") + +inputs = image_processor(images=image, return_tensors="pt") +outputs = model(**inputs) + +# model predicts bounding boxes and corresponding COCO classes +logits = outputs.logits +bboxes = outputs.pred_boxes + + +# print results +target_sizes = torch.tensor([image.size[::-1]]) +results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0] +for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + box = [round(i, 2) for i in box.tolist()] + print( + f"Detected {model.config.id2label[label.item()]} with confidence " + f"{round(score.item(), 3)} at location {box}" + ) +``` + +Currently, both the image processor and model support PyTorch. + +## Training data + +The YOLOS model was pre-trained on [ImageNet-1k](https://huggingface.co/datasets/imagenet2012) and fine-tuned on [COCO 2017 object detection](https://cocodataset.org/#download), a dataset consisting of 118k/5k annotated images for training/validation respectively. + +### Training + +The model was pre-trained for 300 epochs on ImageNet-1k and fine-tuned for 300 epochs on COCO. + +## Evaluation results + +This model achieves an AP (average precision) of **28.7** on COCO 2017 validation. For more details regarding evaluation results, we refer to the original paper. 
+ +### BibTeX entry and citation info + +```bibtex +@article{DBLP:journals/corr/abs-2106-00666, + author = {Yuxin Fang and + Bencheng Liao and + Xinggang Wang and + Jiemin Fang and + Jiyang Qi and + Rui Wu and + Jianwei Niu and + Wenyu Liu}, + title = {You Only Look at One Sequence: Rethinking Transformer in Vision through + Object Detection}, + journal = {CoRR}, + volume = {abs/2106.00666}, + year = {2021}, + url = {https://arxiv.org/abs/2106.00666}, + eprinttype = {arXiv}, + eprint = {2106.00666}, + timestamp = {Fri, 29 Apr 2022 19:49:16 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2106-00666.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..9b1dc5f --- /dev/null +++ b/config.json @@ -0,0 +1,209 @@ +{ + "architectures": [ + "YolosForObjectDetection" + ], + "attention_probs_dropout_prob": 0.0, + "auxiliary_loss": false, + "bbox_cost": 5, + "bbox_loss_coefficient": 5, + "class_cost": 1, + "eos_coefficient": 0.1, + "giou_cost": 2, + "giou_loss_coefficient": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 192, + "id2label": { + "0": "N/A", + "1": "person", + "2": "bicycle", + "3": "car", + "4": "motorcycle", + "5": "airplane", + "6": "bus", + "7": "train", + "8": "truck", + "9": "boat", + "10": "traffic light", + "11": "fire hydrant", + "12": "N/A", + "13": "stop sign", + "14": "parking meter", + "15": "bench", + "16": "bird", + "17": "cat", + "18": "dog", + "19": "horse", + "20": "sheep", + "21": "cow", + "22": "elephant", + "23": "bear", + "24": "zebra", + "25": "giraffe", + "26": "N/A", + "27": "backpack", + "28": "umbrella", + "29": "N/A", + "30": "N/A", + "31": "handbag", + "32": "tie", + "33": "suitcase", + "34": "frisbee", + "35": "skis", + "36": "snowboard", + "37": "sports ball", + "38": "kite", + "39": "baseball bat", + "40": "baseball glove", + "41": "skateboard", + "42": 
"surfboard", + "43": "tennis racket", + "44": "bottle", + "45": "N/A", + "46": "wine glass", + "47": "cup", + "48": "fork", + "49": "knife", + "50": "spoon", + "51": "bowl", + "52": "banana", + "53": "apple", + "54": "sandwich", + "55": "orange", + "56": "broccoli", + "57": "carrot", + "58": "hot dog", + "59": "pizza", + "60": "donut", + "61": "cake", + "62": "chair", + "63": "couch", + "64": "potted plant", + "65": "bed", + "66": "N/A", + "67": "dining table", + "68": "N/A", + "69": "N/A", + "70": "toilet", + "71": "N/A", + "72": "tv", + "73": "laptop", + "74": "mouse", + "75": "remote", + "76": "keyboard", + "77": "cell phone", + "78": "microwave", + "79": "oven", + "80": "toaster", + "81": "sink", + "82": "refrigerator", + "83": "N/A", + "84": "book", + "85": "clock", + "86": "vase", + "87": "scissors", + "88": "teddy bear", + "89": "hair drier", + "90": "toothbrush" + }, + "image_size": [ + 800, + 1333 + ], + "initializer_range": 0.02, + "intermediate_size": 768, + "label2id": { + "N/A": 83, + "airplane": 5, + "apple": 53, + "backpack": 27, + "banana": 52, + "baseball bat": 39, + "baseball glove": 40, + "bear": 23, + "bed": 65, + "bench": 15, + "bicycle": 2, + "bird": 16, + "boat": 9, + "book": 84, + "bottle": 44, + "bowl": 51, + "broccoli": 56, + "bus": 6, + "cake": 61, + "car": 3, + "carrot": 57, + "cat": 17, + "cell phone": 77, + "chair": 62, + "clock": 85, + "couch": 63, + "cow": 21, + "cup": 47, + "dining table": 67, + "dog": 18, + "donut": 60, + "elephant": 22, + "fire hydrant": 11, + "fork": 48, + "frisbee": 34, + "giraffe": 25, + "hair drier": 89, + "handbag": 31, + "horse": 19, + "hot dog": 58, + "keyboard": 76, + "kite": 38, + "knife": 49, + "laptop": 73, + "microwave": 78, + "motorcycle": 4, + "mouse": 74, + "orange": 55, + "oven": 79, + "parking meter": 14, + "person": 1, + "pizza": 59, + "potted plant": 64, + "refrigerator": 82, + "remote": 75, + "sandwich": 54, + "scissors": 87, + "sheep": 20, + "sink": 81, + "skateboard": 41, + "skis": 35, + 
"snowboard": 36, + "spoon": 50, + "sports ball": 37, + "stop sign": 13, + "suitcase": 33, + "surfboard": 42, + "teddy bear": 88, + "tennis racket": 43, + "tie": 32, + "toaster": 80, + "toilet": 70, + "toothbrush": 90, + "traffic light": 10, + "train": 7, + "truck": 8, + "tv": 72, + "umbrella": 28, + "vase": 86, + "wine glass": 46, + "zebra": 24 + }, + "layer_norm_eps": 1e-12, + "model_type": "yolos", + "num_attention_heads": 3, + "num_channels": 3, + "num_detection_tokens": 100, + "num_hidden_layers": 12, + "patch_size": 16, + "qkv_bias": true, + "torch_dtype": "float32", + "transformers_version": "4.19.0.dev0", + "use_mid_position_embeddings": false +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..7c2cd33 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6a017a20cb522dd347271fa5bd670467e456176aaccd940090e50985ac6e74 +size 25978888 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000..79bb34e --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,17 @@ +{ + "do_normalize": true, + "do_resize": true, + "image_processor_type": "YolosImageProcessor", + "format": "coco_detection", + "image_mean": [ + 0.485, + 0.456, + 0.406 + ], + "image_std": [ + 0.229, + 0.224, + 0.225 + ], + "size": {"shortest_edge": 512, "longest_edge": 1333} +} \ No newline at end of file diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..2914077 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a5cc7772832fba0f8dac7b1fc3425c6d834c361d01d5aef429bd5865e2c0726 +size 26021147