first commit

This commit is contained in:
xxl 2025-01-22 17:19:32 +08:00
parent 2ad8529465
commit 4e85e1bf2e
9 changed files with 978 additions and 2 deletions

View File

@ -1,3 +1,73 @@
# docling-models
---
license: cdla-permissive-2.0
---
docling-models
# Docling Models
This page contains models that power the PDF document conversion package [docling](https://github.com/DS4SD/docling).
## Layout Model
The layout model takes an image of a page and applies an RT-DETR model to find the different layout components. It currently detects the labels: Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title. As a reference (from the DocLayNet paper), this is the performance of standard object detection methods on the DocLayNet dataset compared to human evaluation:
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|----------------|---------|---------|---------|---------|--------|
| | human | R50 | R101 | R101 | v5x6 |
| Caption | 84-89 | 68.4 | 71.5 | 70.1 | 77.7 |
| Footnote | 83-91 | 70.9 | 71.8 | 73.7 | 77.2 |
| Formula | 83-85 | 60.1 | 63.4 | 63.5 | 66.2 |
| List-item | 87-88 | 81.2 | 80.8 | 81.0 | 86.2 |
| Page-footer | 93-94 | 61.6 | 59.3 | 58.9 | 61.1 |
| Page-header | 85-89 | 71.9 | 70.0 | 72.0 | 67.9 |
| Picture | 69-71 | 71.7 | 72.7 | 72.0 | 77.1 |
| Section-header | 83-84 | 67.6 | 69.3 | 68.4 | 74.6 |
| Table | 77-81 | 82.2 | 82.9 | 82.2 | 86.3 |
| Text | 84-86 | 84.6 | 85.8 | 85.4 | 88.1 |
| Title | 60-72 | 76.7 | 80.4 | 79.9 | 82.7 |
| All | 82-83 | 72.4 | 73.5 | 73.4 | 76.8 |
## TableFormer
The TableFormer model identifies the structure of a table, starting from an image of the table. It uses the table regions predicted by the layout model to locate the tables. TableFormer achieves state-of-the-art (SOTA) table structure identification:
| Model (TEDS) | Simple table | Complex table | All tables |
| ------------ | ------------ | ------------- | ---------- |
| Tabula | 78.0 | 57.8 | 67.9 |
| Traprange | 60.8 | 49.9 | 55.4 |
| Camelot | 80.0 | 66.0 | 73.0 |
| Acrobat Pro | 68.9 | 61.8 | 65.3 |
| EDD | 91.2 | 85.4 | 88.3 |
| TableFormer | 95.4 | 90.1 | 93.6 |
## References
```
@techreport{Docling,
author = {Deep Search Team},
month = {8},
title = {{Docling Technical Report}},
url={https://arxiv.org/abs/2408.09869},
eprint={2408.09869},
doi = "10.48550/arXiv.2408.09869",
version = {1.0.0},
year = {2024}
}
@article{doclaynet2022,
title = {DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis},
doi = {10.1145/3534678.3539043},
url = {https://arxiv.org/abs/2206.01062},
author = {Pfitzmann, Birgit and Auer, Christoph and Dolfi, Michele and Nassar, Ahmed S and Staar, Peter W J},
year = {2022}
}
@InProceedings{TableFormer2022,
author = {Nassar, Ahmed and Livathinos, Nikolaos and Lysak, Maksym and Staar, Peter},
title = {TableFormer: Table Structure Understanding With Transformers},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2022},
pages = {4614-4623},
doi = {10.1109/CVPR52688.2022.00457}
}
```

3
config.json Normal file
View File

@ -0,0 +1,3 @@
{
"_name_or_path": "docling-models"
}

View File

@ -0,0 +1,130 @@
{
"activation_dropout": 0.0,
"activation_function": "silu",
"anchor_image_size": null,
"architectures": [
"RTDetrForObjectDetection"
],
"attention_dropout": 0.0,
"auxiliary_loss": true,
"backbone": null,
"backbone_config": {
"model_type": "rt_detr_resnet",
"out_features": [
"stage2",
"stage3",
"stage4"
],
"out_indices": [
2,
3,
4
]
},
"backbone_kwargs": null,
"batch_norm_eps": 1e-05,
"box_noise_scale": 1.0,
"d_model": 256,
"decoder_activation_function": "relu",
"decoder_attention_heads": 8,
"decoder_ffn_dim": 1024,
"decoder_in_channels": [
256,
256,
256
],
"decoder_layers": 6,
"decoder_n_points": 4,
"disable_custom_kernels": true,
"dropout": 0.0,
"encode_proj_layers": [
2
],
"encoder_activation_function": "gelu",
"encoder_attention_heads": 8,
"encoder_ffn_dim": 1024,
"encoder_hidden_dim": 256,
"encoder_in_channels": [
512,
1024,
2048
],
"encoder_layers": 1,
"eos_coefficient": 0.0001,
"eval_size": null,
"feat_strides": [
8,
16,
32
],
"focal_loss_alpha": 0.75,
"focal_loss_gamma": 2.0,
"freeze_backbone_batch_norms": true,
"hidden_expansion": 1.0,
"id2label": {
"0": "background",
"1": "Caption",
"10": "Text",
"11": "Title",
"12": "Document Index",
"13": "Code",
"14": "Checkbox-Selected",
"15": "Checkbox-Unselected",
"16": "Form",
"17": "Key-Value Region",
"2": "Footnote",
"3": "Formula",
"4": "List-item",
"5": "Page-footer",
"6": "Page-header",
"7": "Picture",
"8": "Section-header",
"9": "Table"
},
"initializer_bias_prior_prob": null,
"initializer_range": 0.01,
"is_encoder_decoder": true,
"label2id": {
"Caption": "1",
"Checkbox-Selected": "14",
"Checkbox-Unselected": "15",
"Code": "13",
"Document Index": "12",
"Footnote": "2",
"Form": "16",
"Formula": "3",
"Key-Value Region": "17",
"List-item": "4",
"Page-footer": "5",
"Page-header": "6",
"Picture": "7",
"Section-header": "8",
"Table": "9",
"Text": "10",
"Title": "11",
"background": "0"
},
"label_noise_ratio": 0.5,
"layer_norm_eps": 1e-05,
"learn_initial_query": false,
"matcher_alpha": 0.25,
"matcher_bbox_cost": 5.0,
"matcher_class_cost": 2.0,
"matcher_gamma": 2.0,
"matcher_giou_cost": 2.0,
"model_type": "rt_detr",
"normalize_before": false,
"num_denoising": 100,
"num_feature_levels": 3,
"num_queries": 300,
"positional_encoding_temperature": 10000,
"torch_dtype": "float32",
"transformers_version": "4.46.2",
"use_focal_loss": true,
"use_pretrained_backbone": false,
"use_timm_backbone": false,
"weight_loss_bbox": 5.0,
"weight_loss_giou": 2.0,
"weight_loss_vfl": 1.0,
"with_box_refine": true
}

BIN
model_artifacts/layout/model.safetensors (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,26 @@
{
"do_convert_annotations": true,
"do_normalize": false,
"do_pad": false,
"do_rescale": true,
"do_resize": true,
"format": "coco_detection",
"image_mean": [
0.485,
0.456,
0.406
],
"image_processor_type": "RTDetrImageProcessor",
"image_std": [
0.229,
0.224,
0.225
],
"pad_size": null,
"resample": 2,
"rescale_factor": 0.00392156862745098,
"size": {
"height": 640,
"width": 640
}
}

Binary file not shown.

View File

@ -0,0 +1,369 @@
{
"dataset": {
"type": "PTN_prepared",
"name": "PubTabNet_300_100_512",
"raw_data_dir": "./tests/test_data/ccs_api/model/",
"load_cells": true,
"bbox_format": "5plet",
"resized_image": 448,
"keep_AR": false,
"up_scaling_enabled": true,
"down_scaling_enabled": true,
"padding_mode": "null",
"padding_color": [
0,
0,
0
],
"image_normalization": {
"state": true,
"mean": [
0.94247851,
0.94254675,
0.94292611
],
"std": [
0.17910956,
0.17940403,
0.17931663
]
},
"color_jitter": true,
"rand_crop": true,
"rand_pad": true,
"image_grayscale": false
},
"model": {
"type": "TableModel04_rs",
"name": "14_128_256_4_true",
"backbone": "resnet18",
"enc_image_size": 28,
"tag_embed_dim": 16,
"hidden_dim": 512,
"tag_decoder_dim": 512,
"bbox_embed_dim": 256,
"tag_attention_dim": 256,
"bbox_attention_dim": 512,
"enc_layers": 6,
"dec_layers": 6,
"nheads": 8,
"dropout": 0.1,
"bbox_classes": 2
},
"train": {
"bbox": true
},
"predict": {
"max_steps": 1024,
"beam_size": 5,
"bbox": true,
"pdf_cell_iou_thres": 0.05,
"padding": false,
"padding_size": 50,
"disable_post_process": false,
"profiling": false
},
"debug": {
"save_debug_images": false
},
"dataset_wordmap": {
"word_map_tag": {
"<pad>": 0,
"<unk>": 1,
"<start>": 2,
"<end>": 3,
"ecel": 4,
"fcel": 5,
"lcel": 6,
"ucel": 7,
"xcel": 8,
"nl": 9,
"ched": 10,
"rhed": 11,
"srow": 12
},
"word_map_cell": {
" ": 13,
"!": 179,
"\"": 126,
"#": 101,
"$": 119,
"%": 18,
"&": 114,
"'": 108,
"(": 29,
")": 32,
"*": 26,
"+": 97,
",": 71,
"-": 63,
".": 34,
"/": 66,
"0": 33,
"1": 36,
"2": 43,
"3": 41,
"4": 45,
"5": 17,
"6": 37,
"7": 35,
"8": 40,
"9": 16,
":": 88,
";": 92,
"<": 73,
"</b>": 9,
"</i>": 23,
"</overline>": 219,
"</strike>": 233,
"</sub>": 94,
"</sup>": 77,
"</underline>": 151,
"<b>": 1,
"<end>": 280,
"<i>": 21,
"<overline>": 218,
"<pad>": 0,
"<start>": 279,
"<strike>": 232,
"<sub>": 93,
"<sup>": 75,
"<underline>": 150,
"<unk>": 278,
"=": 99,
">": 39,
"?": 96,
"@": 125,
"A": 27,
"B": 86,
"C": 19,
"D": 57,
"E": 64,
"F": 47,
"G": 44,
"H": 10,
"I": 20,
"J": 80,
"K": 81,
"L": 52,
"M": 46,
"N": 69,
"O": 65,
"P": 62,
"Q": 59,
"R": 60,
"S": 58,
"T": 48,
"U": 55,
"V": 2,
"W": 83,
"X": 104,
"Y": 89,
"Z": 113,
"[": 70,
"\\": 165,
"]": 72,
"^": 132,
"_": 84,
"`": 196,
"a": 3,
"b": 6,
"c": 54,
"d": 12,
"e": 8,
"f": 50,
"g": 28,
"h": 56,
"i": 5,
"j": 82,
"k": 95,
"l": 7,
"m": 30,
"n": 31,
"o": 15,
"p": 22,
"q": 67,
"r": 4,
"s": 51,
"t": 14,
"u": 25,
"v": 24,
"w": 53,
"x": 61,
"y": 49,
"z": 11,
"{": 158,
"|": 139,
"}": 159,
"~": 147,
"\u00a2": 203,
"\u00a3": 162,
"\u00a4": 220,
"\u00a5": 176,
"\u00a7": 142,
"\u00a9": 268,
"\u00ab": 239,
"\u00ad": 275,
"\u00ae": 130,
"\u00b0": 100,
"\u00b1": 79,
"\u00b6": 171,
"\u00b7": 137,
"\u00bb": 240,
"\u00d7": 118,
"\u00d8": 192,
"\u00df": 197,
"\u00e6": 261,
"\u00f7": 225,
"\u00f8": 163,
"\u0131": 242,
"\u0142": 267,
"\u01c2": 211,
"\u025b": 223,
"\u02b9": 248,
"\u02c2": 195,
"\u02c3": 208,
"\u02c6": 253,
"\u0300": 209,
"\u0301": 131,
"\u0302": 138,
"\u0303": 156,
"\u0304": 152,
"\u0306": 222,
"\u0307": 247,
"\u0308": 103,
"\u030a": 102,
"\u030c": 254,
"\u0327": 155,
"\u0328": 269,
"\u0338": 170,
"\u0391": 173,
"\u0392": 169,
"\u0393": 180,
"\u0394": 85,
"\u0398": 243,
"\u0399": 271,
"\u039b": 272,
"\u03a0": 213,
"\u03a3": 185,
"\u03a6": 148,
"\u03a7": 212,
"\u03a8": 141,
"\u03a9": 161,
"\u03b1": 90,
"\u03b2": 107,
"\u03b3": 110,
"\u03b4": 153,
"\u03b5": 166,
"\u03b6": 178,
"\u03b7": 146,
"\u03b8": 186,
"\u03b9": 229,
"\u03ba": 164,
"\u03bb": 91,
"\u03bc": 78,
"\u03bd": 230,
"\u03be": 244,
"\u03c0": 127,
"\u03c1": 149,
"\u03c3": 116,
"\u03c4": 198,
"\u03c5": 189,
"\u03c6": 140,
"\u03c7": 124,
"\u03c8": 216,
"\u03c9": 167,
"\u0410": 273,
"\u0421": 194,
"\u115f": 217,
"\u200b": 265,
"\u2010": 117,
"\u2012": 135,
"\u2013": 42,
"\u2014": 106,
"\u2015": 228,
"\u2016": 259,
"\u2018": 123,
"\u2019": 121,
"\u201c": 87,
"\u201d": 115,
"\u201e": 245,
"\u2020": 109,
"\u2021": 129,
"\u2022": 128,
"\u2028": 190,
"\u2030": 154,
"\u2032": 68,
"\u203b": 224,
"\u2044": 188,
"\u204e": 199,
"\u2061": 200,
"\u20ac": 184,
"\u2190": 202,
"\u2191": 112,
"\u2192": 120,
"\u2193": 111,
"\u2194": 183,
"\u21d1": 266,
"\u21d2": 264,
"\u21d3": 255,
"\u2205": 215,
"\u2206": 175,
"\u2208": 262,
"\u2211": 160,
"\u2212": 76,
"\u2216": 206,
"\u2217": 105,
"\u2218": 246,
"\u2219": 236,
"\u221a": 187,
"\u221e": 207,
"\u2223": 260,
"\u2225": 193,
"\u2227": 182,
"\u2229": 256,
"\u222b": 258,
"\u223c": 98,
"\u2248": 210,
"\u2264": 38,
"\u2265": 74,
"\u2266": 214,
"\u2267": 181,
"\u2295": 263,
"\u22c5": 174,
"\u22c6": 191,
"\u22ee": 277,
"\u22ef": 270,
"\u2500": 205,
"\u2551": 231,
"\u25a0": 250,
"\u25a1": 177,
"\u25aa": 145,
"\u25b2": 136,
"\u25b3": 143,
"\u25bc": 251,
"\u25c6": 226,
"\u25ca": 235,
"\u25cb": 227,
"\u25cf": 172,
"\u25e6": 274,
"\u2605": 204,
"\u2606": 144,
"\u2640": 133,
"\u2642": 134,
"\u2663": 252,
"\u2666": 157,
"\u266f": 221,
"\u2713": 122,
"\u2714": 249,
"\u2717": 201,
"\u2794": 168,
"\u27a2": 276,
"\u2a7d": 234,
"\u2a7e": 241,
"\u3008": 237,
"\u3009": 238,
"\ufeff": 257
}
}
}

Binary file not shown.

View File

@ -0,0 +1,369 @@
{
"dataset": {
"type": "PTN_prepared",
"name": "PubTabNet_300_100_512",
"raw_data_dir": "./tests/test_data/ccs_api/model/",
"load_cells": true,
"bbox_format": "5plet",
"resized_image": 448,
"keep_AR": false,
"up_scaling_enabled": true,
"down_scaling_enabled": true,
"padding_mode": "null",
"padding_color": [
0,
0,
0
],
"image_normalization": {
"state": true,
"mean": [
0.94247851,
0.94254675,
0.94292611
],
"std": [
0.17910956,
0.17940403,
0.17931663
]
},
"color_jitter": true,
"rand_crop": true,
"rand_pad": true,
"image_grayscale": false
},
"model": {
"type": "TableModel04_rs",
"name": "14_128_256_4_true",
"backbone": "resnet18",
"enc_image_size": 28,
"tag_embed_dim": 16,
"hidden_dim": 512,
"tag_decoder_dim": 512,
"bbox_embed_dim": 256,
"tag_attention_dim": 256,
"bbox_attention_dim": 512,
"enc_layers": 4,
"dec_layers": 2,
"nheads": 8,
"dropout": 0.1,
"bbox_classes": 2
},
"train": {
"bbox": true
},
"predict": {
"max_steps": 1024,
"beam_size": 5,
"bbox": true,
"pdf_cell_iou_thres": 0.05,
"padding": false,
"padding_size": 50,
"disable_post_process": false,
"profiling": false
},
"debug": {
"save_debug_images": false
},
"dataset_wordmap": {
"word_map_tag": {
"<pad>": 0,
"<unk>": 1,
"<start>": 2,
"<end>": 3,
"ecel": 4,
"fcel": 5,
"lcel": 6,
"ucel": 7,
"xcel": 8,
"nl": 9,
"ched": 10,
"rhed": 11,
"srow": 12
},
"word_map_cell": {
" ": 13,
"!": 179,
"\"": 126,
"#": 101,
"$": 119,
"%": 18,
"&": 114,
"'": 108,
"(": 29,
")": 32,
"*": 26,
"+": 97,
",": 71,
"-": 63,
".": 34,
"/": 66,
"0": 33,
"1": 36,
"2": 43,
"3": 41,
"4": 45,
"5": 17,
"6": 37,
"7": 35,
"8": 40,
"9": 16,
":": 88,
";": 92,
"<": 73,
"</b>": 9,
"</i>": 23,
"</overline>": 219,
"</strike>": 233,
"</sub>": 94,
"</sup>": 77,
"</underline>": 151,
"<b>": 1,
"<end>": 280,
"<i>": 21,
"<overline>": 218,
"<pad>": 0,
"<start>": 279,
"<strike>": 232,
"<sub>": 93,
"<sup>": 75,
"<underline>": 150,
"<unk>": 278,
"=": 99,
">": 39,
"?": 96,
"@": 125,
"A": 27,
"B": 86,
"C": 19,
"D": 57,
"E": 64,
"F": 47,
"G": 44,
"H": 10,
"I": 20,
"J": 80,
"K": 81,
"L": 52,
"M": 46,
"N": 69,
"O": 65,
"P": 62,
"Q": 59,
"R": 60,
"S": 58,
"T": 48,
"U": 55,
"V": 2,
"W": 83,
"X": 104,
"Y": 89,
"Z": 113,
"[": 70,
"\\": 165,
"]": 72,
"^": 132,
"_": 84,
"`": 196,
"a": 3,
"b": 6,
"c": 54,
"d": 12,
"e": 8,
"f": 50,
"g": 28,
"h": 56,
"i": 5,
"j": 82,
"k": 95,
"l": 7,
"m": 30,
"n": 31,
"o": 15,
"p": 22,
"q": 67,
"r": 4,
"s": 51,
"t": 14,
"u": 25,
"v": 24,
"w": 53,
"x": 61,
"y": 49,
"z": 11,
"{": 158,
"|": 139,
"}": 159,
"~": 147,
"\u00a2": 203,
"\u00a3": 162,
"\u00a4": 220,
"\u00a5": 176,
"\u00a7": 142,
"\u00a9": 268,
"\u00ab": 239,
"\u00ad": 275,
"\u00ae": 130,
"\u00b0": 100,
"\u00b1": 79,
"\u00b6": 171,
"\u00b7": 137,
"\u00bb": 240,
"\u00d7": 118,
"\u00d8": 192,
"\u00df": 197,
"\u00e6": 261,
"\u00f7": 225,
"\u00f8": 163,
"\u0131": 242,
"\u0142": 267,
"\u01c2": 211,
"\u025b": 223,
"\u02b9": 248,
"\u02c2": 195,
"\u02c3": 208,
"\u02c6": 253,
"\u0300": 209,
"\u0301": 131,
"\u0302": 138,
"\u0303": 156,
"\u0304": 152,
"\u0306": 222,
"\u0307": 247,
"\u0308": 103,
"\u030a": 102,
"\u030c": 254,
"\u0327": 155,
"\u0328": 269,
"\u0338": 170,
"\u0391": 173,
"\u0392": 169,
"\u0393": 180,
"\u0394": 85,
"\u0398": 243,
"\u0399": 271,
"\u039b": 272,
"\u03a0": 213,
"\u03a3": 185,
"\u03a6": 148,
"\u03a7": 212,
"\u03a8": 141,
"\u03a9": 161,
"\u03b1": 90,
"\u03b2": 107,
"\u03b3": 110,
"\u03b4": 153,
"\u03b5": 166,
"\u03b6": 178,
"\u03b7": 146,
"\u03b8": 186,
"\u03b9": 229,
"\u03ba": 164,
"\u03bb": 91,
"\u03bc": 78,
"\u03bd": 230,
"\u03be": 244,
"\u03c0": 127,
"\u03c1": 149,
"\u03c3": 116,
"\u03c4": 198,
"\u03c5": 189,
"\u03c6": 140,
"\u03c7": 124,
"\u03c8": 216,
"\u03c9": 167,
"\u0410": 273,
"\u0421": 194,
"\u115f": 217,
"\u200b": 265,
"\u2010": 117,
"\u2012": 135,
"\u2013": 42,
"\u2014": 106,
"\u2015": 228,
"\u2016": 259,
"\u2018": 123,
"\u2019": 121,
"\u201c": 87,
"\u201d": 115,
"\u201e": 245,
"\u2020": 109,
"\u2021": 129,
"\u2022": 128,
"\u2028": 190,
"\u2030": 154,
"\u2032": 68,
"\u203b": 224,
"\u2044": 188,
"\u204e": 199,
"\u2061": 200,
"\u20ac": 184,
"\u2190": 202,
"\u2191": 112,
"\u2192": 120,
"\u2193": 111,
"\u2194": 183,
"\u21d1": 266,
"\u21d2": 264,
"\u21d3": 255,
"\u2205": 215,
"\u2206": 175,
"\u2208": 262,
"\u2211": 160,
"\u2212": 76,
"\u2216": 206,
"\u2217": 105,
"\u2218": 246,
"\u2219": 236,
"\u221a": 187,
"\u221e": 207,
"\u2223": 260,
"\u2225": 193,
"\u2227": 182,
"\u2229": 256,
"\u222b": 258,
"\u223c": 98,
"\u2248": 210,
"\u2264": 38,
"\u2265": 74,
"\u2266": 214,
"\u2267": 181,
"\u2295": 263,
"\u22c5": 174,
"\u22c6": 191,
"\u22ee": 277,
"\u22ef": 270,
"\u2500": 205,
"\u2551": 231,
"\u25a0": 250,
"\u25a1": 177,
"\u25aa": 145,
"\u25b2": 136,
"\u25b3": 143,
"\u25bc": 251,
"\u25c6": 226,
"\u25ca": 235,
"\u25cb": 227,
"\u25cf": 172,
"\u25e6": 274,
"\u2605": 204,
"\u2606": 144,
"\u2640": 133,
"\u2642": 134,
"\u2663": 252,
"\u2666": 157,
"\u266f": 221,
"\u2713": 122,
"\u2714": 249,
"\u2717": 201,
"\u2794": 168,
"\u27a2": 276,
"\u2a7d": 234,
"\u2a7e": 241,
"\u3008": 237,
"\u3009": 238,
"\ufeff": 257
}
}
}