From a67da1f72c3c993deb7b51e4cab21417dc51bf5a Mon Sep 17 00:00:00 2001
From: ailab
Date: Sat, 8 Jun 2024 01:37:19 +0800
Subject: [PATCH] first commit

---
 .gitattributes                           |  36 ++++++++
 README.md                                | 111 +++++++++++++++++++++++
 config.json                              |  57 ++++++++++++
 diffusion_pytorch_model.bin              |   3 +
 diffusion_pytorch_model.fp16.bin         |   3 +
 diffusion_pytorch_model.fp16.safetensors |   3 +
 diffusion_pytorch_model.safetensors      |   3 +
 spiderman.png                            |   3 +
 8 files changed, 219 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 config.json
 create mode 100644 diffusion_pytorch_model.bin
 create mode 100644 diffusion_pytorch_model.fp16.bin
 create mode 100644 diffusion_pytorch_model.fp16.safetensors
 create mode 100644 diffusion_pytorch_model.safetensors
 create mode 100644 spiderman.png

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..2886043
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+spiderman.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..58c18e2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+
+---
+license: openrail++
+base_model: stabilityai/stable-diffusion-xl-base-1.0
+tags:
+- stable-diffusion-xl
+- stable-diffusion-xl-diffusers
+- text-to-image
+- diffusers
+- controlnet
+inference: false
+---
+
+# SDXL-controlnet: Depth
+
+These are ControlNet weights trained on stabilityai/stable-diffusion-xl-base-1.0 with depth conditioning. You can find some example images below.
+
+prompt: spiderman lecture, photorealistic
+![images_0](./spiderman.png)
+
+## Usage
+
+Make sure to install the required libraries first:
+
+```bash
+pip install accelerate transformers safetensors diffusers
+```
+
+And then we're ready to go:
+
+```python
+import torch
+import numpy as np
+from PIL import Image
+
+from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
+from diffusers.utils import load_image
+
+
+# Depth estimator that produces the conditioning image.
+depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-depth-sdxl-1.0",
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+)
+# fp16-friendly VAE; the default SDXL VAE can produce artifacts in half precision.
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet,
+    vae=vae,
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+)
+# Offload submodules to CPU when idle to reduce peak VRAM usage.
+pipe.enable_model_cpu_offload()
+
+def get_depth_map(image):
+    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+    with torch.no_grad(), torch.autocast("cuda"):
+        depth_map = depth_estimator(image).predicted_depth
+
+    # Resize to the SDXL resolution, normalize to [0, 1], and replicate to 3 channels.
+    depth_map = torch.nn.functional.interpolate(
+        depth_map.unsqueeze(1),
+        size=(1024, 1024),
+        mode="bicubic",
+        align_corners=False,
+    )
+    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+    image = torch.cat([depth_map] * 3, dim=1)
+
+    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+    return image
+
+
+prompt = "stormtrooper lecture, photorealistic"
+image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+depth_image = get_depth_map(image)
+
+images = pipe(
+    prompt,
+    image=depth_image,
+    num_inference_steps=30,
+    controlnet_conditioning_scale=controlnet_conditioning_scale,
+).images
+
+images[0].save("stormtrooper.png")
+```
+
+For more details, check out the official documentation of [`StableDiffusionXLControlNetPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_sdxl).
+
+### Training
+
+Our training script was built on top of the official training script, which we provide [here](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sdxl.md).
+
+#### Training data and compute
+The model was trained on 3M image-text pairs from LAION-Aesthetics V2, for 700 GPU hours on 80GB A100 GPUs.
+
+#### Batch size
+Data-parallel training with a per-GPU batch size of 8, for a total batch size of 256.
+
+#### Hyperparameters
+A constant learning rate of 1e-5.
+
+#### Mixed precision
+fp16
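+
+Putting these settings together, a launch command along the lines of the sketch below would reproduce the setup described above. This is a hypothetical invocation, not the exact command used for this model: the flag names follow the official `train_controlnet_sdxl.py` example linked above, and the dataset name is a placeholder.
+
+```bash
+# Hypothetical launch command; the flags mirror the batch size, learning rate,
+# and mixed-precision settings stated above. --dataset_name is a placeholder.
+accelerate launch train_controlnet_sdxl.py \
+  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+  --dataset_name="<your-depth-conditioning-dataset>" \
+  --resolution=1024 \
+  --train_batch_size=8 \
+  --learning_rate=1e-5 \
+  --mixed_precision="fp16" \
+  --output_dir="controlnet-depth-sdxl-1.0"
+```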
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..9e36326
--- /dev/null
+++ b/config.json
@@ -0,0 +1,57 @@
+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.20.0.dev0",
+  "_name_or_path": "valhalla/depth-2",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 2048,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "global_pool_conditions": false,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": 2816,
+  "resnet_time_scale_shift": "default",
+  "transformer_layers_per_block": [
+    1,
+    2,
+    10
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}
diff --git a/diffusion_pytorch_model.bin b/diffusion_pytorch_model.bin
new file mode 100644
index 0000000..e977929
--- /dev/null
+++ b/diffusion_pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edf33a2662e4f270d4eeeff3a56292bed7a050eba13841d07bba8a213f08a090
+size 5004438321
diff --git a/diffusion_pytorch_model.fp16.bin b/diffusion_pytorch_model.fp16.bin
new file mode 100644
index 0000000..629c2a9
--- /dev/null
+++ b/diffusion_pytorch_model.fp16.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b139b4fb4b2819868616b27e3dc2935c8b4b55ec57b115c031a2d2807a61e53
+size 2502412879
diff --git a/diffusion_pytorch_model.fp16.safetensors b/diffusion_pytorch_model.fp16.safetensors
new file mode 100644
index 0000000..5cc12d9
--- /dev/null
+++ b/diffusion_pytorch_model.fp16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66a6813e6bd7270ecfe68206a59ddd605a011ae85321188376605c66e0a4f303
+size 2502139134
diff --git a/diffusion_pytorch_model.safetensors b/diffusion_pytorch_model.safetensors
new file mode 100644
index 0000000..4829030
--- /dev/null
+++ b/diffusion_pytorch_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b760b1ed26970c7c60abbcef08e7238bb9428bee510a405aff239e06a1471945
+size 5004167860
diff --git a/spiderman.png b/spiderman.png
new file mode 100644
index 0000000..be56ee8
--- /dev/null
+++ b/spiderman.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2229cade24c6efc97310c9a67115eea65a36784d25ef9f39749db37d2e884406
+size 6356349
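
As a quick sanity check after downloading, the ControlNet can be loaded on its own and a few of the architecture fields from `config.json` above read back. This is a minimal sketch: the repo id is taken from the usage example, and the expected values are the ones shipped in the config in this commit.

```python
# Minimal sketch: load only the ControlNet (variant="fp16" picks up
# diffusion_pytorch_model.fp16.safetensors from this commit) and inspect
# a few architecture fields listed in config.json above.
import torch
from diffusers import ControlNetModel

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)
print(controlnet.config.block_out_channels)     # [320, 640, 1280]
print(controlnet.config.cross_attention_dim)    # 2048
print(controlnet.config.conditioning_channels)  # 3 ("rgb" channel order)
```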