From 589e866dc64b9cf389c04053521dd78a92eac8a8 Mon Sep 17 00:00:00 2001
From: ailab <ailab@leinao.ai>
Date: Sat, 8 Jun 2024 02:48:54 +0800
Subject: [PATCH] first commit

---
 .gitattributes                           |  41 +++++++
 README.md                                | 130 +++++++++++++++++++++++
 config.json                              |  57 ++++++++++
 diffusion_pytorch_model.bin              |   3 +
 diffusion_pytorch_model.fp16.bin         |   3 +
 diffusion_pytorch_model.fp16.safetensors |   3 +
 diffusion_pytorch_model.safetensors      |   3 +
 hf_logo_small.png                        |   3 +
 megatron_small.png                       |   3 +
 oppenheimer_small.png                    |   3 +
 spiderman-small.png                      |   3 +
 stormtrooper_grid.png                    |   3 +
 12 files changed, 255 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 README.md
 create mode 100644 config.json
 create mode 100644 diffusion_pytorch_model.bin
 create mode 100644 diffusion_pytorch_model.fp16.bin
 create mode 100644 diffusion_pytorch_model.fp16.safetensors
 create mode 100644 diffusion_pytorch_model.safetensors
 create mode 100644 hf_logo_small.png
 create mode 100644 megatron_small.png
 create mode 100644 oppenheimer_small.png
 create mode 100644 spiderman-small.png
 create mode 100644 stormtrooper_grid.png

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..1429c8a
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,41 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+hf_logo_small.png filter=lfs diff=lfs merge=lfs -text
+megatron_small.png filter=lfs diff=lfs merge=lfs -text
+spiderman_small.png filter=lfs diff=lfs merge=lfs -text
+stormtrooper_grid.png filter=lfs diff=lfs merge=lfs -text
+oppenheimer_small.png filter=lfs diff=lfs merge=lfs -text
+spiderman-small.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c84ba8c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,130 @@
+
+---
+license: openrail++
+base_model: stabilityai/stable-diffusion-xl-base-1.0
+tags:
+- stable-diffusion-xl
+- stable-diffusion-xl-diffusers
+- text-to-image
+- diffusers
+- controlnet
+inference: false
+---
+    
+# SDXL-controlnet: Depth
+
+These are ControlNet weights trained on stabilityai/stable-diffusion-xl-base-1.0 with depth conditioning. This checkpoint is 7x smaller than the original XL ControlNet checkpoint. Some example images are shown below.
+
+prompt: donald trump, serious look, cigar in the mouth, 70mm, film still, head shot
+![open](oppenheimer_small.png)
+
+prompt: spiderman lecture, photorealistic
+![images_0](./spiderman-small.png)
+
+prompt: aerial view, a futuristic research complex in a bright foggy jungle, hard lighting
+![images_1](./hf_logo_small.png)
+
+prompt: megatron in an apocalyptic world ground, runied city in the background, photorealistic
+![images_2](./megatron_small.png)
+
+## Usage
+
+Make sure to first install the libraries:
+
+```bash
+pip install accelerate transformers safetensors diffusers
+```
+
+And then we're ready to go:
+
+```python
+import torch
+import numpy as np
+from PIL import Image
+
+from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
+from diffusers.utils import load_image
+
+
+depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-depth-sdxl-1.0-small",
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+).to("cuda")
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet,
+    vae=vae,
+    variant="fp16",
+    use_safetensors=True,
+    torch_dtype=torch.float16,
+).to("cuda")
+pipe.enable_model_cpu_offload()
+
+def get_depth_map(image):
+    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+    with torch.no_grad(), torch.autocast("cuda"):
+        depth_map = depth_estimator(image).predicted_depth
+
+    depth_map = torch.nn.functional.interpolate(
+        depth_map.unsqueeze(1),
+        size=(1024, 1024),
+        mode="bicubic",
+        align_corners=False,
+    )
+    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+    image = torch.cat([depth_map] * 3, dim=1)
+
+    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+    return image
+
+
+prompt = "stormtrooper lecture, photorealistic"
+image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
+controlnet_conditioning_scale = 0.5  # recommended for good generalization
+
+depth_image = get_depth_map(image)
+
+images = pipe(
+    prompt, image=depth_image, num_inference_steps=30, controlnet_conditioning_scale=controlnet_conditioning_scale,
+).images
+images[0]
+
+images[0].save(f"stormtrooper_grid.png")
+```
+
+![](./stormtrooper_grid.png)
+
+For more details, check out the official documentation of [`StableDiffusionXLControlNetPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_sdxl).
+
+🚨 Please note that this checkpoint is experimental and there's a lot of room for improvement. We encourage the community to build on top of it, improve it, and provide us with feedback. 🚨
+
+### Training
+
+Our training script was built on top of the official training script that we provide [here](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sdxl.md). 
+You can refer to [this script](https://github.com/huggingface/diffusers/blob/7b93c2a882d8e12209fbaeffa51ee2b599ab5349/examples/research_projects/controlnet/train_controlnet_webdataset.py) for full disclosure.
+
+* This checkpoint does not perform distillation. We just use a smaller ControlNet initialized from the SDXL UNet. We
+encourage the community to try and conduct distillation too. This resource might be of help in [this regard](https://huggingface.co/blog/sd_distillation). 
+* To learn more about how the ControlNet was initialized, refer to [this code block](https://github.com/huggingface/diffusers/blob/7b93c2a882d8e12209fbaeffa51ee2b599ab5349/examples/research_projects/controlnet/train_controlnet_webdataset.py#L981C1-L999C36). 
+* It does not have any attention blocks.
+* The model works pretty well on most conditioning images, but for more complex conditionings, the bigger checkpoints might be better. We are still working on improving the quality of this checkpoint and looking for feedback from the community.
+* We recommend playing around with the `controlnet_conditioning_scale` and `guidance_scale` arguments for potentially better
+image generation quality.
+
+#### Training data
+The model was trained on 3M images from the LAION aesthetic 6 plus subset, with a batch size of 256 for 50k steps and a constant learning rate of 3e-5.
+
+#### Compute
+One 8xA100 machine
+
+#### Mixed precision
+FP16
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..f27d9b7
--- /dev/null
+++ b/config.json
@@ -0,0 +1,57 @@
+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.20.0.dev0",
+  "_name_or_path": "valhalla/d-n-a-fixed",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 2048,
+  "down_block_types": [
+    "DownBlock2D",
+    "DownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "global_pool_conditions": false,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": 2816,
+  "resnet_time_scale_shift": "default",
+  "transformer_layers_per_block": [
+    0,
+    0,
+    0
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}
diff --git a/diffusion_pytorch_model.bin b/diffusion_pytorch_model.bin
new file mode 100644
index 0000000..aaaa897
--- /dev/null
+++ b/diffusion_pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad581d22d1a814f9e3f6888f44fb79634c651a2b12dc4cbc90dd086072fd6813
+size 640496521
diff --git a/diffusion_pytorch_model.fp16.bin b/diffusion_pytorch_model.fp16.bin
new file mode 100644
index 0000000..2681b44
--- /dev/null
+++ b/diffusion_pytorch_model.fp16.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b83f650db40e500b33bb5a429c2f17655892b0ec35b548440c4c21f9ccd41ebf
+size 320274895
diff --git a/diffusion_pytorch_model.fp16.safetensors b/diffusion_pytorch_model.fp16.safetensors
new file mode 100644
index 0000000..8621858
--- /dev/null
+++ b/diffusion_pytorch_model.fp16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f23c58fd632f52238a7b35ebdc02f9f596fd13dbaa121f9b37b9f4689c2b1e9
+size 320237179
diff --git a/diffusion_pytorch_model.safetensors b/diffusion_pytorch_model.safetensors
new file mode 100644
index 0000000..8775e87
--- /dev/null
+++ b/diffusion_pytorch_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e62b40b28f8a444a778c9d4afd11183778cec03916afe07a959d0601053b85
+size 640459467
diff --git a/hf_logo_small.png b/hf_logo_small.png
new file mode 100644
index 0000000..23f065a
--- /dev/null
+++ b/hf_logo_small.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acf92b8794c8db44d05b2b813ba6b8f45f2d4e4a9a7597390ea2da3188dbf82c
+size 7210516
diff --git a/megatron_small.png b/megatron_small.png
new file mode 100644
index 0000000..916609f
--- /dev/null
+++ b/megatron_small.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc188d195d71a6adee06d5746452dff47f937af6ddd155c31ed577331523ac40
+size 4641719
diff --git a/oppenheimer_small.png b/oppenheimer_small.png
new file mode 100644
index 0000000..c322c6f
--- /dev/null
+++ b/oppenheimer_small.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a33ffa9201288b5158b6b04554934037645a7e08fa83eaf4b81ba80ee24eee2
+size 8765050
diff --git a/spiderman-small.png b/spiderman-small.png
new file mode 100644
index 0000000..506c9bd
--- /dev/null
+++ b/spiderman-small.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:319d068ffd117f56158e8f0cad196d3cf48910370bc2912fd09f71c42f8d13f5
+size 6938098
diff --git a/stormtrooper_grid.png b/stormtrooper_grid.png
new file mode 100644
index 0000000..8dd9a27
--- /dev/null
+++ b/stormtrooper_grid.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8cb05e203f169eb17d5337c2e084bd6432fdf401fa7b9eae98ca2948c09a604
+size 1617343