elismasilva committed on
Commit 5f88efd · verified · 1 Parent(s): fa8760b

Upload 50 files

Files changed (50)
  1. .gitattributes +49 -35
  2. .gitignore +14 -0
  3. README.md +78 -0
  4. assets/DMDR.webp +3 -0
  5. assets/Z-Image-Gallery.pdf +3 -0
  6. assets/architecture.webp +3 -0
  7. assets/decoupled-dmd.webp +3 -0
  8. assets/leaderboard.png +3 -0
  9. assets/leaderboard.webp +0 -0
  10. assets/reasoning.png +3 -0
  11. assets/showcase.jpg +3 -0
  12. assets/showcase_editing.png +3 -0
  13. assets/showcase_realistic.png +3 -0
  14. assets/showcase_rendering.png +3 -0
  15. diffusers_local/__init__.py +1 -0
  16. diffusers_local/__pycache__/__init__.cpython-310.pyc +0 -0
  17. diffusers_local/__pycache__/patch.cpython-310.pyc +0 -0
  18. diffusers_local/__pycache__/pipeline_z_image_control_unified.cpython-310.pyc +0 -0
  19. diffusers_local/__pycache__/z_image_control_transformer_2d.cpython-310.pyc +0 -0
  20. diffusers_local/patch.py +246 -0
  21. diffusers_local/pipeline_z_image_control_unified.py +288 -0
  22. diffusers_local/z_image_control_transformer_2d.py +815 -0
  23. example/canny_man.png +0 -0
  24. example/depth_cat.png +3 -0
  25. example/depth_man.png +3 -0
  26. example/teed_man.png +0 -0
  27. infer_gguf.py +103 -0
  28. infer_pretrained.py +74 -0
  29. model_index.json +24 -0
  30. requirements.txt +18 -0
  31. scheduler/scheduler_config.json +7 -0
  32. text_encoder/config.json +30 -0
  33. text_encoder/generation_config.json +13 -0
  34. text_encoder/model-00001-of-00003.safetensors +3 -0
  35. text_encoder/model-00002-of-00003.safetensors +3 -0
  36. text_encoder/model-00003-of-00003.safetensors +3 -0
  37. text_encoder/model.safetensors.index.json +405 -0
  38. tokenizer/merges.txt +0 -0
  39. tokenizer/tokenizer.json +3 -0
  40. tokenizer/tokenizer_config.json +239 -0
  41. tokenizer/vocab.json +0 -0
  42. transformer/config.json +31 -0
  43. transformer/diffusion_pytorch_model-00001-of-00004.safetensors +3 -0
  44. transformer/diffusion_pytorch_model-00002-of-00004.safetensors +3 -0
  45. transformer/diffusion_pytorch_model-00003-of-00004.safetensors +3 -0
  46. transformer/diffusion_pytorch_model-00004-of-00004.safetensors +3 -0
  47. transformer/diffusion_pytorch_model.safetensors.index.json +664 -0
  48. vae/config.json +38 -0
  49. vae/diffusion_pytorch_model.safetensors +3 -0
  50. z_image_turbo_control_unified_q4_k_m.gguf +3 -0
.gitattributes CHANGED
@@ -1,35 +1,49 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/architecture.webp filter=lfs diff=lfs merge=lfs -text
+ assets/decoupled-dmd.webp filter=lfs diff=lfs merge=lfs -text
+ assets/DMDR.webp filter=lfs diff=lfs merge=lfs -text
+ assets/leaderboard.png filter=lfs diff=lfs merge=lfs -text
+ assets/reasoning.png filter=lfs diff=lfs merge=lfs -text
+ assets/showcase_editing.png filter=lfs diff=lfs merge=lfs -text
+ assets/showcase_realistic.png filter=lfs diff=lfs merge=lfs -text
+ assets/showcase_rendering.png filter=lfs diff=lfs merge=lfs -text
+ assets/showcase.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/Z-Image-Gallery.pdf filter=lfs diff=lfs merge=lfs -text
+ example/depth_cat.png filter=lfs diff=lfs merge=lfs -text
+ example/depth_man.png filter=lfs diff=lfs merge=lfs -text
+ tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ z_image_turbo_control_unified_q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
+ __pycache__/
+ *.py[cod]
+ mod_tests/
+ /.vs
+ .vscode/
+ .idea/
+ venv/
+ .venv/
+ *.log
+ .DS_Store
+ .gradio
+ download.py
+ bk
+ outputs/
README.md CHANGED
@@ -1,3 +1,81 @@
 ---
 license: apache-2.0
+ tags:
+ - text-to-image
+ - image-to-image
+ - controlnet
+ - diffusers
+ - gguf
+ - z-image-turbo
+ pipeline_tag: text-to-image
 ---
+
+ # Z-Image Turbo Control Unified
+
+ This repository hosts the **Z-Image Turbo Control Unified** model, which unifies the powerful **Z-Image Turbo** base transformer with **ControlNet** capabilities in a single, cohesive architecture.
+
+ Unlike traditional pipelines where ControlNet is an external add-on, this model integrates the control layers directly into the transformer structure. This enables **Unified GGUF Quantization**: the entire merged architecture (Base + Control) can be quantized (e.g., Q4_K_M) and run on consumer hardware with limited VRAM.
+
+ ## 📥 Installation
+
+ To set up the environment, install the dependencies from the provided requirements file:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ *Note: This repository contains a `diffusers_local` folder with the custom pipeline code required to run this architecture.*
+
+ ## 🚀 Usage
+
+ We provide two ready-to-use inference scripts; choose the one that matches your hardware.
+
+ ### Option 1: Low VRAM (GGUF) - Recommended
+ **Script:** `infer_gguf.py`
+
+ Use this version if you have limited VRAM (e.g., 6-8 GB) or want to save memory. It loads the model from the quantized **GGUF** file (`z_image_turbo_control_unified_q4_k_m.gguf`).
+
+ To run:
+ ```bash
+ python infer_gguf.py
+ ```
+
+ **Key Features of this mode:**
+ * Loads the unified transformer from a single 4-bit quantized file.
+ * Uses `GGUFQuantizationConfig` for efficient computation.
+ * Enables aggressive group offloading to fit large models on consumer GPUs.
+
+ ### Option 2: High Precision (Diffusers/BF16)
+ **Script:** `infer_pretrained.py`
+
+ Use this version if you have ample VRAM (e.g., 24 GB+) and want to run the model in standard **BFloat16** precision without quantization.
+
+ To run:
+ ```bash
+ python infer_pretrained.py
+ ```
+
+ **Key Features of this mode:**
+ * Loads the model using the standard `from_pretrained` directory structure.
+ * Maintains full BFloat16 precision (no quantization).
+
+ ---
+
+ ## 🛠️ Model Configuration
+
+ The inference scripts are pre-configured with parameters optimized for the **Turbo** nature of this model:
+
+ * **Inference Steps:** 9 (fast generation).
+ * **Guidance Scale:** 0.0 (Turbo models do not use CFG).
+ * **Conditioning Scale:** 0.7 (recommended ControlNet strength).
+ * **Shift:** 3.0 (scheduler shift parameter).
+
+ ## 📂 Repository Structure
+
+ * `z_image_turbo_control_unified_q4_k_m.gguf`: The unified, quantized model weights.
+ * `infer_gguf.py`: Script for running GGUF inference.
+ * `infer_pretrained.py`: Script for running standard Diffusers inference.
+ * `diffusers_local/`: Custom pipeline code (`ZImageControlUnifiedPipeline`) and transformer logic.
+ * `requirements.txt`: Python dependencies.
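
For orientation, below is a minimal sketch of the GGUF workflow the README describes. It is a sketch, not a copy of `infer_gguf.py`: the exact loading calls and the group-offloading setup may differ in the actual script, and the prompt and control image are only illustrative. The pipeline arguments follow the `ZImageControlUnifiedPipeline.__call__` signature included later in this commit.

```python
import torch
from PIL import Image
from diffusers import GGUFQuantizationConfig

import diffusers_local.patch  # noqa: F401 -- importing applies the repo's loader patches
from diffusers_local import ZImageControlTransformer2DModel
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline

# Load the unified (base + control) transformer from the single 4-bit GGUF file.
transformer = ZImageControlTransformer2DModel.from_single_file(
    "z_image_turbo_control_unified_q4_k_m.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

# Assemble the remaining components from this repository's folder layout (model_index.json).
pipe = ZImageControlUnifiedPipeline.from_pretrained(
    ".", transformer=transformer, torch_dtype=torch.bfloat16
).to("cuda")

# Settings from the "Model Configuration" section above.
result = pipe(
    prompt="a man standing in a sunlit street, photorealistic",
    image=Image.open("example/depth_man.png"),  # depth control image shipped in example/
    num_inference_steps=9,
    guidance_scale=0.0,                     # Turbo models do not use CFG
    controlnet_conditioning_scale=0.7,      # recommended ControlNet strength
)
result.images[0].save("output.png")
```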
assets/DMDR.webp ADDED

Git LFS Details

  • SHA256: 2e6f3053b98d097f2aa11d3892bd9307326db41b65336bea54dc5825a0e03077
  • Pointer size: 131 Bytes
  • Size of remote file: 173 kB
assets/Z-Image-Gallery.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f9895b3246d2547bac74bbe0be975da500eaae93f2cad4248ad3281786b1ac6
+ size 15767436
assets/architecture.webp ADDED

Git LFS Details

  • SHA256: 261af62ecc7e9749ae28e1d3a84e2f70a6c192d2017b7d8f020c7bff982ef59c
  • Pointer size: 131 Bytes
  • Size of remote file: 422 kB
assets/decoupled-dmd.webp ADDED

Git LFS Details

  • SHA256: 4568ca559b997fc38f57dc1c3f5b1da3a3c144ae12419caa855ced972bf8c7aa
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB
assets/leaderboard.png ADDED

Git LFS Details

  • SHA256: e9fd4aa185bb7bff2b5515f2001b4d80df330595e78d6a098142e5a232bb4e4e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
assets/leaderboard.webp ADDED
assets/reasoning.png ADDED

Git LFS Details

  • SHA256: 96c16b2c8d8dc67bb92ecc22d54b9955ab55136977f515bb76f4b2eb42eb3cdb
  • Pointer size: 132 Bytes
  • Size of remote file: 7.7 MB
assets/showcase.jpg ADDED

Git LFS Details

  • SHA256: f6ee74e066e00596e429f5a08140aebae1678e5935ce1e11ca6c1c6cd72432ee
  • Pointer size: 132 Bytes
  • Size of remote file: 6.43 MB
assets/showcase_editing.png ADDED

Git LFS Details

  • SHA256: 7d720c3157fd0b0c1f07ac826c6d380b4bcb1b6933c64eb11bfe804ccf7c26f4
  • Pointer size: 132 Bytes
  • Size of remote file: 4.75 MB
assets/showcase_realistic.png ADDED

Git LFS Details

  • SHA256: 697e6f6857f619314173508df72a14314cbb43e67475de7494123bb8b4f4eb2c
  • Pointer size: 132 Bytes
  • Size of remote file: 6.26 MB
assets/showcase_rendering.png ADDED

Git LFS Details

  • SHA256: 3556dd66be2200d53f957424e12ecf914ddf3eded151cde86c7353f8b231284f
  • Pointer size: 132 Bytes
  • Size of remote file: 7.6 MB
diffusers_local/__init__.py ADDED
@@ -0,0 +1 @@
+ from .z_image_control_transformer_2d import Transformer2DModelOutput, ZImageControlTransformer2DModel
diffusers_local/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (299 Bytes).
diffusers_local/__pycache__/patch.cpython-310.pyc ADDED
Binary file (6.74 kB).
diffusers_local/__pycache__/pipeline_z_image_control_unified.cpython-310.pyc ADDED
Binary file (8.77 kB).
diffusers_local/__pycache__/z_image_control_transformer_2d.cpython-310.pyc ADDED
Binary file (21.7 kB).
diffusers_local/patch.py ADDED
@@ -0,0 +1,246 @@
1
+ import importlib
2
+ import os
3
+ import diffusers.pipelines.pipeline_loading_utils as pipe_loading_utils
4
+ import diffusers.loaders.single_file_model as single_file_model
5
+ from diffusers.utils import (
6
+ _maybe_remap_transformers_class,
7
+ get_class_from_dynamic_module,
8
+ )
9
+ from diffusers.loaders.single_file_utils import (
10
+ convert_animatediff_checkpoint_to_diffusers,
11
+ convert_auraflow_transformer_checkpoint_to_diffusers,
12
+ convert_autoencoder_dc_checkpoint_to_diffusers,
13
+ convert_chroma_transformer_checkpoint_to_diffusers,
14
+ convert_controlnet_checkpoint,
15
+ convert_cosmos_transformer_checkpoint_to_diffusers,
16
+ convert_flux2_transformer_checkpoint_to_diffusers,
17
+ convert_flux_transformer_checkpoint_to_diffusers,
18
+ convert_hidream_transformer_to_diffusers,
19
+ convert_hunyuan_video_transformer_to_diffusers,
20
+ convert_ldm_unet_checkpoint,
21
+ convert_ldm_vae_checkpoint,
22
+ convert_ltx_transformer_checkpoint_to_diffusers,
23
+ convert_ltx_vae_checkpoint_to_diffusers,
24
+ convert_lumina2_to_diffusers,
25
+ convert_mochi_transformer_checkpoint_to_diffusers,
26
+ convert_sana_transformer_to_diffusers,
27
+ convert_sd3_transformer_checkpoint_to_diffusers,
28
+ convert_stable_cascade_unet_single_file_to_diffusers,
29
+ convert_wan_transformer_to_diffusers,
30
+ convert_wan_vae_to_diffusers,
31
+ convert_z_image_transformer_checkpoint_to_diffusers,
32
+ create_controlnet_diffusers_config_from_ldm,
33
+ create_unet_diffusers_config_from_ldm,
34
+ create_vae_diffusers_config_from_ldm,
35
+ )
36
+ import torch
37
+ def convert_z_image_control_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
38
+ Z_IMAGE_KEYS_RENAME_DICT = {
39
+ "final_layer.": "all_final_layer.2-1.",
40
+ "x_embedder.": "all_x_embedder.2-1.",
41
+ ".attention.out.bias": ".attention.to_out.0.bias",
42
+ ".attention.k_norm.weight": ".attention.norm_k.weight",
43
+ ".attention.q_norm.weight": ".attention.norm_q.weight",
44
+ ".attention.out.weight": ".attention.to_out.0.weight",
45
+ "control_x_embedder.": "control_all_x_embedder.2-1.",
46
+ }
47
+
48
+ def convert_z_image_fused_attention(key: str, state_dict: dict[str, object]) -> None:
49
+ if ".attention.qkv.weight" not in key:
50
+ return
51
+
52
+ fused_qkv_weight = state_dict.pop(key)
53
+ to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
54
+ new_q_name = key.replace(".attention.qkv.weight", ".attention.to_q.weight")
55
+ new_k_name = key.replace(".attention.qkv.weight", ".attention.to_k.weight")
56
+ new_v_name = key.replace(".attention.qkv.weight", ".attention.to_v.weight")
57
+
58
+ state_dict[new_q_name] = to_q_weight
59
+ state_dict[new_k_name] = to_k_weight
60
+ state_dict[new_v_name] = to_v_weight
61
+ return
62
+
63
+ TRANSFORMER_SPECIAL_KEYS_REMAP = {
64
+ ".attention.qkv.weight": convert_z_image_fused_attention,
65
+ }
66
+
67
+ def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str) -> None:
68
+ state_dict[new_key] = state_dict.pop(old_key)
69
+
70
+ converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}
71
+
72
+ # Handle single file --> diffusers key remapping via the remap dict
73
+ for key in list(converted_state_dict.keys()):
74
+ new_key = key[:]
75
+ for replace_key, rename_key in Z_IMAGE_KEYS_RENAME_DICT.items():
76
+ new_key = new_key.replace(replace_key, rename_key)
77
+
78
+ update_state_dict(converted_state_dict, key, new_key)
79
+
80
+ # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
81
+ # special_keys_remap
82
+ for key in list(converted_state_dict.keys()):
83
+ for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
84
+ if special_key not in key:
85
+ continue
86
+ handler_fn_inplace(key, converted_state_dict)
87
+
88
+ return converted_state_dict
89
+
90
+ SINGLE_FILE_LOADABLE_CLASSES = {
91
+ "StableCascadeUNet": {
92
+ "checkpoint_mapping_fn": convert_stable_cascade_unet_single_file_to_diffusers,
93
+ },
94
+ "UNet2DConditionModel": {
95
+ "checkpoint_mapping_fn": convert_ldm_unet_checkpoint,
96
+ "config_mapping_fn": create_unet_diffusers_config_from_ldm,
97
+ "default_subfolder": "unet",
98
+ "legacy_kwargs": {
99
+ "num_in_channels": "in_channels", # Legacy kwargs supported by `from_single_file` mapped to new args
100
+ },
101
+ },
102
+ "AutoencoderKL": {
103
+ "checkpoint_mapping_fn": convert_ldm_vae_checkpoint,
104
+ "config_mapping_fn": create_vae_diffusers_config_from_ldm,
105
+ "default_subfolder": "vae",
106
+ },
107
+ "ControlNetModel": {
108
+ "checkpoint_mapping_fn": convert_controlnet_checkpoint,
109
+ "config_mapping_fn": create_controlnet_diffusers_config_from_ldm,
110
+ },
111
+ "SD3Transformer2DModel": {
112
+ "checkpoint_mapping_fn": convert_sd3_transformer_checkpoint_to_diffusers,
113
+ "default_subfolder": "transformer",
114
+ },
115
+ "MotionAdapter": {
116
+ "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
117
+ },
118
+ "SparseControlNetModel": {
119
+ "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
120
+ },
121
+ "FluxTransformer2DModel": {
122
+ "checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers,
123
+ "default_subfolder": "transformer",
124
+ },
125
+ "ChromaTransformer2DModel": {
126
+ "checkpoint_mapping_fn": convert_chroma_transformer_checkpoint_to_diffusers,
127
+ "default_subfolder": "transformer",
128
+ },
129
+ "LTXVideoTransformer3DModel": {
130
+ "checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers,
131
+ "default_subfolder": "transformer",
132
+ },
133
+ "AutoencoderKLLTXVideo": {
134
+ "checkpoint_mapping_fn": convert_ltx_vae_checkpoint_to_diffusers,
135
+ "default_subfolder": "vae",
136
+ },
137
+ "AutoencoderDC": {"checkpoint_mapping_fn": convert_autoencoder_dc_checkpoint_to_diffusers},
138
+ "MochiTransformer3DModel": {
139
+ "checkpoint_mapping_fn": convert_mochi_transformer_checkpoint_to_diffusers,
140
+ "default_subfolder": "transformer",
141
+ },
142
+ "HunyuanVideoTransformer3DModel": {
143
+ "checkpoint_mapping_fn": convert_hunyuan_video_transformer_to_diffusers,
144
+ "default_subfolder": "transformer",
145
+ },
146
+ "AuraFlowTransformer2DModel": {
147
+ "checkpoint_mapping_fn": convert_auraflow_transformer_checkpoint_to_diffusers,
148
+ "default_subfolder": "transformer",
149
+ },
150
+ "Lumina2Transformer2DModel": {
151
+ "checkpoint_mapping_fn": convert_lumina2_to_diffusers,
152
+ "default_subfolder": "transformer",
153
+ },
154
+ "SanaTransformer2DModel": {
155
+ "checkpoint_mapping_fn": convert_sana_transformer_to_diffusers,
156
+ "default_subfolder": "transformer",
157
+ },
158
+ "WanTransformer3DModel": {
159
+ "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
160
+ "default_subfolder": "transformer",
161
+ },
162
+ "WanVACETransformer3DModel": {
163
+ "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
164
+ "default_subfolder": "transformer",
165
+ },
166
+ "AutoencoderKLWan": {
167
+ "checkpoint_mapping_fn": convert_wan_vae_to_diffusers,
168
+ "default_subfolder": "vae",
169
+ },
170
+ "HiDreamImageTransformer2DModel": {
171
+ "checkpoint_mapping_fn": convert_hidream_transformer_to_diffusers,
172
+ "default_subfolder": "transformer",
173
+ },
174
+ "CosmosTransformer3DModel": {
175
+ "checkpoint_mapping_fn": convert_cosmos_transformer_checkpoint_to_diffusers,
176
+ "default_subfolder": "transformer",
177
+ },
178
+ "QwenImageTransformer2DModel": {
179
+ "checkpoint_mapping_fn": lambda x: x,
180
+ "default_subfolder": "transformer",
181
+ },
182
+ "Flux2Transformer2DModel": {
183
+ "checkpoint_mapping_fn": convert_flux2_transformer_checkpoint_to_diffusers,
184
+ "default_subfolder": "transformer",
185
+ },
186
+ "ZImageTransformer2DModel": {
187
+ "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
188
+ "default_subfolder": "transformer",
189
+ },
190
+ "ZImageControlTransformer2DModel": {
191
+ "checkpoint_mapping_fn": convert_z_image_control_transformer_checkpoint_to_diffusers,
192
+ "default_subfolder": "transformer",
193
+ },
194
+ }
195
+
196
+ def get_class_obj_and_candidates(
197
+ library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
198
+ ):
199
+ """Simple helper method to retrieve class object of module as well as potential parent class objects"""
200
+ component_folder = os.path.join(cache_dir, component_name) if component_name and cache_dir else None
201
+
202
+ if is_pipeline_module:
203
+ pipeline_module = getattr(pipelines, library_name)
204
+
205
+ class_obj = getattr(pipeline_module, class_name)
206
+ class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
207
+ elif component_folder and os.path.isfile(os.path.join(component_folder, library_name + ".py")):
208
+ # load custom component
209
+ class_obj = get_class_from_dynamic_module(
210
+ component_folder, module_file=library_name + ".py", class_name=class_name
211
+ )
212
+ class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
213
+ else:
214
+ # else we just import it from the library.
215
+ library = importlib.import_module(library_name)
216
+
217
+ # Handle deprecated Transformers classes
218
+ if library_name == "transformers":
219
+ class_name = _maybe_remap_transformers_class(class_name) or class_name
220
+
221
+ try:
222
+ class_obj = getattr(library, class_name)
223
+ except:
224
+ module = importlib.import_module("diffusers_local")
225
+ class_obj = getattr(module, class_name)
226
+ class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
227
+
228
+ return class_obj, class_candidates
229
+
230
+ def _get_single_file_loadable_mapping_class(cls):
231
+ diffusers_module = importlib.import_module("diffusers")
232
+ class_name_str = cls.__name__
233
+ for loadable_class_str in SINGLE_FILE_LOADABLE_CLASSES:
234
+ try:
235
+ loadable_class = getattr(diffusers_module, loadable_class_str)
236
+ except:
237
+ module = importlib.import_module("diffusers_local")
238
+ loadable_class = getattr(module, loadable_class_str)
239
+ if issubclass(cls, loadable_class):
240
+ return loadable_class_str
241
+
242
+ return class_name_str
243
+
244
+ pipe_loading_utils.get_class_obj_and_candidates = get_class_obj_and_candidates
245
+ single_file_model.SINGLE_FILE_LOADABLE_CLASSES = SINGLE_FILE_LOADABLE_CLASSES
246
+ single_file_model._get_single_file_loadable_mapping_class = _get_single_file_loadable_mapping_class
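
For orientation, `patch.py` works entirely by monkey-patching diffusers' loading internals at import time: it registers `ZImageControlTransformer2DModel` in `SINGLE_FILE_LOADABLE_CLASSES` (with the checkpoint converter defined above) and teaches `get_class_obj_and_candidates` to fall back to the `diffusers_local` package when a class is not found in `diffusers` itself. Below is a small illustration of the key remapping performed by that converter, using hypothetical keys and tensor shapes:

```python
import torch

from diffusers_local.patch import convert_z_image_control_transformer_checkpoint_to_diffusers

# Hypothetical single-file checkpoint fragment (shapes are illustrative only).
ckpt = {
    "layers.0.attention.qkv.weight": torch.zeros(12, 4),  # fused QKV, split three ways
    "x_embedder.weight": torch.zeros(4, 4),
    "final_layer.linear.weight": torch.zeros(4, 4),
}

converted = convert_z_image_control_transformer_checkpoint_to_diffusers(ckpt)
print(sorted(converted))
# ['all_final_layer.2-1.linear.weight', 'all_x_embedder.2-1.weight',
#  'layers.0.attention.to_k.weight', 'layers.0.attention.to_q.weight',
#  'layers.0.attention.to_v.weight']
```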
diffusers_local/pipeline_z_image_control_unified.py ADDED
@@ -0,0 +1,288 @@
1
+ # Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import List, Optional, Union
17
+ import torch
18
+ from PIL import Image
19
+
20
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, DiffusionPipeline
21
+ from diffusers.loaders import FromSingleFileMixin, ZImageLoraLoaderMixin
22
+ from diffusers.image_processor import VaeImageProcessor
23
+ from diffusers.utils import logging
24
+ from diffusers.pipelines.z_image.pipeline_z_image import calculate_shift
25
+ from diffusers.pipelines.z_image.pipeline_output import ZImagePipelineOutput
26
+ from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel
27
+ from transformers import AutoTokenizer, PreTrainedModel
28
+ from diffusers.utils.torch_utils import randn_tensor
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
33
+ def calculate_shift(
34
+ image_seq_len,
35
+ base_seq_len: int = 256,
36
+ max_seq_len: int = 4096,
37
+ base_shift: float = 0.5,
38
+ max_shift: float = 1.15,
39
+ ):
40
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
41
+ b = base_shift - m * base_seq_len
42
+ mu = image_seq_len * m + b
43
+ return mu
44
+
45
+
46
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
47
+ def retrieve_timesteps(
48
+ scheduler,
49
+ num_inference_steps: Optional[int] = None,
50
+ device: Optional[Union[str, torch.device]] = None,
51
+ timesteps: Optional[List[int]] = None,
52
+ sigmas: Optional[List[float]] = None,
53
+ **kwargs,
54
+ ):
55
+ r"""
56
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
57
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
58
+
59
+ Args:
60
+ scheduler (`SchedulerMixin`):
61
+ The scheduler to get timesteps from.
62
+ num_inference_steps (`int`):
63
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
64
+ must be `None`.
65
+ device (`str` or `torch.device`, *optional*):
66
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
67
+ timesteps (`List[int]`, *optional*):
68
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
69
+ `num_inference_steps` and `sigmas` must be `None`.
70
+ sigmas (`List[float]`, *optional*):
71
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
72
+ `num_inference_steps` and `timesteps` must be `None`.
73
+
74
+ Returns:
75
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
76
+ second element is the number of inference steps.
77
+ """
78
+ if timesteps is not None and sigmas is not None:
79
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
80
+ if timesteps is not None:
81
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
82
+ if not accepts_timesteps:
83
+ raise ValueError(
84
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
85
+ f" timestep schedules. Please check whether you are using the correct scheduler."
86
+ )
87
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
88
+ timesteps = scheduler.timesteps
89
+ num_inference_steps = len(timesteps)
90
+ elif sigmas is not None:
91
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
92
+ if not accept_sigmas:
93
+ raise ValueError(
94
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
95
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
96
+ )
97
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
98
+ timesteps = scheduler.timesteps
99
+ num_inference_steps = len(timesteps)
100
+ else:
101
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
102
+ timesteps = scheduler.timesteps
103
+ return timesteps, num_inference_steps
104
+
105
+
106
+ class ZImageControlUnifiedPipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
107
+ _model_cpu_offload_seq = "text_encoder->transformer->vae"
108
+ _optional_components = []
109
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
110
+
111
+ def __init__(
112
+ self,
113
+ scheduler: FlowMatchEulerDiscreteScheduler,
114
+ vae: AutoencoderKL,
115
+ text_encoder: PreTrainedModel,
116
+ tokenizer: AutoTokenizer,
117
+ transformer: ZImageControlTransformer2DModel,
118
+ ):
119
+ self.register_modules(
120
+ vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
121
+ transformer=transformer, scheduler=scheduler
122
+ )
123
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
124
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
125
+
126
+ def _encode_prompt(self, prompt: str, device: torch.device, max_sequence_length: int) -> torch.Tensor:
127
+ messages = [{"role": "user", "content": prompt}]
128
+ if hasattr(self.tokenizer, "apply_chat_template"):
129
+ prompt_formatted = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
130
+ else:
131
+ prompt_formatted = prompt
132
+
133
+ text_inputs = self.tokenizer(prompt_formatted, padding="max_length", max_length=max_sequence_length, truncation=True, return_tensors="pt").to(device)
134
+ prompt_masks = text_inputs.attention_mask.bool()
135
+ with torch.no_grad():
136
+ prompt_embeds = self.text_encoder(input_ids=text_inputs.input_ids, attention_mask=prompt_masks, output_hidden_states=True).hidden_states[-2]
137
+ return prompt_embeds[0][prompt_masks[0]]
138
+
139
+ def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
140
+ shape = (batch_size, num_channels, height // self.vae_scale_factor, width // self.vae_scale_factor)
141
+ if latents is None:
142
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
143
+ else:
144
+ latents = latents.to(device)
145
+ return latents * self.scheduler.init_noise_sigma if hasattr(self.scheduler, "init_noise_sigma") else latents
146
+
147
+ def prepare_control_image(self, image, width, height, batch_size, num_images_per_prompt, device, dtype):
148
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=dtype)
149
+
150
+ image_batch_size = image.shape[0]
151
+ if image_batch_size == 1:
152
+ repeat_by = batch_size
153
+ else:
154
+ repeat_by = num_images_per_prompt
155
+ image = image.repeat_interleave(repeat_by, dim=0)
156
+ return image
157
+
158
+ @torch.no_grad()
159
+ def __call__(
160
+ self,
161
+ prompt: Union[str, List[str]],
162
+ image: Union[torch.Tensor, Image.Image],
163
+ negative_prompt: Optional[Union[str, List[str]]] = None,
164
+ height: Optional[int] = None,
165
+ width: Optional[int] = None,
166
+ num_inference_steps: int = 50,
167
+ guidance_scale: float = 0.0,
168
+ controlnet_conditioning_scale: float = 1.0,
169
+ num_images_per_prompt: int = 1,
170
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
171
+ output_type: str = "pil",
172
+ return_dict: bool = True,
173
+ **kwargs,
174
+ ):
175
+ device = self._execution_device
176
+ height = height or image.height
177
+ width = width or image.width
178
+
179
+ # 1. Prompt adjustment and batch size
180
+ if isinstance(prompt, str): prompt = [prompt]
181
+ if isinstance(negative_prompt, str): negative_prompt = [negative_prompt]
182
+
183
+ batch_size = len(prompt) * num_images_per_prompt
184
+ do_cfg = guidance_scale > 0.0
185
+
186
+ # 2. Encode text
187
+ # Repeat embeddings if num_images_per_prompt > 1
188
+ prompt_embeds_list = []
189
+ for p in prompt:
190
+ embed = self._encode_prompt(p, device, 512)
191
+ for _ in range(num_images_per_prompt):
192
+ prompt_embeds_list.append(embed)
193
+
194
+ if do_cfg:
195
+ if negative_prompt is None: negative_prompt = [""] * len(prompt)
196
+ neg_embeds_list = []
197
+ for np in negative_prompt:
198
+ embed = self._encode_prompt(np, device, 512)
199
+ for _ in range(num_images_per_prompt):
200
+ neg_embeds_list.append(embed)
201
+
202
+ prompt_input = neg_embeds_list + prompt_embeds_list
203
+ else:
204
+ prompt_input = prompt_embeds_list
205
+
206
+
207
+ # 3. Control image preparation
208
+ control_tensor = self.prepare_control_image(
209
+ image, width, height, batch_size, num_images_per_prompt, device, self.vae.dtype
210
+ )
211
+
212
+ if len(control_tensor.shape) == 3:
213
+ control_tensor = control_tensor.unsqueeze(0)
214
+
215
+ with torch.no_grad():
216
+ # Encode to latents
217
+ control_latents = self.vae.encode(control_tensor).latent_dist.mode()
218
+ control_latents = control_latents * self.vae.config.scaling_factor
219
+
220
+ # Channel fix: 4 channels -> 16 channels
221
+ if control_latents.shape[1] == 4 and self.transformer.in_channels == 16:
222
+ control_latents = control_latents.repeat(1, 4, 1, 1) # [B, 16, H, W]
223
+
224
+ control_latents = control_latents.to(dtype=self.transformer.dtype)
225
+
226
+ # Fix dimension: frame dimension [B, 16, 1, H, W]
227
+ control_latents = control_latents.unsqueeze(2)
228
+ control_context = list(control_latents.unbind(0))
229
+
230
+ # Expansion for CFG
231
+ if do_cfg:
232
+ control_context_input = control_context * 2
233
+ else:
234
+ control_context_input = control_context
235
+
236
+ # 4. Initial latents
237
+ latents = self.prepare_latents(
238
+ batch_size, self.transformer.in_channels, height, width,
239
+ prompt_embeds_list[0].dtype, device, generator
240
+ )
241
+ latents = latents.to(self.transformer.dtype)
242
+
243
+ # 5. Denoising loop
244
+ image_seq_len = (height // (self.vae_scale_factor)) * (width // (self.vae_scale_factor))
245
+ mu = calculate_shift(image_seq_len)
246
+ self.scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)
247
+
248
+ for t in self.progress_bar(self.scheduler.timesteps):
249
+ t_input = t.expand(len(prompt_input))
250
+ timestep_norm = (1000.0 - t_input) / 1000.0
251
+
252
+ latents_input = torch.cat([latents] * 2) if do_cfg else latents
253
+
254
+ # List of [16, 1, H, W]
255
+ latent_list = list(latents_input.unsqueeze(2).unbind(dim=0))
256
+
257
+ model_out_list = self.transformer(
258
+ x=latent_list,
259
+ t=timestep_norm,
260
+ cap_feats=prompt_input,
261
+ control_context=control_context_input,
262
+ conditioning_scale=controlnet_conditioning_scale,
263
+ )[0]
264
+
265
+ model_out = torch.stack(model_out_list, dim=0).squeeze(2)
266
+
267
+ if do_cfg:
268
+ neg_out, pos_out = model_out.chunk(2)
269
+ noise_pred = neg_out + guidance_scale * (pos_out - neg_out)
270
+ else:
271
+ noise_pred = model_out
272
+
273
+ noise_pred = -noise_pred
274
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
275
+
276
+ # 6. Decode
277
+ if not output_type == "latent":
278
+ # Pass 16 channels to VAE
279
+ latents_for_vae = latents.to(self.vae.dtype)
280
+ latents_for_vae = (latents_for_vae / self.vae.config.scaling_factor) + self.vae.config.shift_factor
281
+
282
+ image = self.vae.decode(latents_for_vae, return_dict=False)[0]
283
+ image = self.image_processor.postprocess(image, output_type=output_type)
284
+ else:
285
+ image = latents
286
+
287
+ self.maybe_free_model_hooks()
288
+ return ZImagePipelineOutput(images=image)
diffusers_local/z_image_control_transformer_2d.py ADDED
@@ -0,0 +1,815 @@
1
+ # Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import List, Optional, Tuple
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.nn.utils.rnn import pad_sequence
22
+
23
+ from diffusers.configuration_utils import ConfigMixin,register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.models.attention_processor import Attention
26
+ from diffusers.models.modeling_utils import ModelMixin
27
+ from diffusers.models.normalization import RMSNorm
28
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
29
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
30
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
31
+
32
+
33
+ ADALN_EMBED_DIM = 256
34
+ SEQ_MULTI_OF = 32
35
+
36
+
37
+ class TimestepEmbedder(nn.Module):
38
+ def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
39
+ super().__init__()
40
+ if mid_size is None:
41
+ mid_size = out_size
42
+ self.mlp = nn.Sequential(
43
+ nn.Linear(frequency_embedding_size, mid_size, bias=True),
44
+ nn.SiLU(),
45
+ nn.Linear(mid_size, out_size, bias=True),
46
+ )
47
+
48
+ self.frequency_embedding_size = frequency_embedding_size
49
+
50
+ @staticmethod
51
+ def timestep_embedding(t, dim, max_period=10000):
52
+ with torch.amp.autocast("cuda", enabled=False):
53
+ half = dim // 2
54
+ freqs = torch.exp(
55
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
56
+ )
57
+ args = t[:, None].float() * freqs[None]
58
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
59
+ if dim % 2:
60
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
61
+ return embedding
62
+
63
+ def forward(self, t):
64
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
65
+ weight_dtype = self.mlp[0].weight.dtype
66
+ compute_dtype = getattr(self.mlp[0], "compute_dtype", None)
67
+ if weight_dtype.is_floating_point:
68
+ t_freq = t_freq.to(weight_dtype)
69
+ elif compute_dtype is not None:
70
+ t_freq = t_freq.to(compute_dtype)
71
+ t_emb = self.mlp(t_freq)
72
+ return t_emb
73
+
74
+
75
+ class ZSingleStreamAttnProcessor:
76
+ """
77
+ Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
78
+ original Z-ImageAttention module.
79
+ """
80
+
81
+ _attention_backend = None
82
+ _parallel_config = None
83
+
84
+ def __init__(self):
85
+ if not hasattr(F, "scaled_dot_product_attention"):
86
+ raise ImportError(
87
+ "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
88
+ )
89
+
90
+ def __call__(
91
+ self,
92
+ attn: Attention,
93
+ hidden_states: torch.Tensor,
94
+ encoder_hidden_states: Optional[torch.Tensor] = None,
95
+ attention_mask: Optional[torch.Tensor] = None,
96
+ freqs_cis: Optional[torch.Tensor] = None,
97
+ ) -> torch.Tensor:
98
+ query = attn.to_q(hidden_states)
99
+ key = attn.to_k(hidden_states)
100
+ value = attn.to_v(hidden_states)
101
+
102
+ query = query.unflatten(-1, (attn.heads, -1))
103
+ key = key.unflatten(-1, (attn.heads, -1))
104
+ value = value.unflatten(-1, (attn.heads, -1))
105
+
106
+ # Apply Norms
107
+ if attn.norm_q is not None:
108
+ query = attn.norm_q(query)
109
+ if attn.norm_k is not None:
110
+ key = attn.norm_k(key)
111
+
112
+ # Apply RoPE
113
+ def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
114
+ with torch.amp.autocast("cuda", enabled=False):
115
+ x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
116
+ freqs_cis = freqs_cis.unsqueeze(2)
117
+ x_out = torch.view_as_real(x * freqs_cis).flatten(3)
118
+ return x_out.type_as(x_in) # todo
119
+
120
+ if freqs_cis is not None:
121
+ query = apply_rotary_emb(query, freqs_cis)
122
+ key = apply_rotary_emb(key, freqs_cis)
123
+
124
+ # Cast to correct dtype
125
+ dtype = query.dtype
126
+ query, key = query.to(dtype), key.to(dtype)
127
+
128
+ # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
129
+ if attention_mask is not None and attention_mask.ndim == 2:
130
+ attention_mask = attention_mask[:, None, None, :]
131
+
132
+ # Compute joint attention
133
+ hidden_states = dispatch_attention_fn(
134
+ query,
135
+ key,
136
+ value,
137
+ attn_mask=attention_mask,
138
+ dropout_p=0.0,
139
+ is_causal=False,
140
+ backend=self._attention_backend,
141
+ parallel_config=self._parallel_config,
142
+ )
143
+
144
+ # Reshape back
145
+ hidden_states = hidden_states.flatten(2, 3)
146
+ hidden_states = hidden_states.to(dtype)
147
+
148
+ output = attn.to_out[0](hidden_states)
149
+ if len(attn.to_out) > 1: # dropout
150
+ output = attn.to_out[1](output)
151
+
152
+ return output
153
+
154
+
155
+ class FeedForward(nn.Module):
156
+ def __init__(self, dim: int, hidden_dim: int):
157
+ super().__init__()
158
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
159
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
160
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
161
+
162
+ def _forward_silu_gating(self, x1, x3):
163
+ return F.silu(x1) * x3
164
+
165
+ def forward(self, x):
166
+ return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
167
+
168
+ def zero_module(module):
169
+ for p in module.parameters():
170
+ nn.init.zeros_(p)
171
+ return module
172
+
173
+ @maybe_allow_in_graph
174
+ class ZImageTransformerBlock(nn.Module):
175
+ def __init__(
176
+ self,
177
+ layer_id: int,
178
+ dim: int,
179
+ n_heads: int,
180
+ n_kv_heads: int,
181
+ norm_eps: float,
182
+ qk_norm: bool,
183
+ modulation=True,
184
+ ):
185
+ super().__init__()
186
+ self.dim = dim
187
+ self.head_dim = dim // n_heads
188
+
189
+ # Refactored to use diffusers Attention with custom processor
190
+ # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm
191
+ self.attention = Attention(
192
+ query_dim=dim,
193
+ cross_attention_dim=None,
194
+ dim_head=dim // n_heads,
195
+ heads=n_heads,
196
+ qk_norm="rms_norm" if qk_norm else None,
197
+ eps=1e-5,
198
+ bias=False,
199
+ out_bias=False,
200
+ processor=ZSingleStreamAttnProcessor(),
201
+ )
202
+
203
+ self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
204
+ self.layer_id = layer_id
205
+
206
+ self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
207
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
208
+
209
+ self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
210
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
211
+
212
+ self.modulation = modulation
213
+ if modulation:
214
+ self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True))
215
+
216
+ def forward(
217
+ self,
218
+ x: torch.Tensor,
219
+ attn_mask: torch.Tensor,
220
+ freqs_cis: torch.Tensor,
221
+ adaln_input: Optional[torch.Tensor] = None,
222
+ ):
223
+ if self.modulation:
224
+ assert adaln_input is not None
225
+ scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
226
+ gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
227
+ scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
228
+
229
+ # Attention block
230
+ attn_out = self.attention(
231
+ self.attention_norm1(x) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis
232
+ )
233
+ x = x + gate_msa * self.attention_norm2(attn_out)
234
+
235
+ # FFN block
236
+ x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
237
+ else:
238
+ # Attention block
239
+ attn_out = self.attention(self.attention_norm1(x), attention_mask=attn_mask, freqs_cis=freqs_cis)
240
+ x = x + self.attention_norm2(attn_out)
241
+
242
+ # FFN block
243
+ x = x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x)))
244
+
245
+ return x
246
+
247
+ @maybe_allow_in_graph
248
+ class ZImageControlTransformerBlock(ZImageTransformerBlock):
249
+ def __init__(
250
+ self,
251
+ layer_id: int,
252
+ dim: int,
253
+ n_heads: int,
254
+ n_kv_heads: int,
255
+ norm_eps: float,
256
+ qk_norm: bool,
257
+ modulation=True,
258
+ block_id=0,
259
+ ):
260
+ super().__init__(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation)
261
+ self.block_id = block_id
262
+ if block_id == 0:
263
+ self.before_proj = zero_module(nn.Linear(self.dim, self.dim))
264
+ self.after_proj = zero_module(nn.Linear(self.dim, self.dim))
265
+
266
+ def forward(
267
+ self,
268
+ c: torch.Tensor,
269
+ x: torch.Tensor,
270
+ attn_mask: torch.Tensor,
271
+ freqs_cis: torch.Tensor,
272
+ adaln_input: Optional[torch.Tensor] = None,
273
+ ):
274
+ if self.block_id == 0:
275
+ c = self.before_proj(c) + x
276
+ all_c = []
277
+ else:
278
+ all_c = list(torch.unbind(c))
279
+ c = all_c.pop(-1)
280
+
281
+ c = super().forward(c, attn_mask, freqs_cis, adaln_input)
282
+ c_skip = self.after_proj(c)
283
+ all_c += [c_skip, c]
284
+ c = torch.stack(all_c)
285
+ return c
286
+
287
+ class FinalLayer(nn.Module):
288
+ def __init__(self, hidden_size, out_channels):
289
+ super().__init__()
290
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
291
+ self.linear = nn.Linear(hidden_size, out_channels, bias=True)
292
+
293
+ self.adaLN_modulation = nn.Sequential(
294
+ nn.SiLU(),
295
+ nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
296
+ )
297
+
298
+ def forward(self, x, c):
299
+ scale = 1.0 + self.adaLN_modulation(c)
300
+ x = self.norm_final(x) * scale.unsqueeze(1)
301
+ x = self.linear(x)
302
+ return x
303
+
304
+
305
+ class RopeEmbedder:
306
+ def __init__(
307
+ self,
308
+ theta: float = 256.0,
309
+ axes_dims: List[int] = (16, 56, 56),
310
+ axes_lens: List[int] = (64, 128, 128),
311
+ ):
312
+ self.theta = theta
313
+ self.axes_dims = axes_dims
314
+ self.axes_lens = axes_lens
315
+ assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must have the same length"
316
+ self.freqs_cis = None
317
+
318
+ @staticmethod
319
+ def precompute_freqs_cis(dim: List[int], end: List[int], theta: float = 256.0):
320
+ with torch.device("cpu"):
321
+ freqs_cis = []
322
+ for i, (d, e) in enumerate(zip(dim, end)):
323
+ freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d))
324
+ timestep = torch.arange(e, device=freqs.device, dtype=torch.float64)
325
+ freqs = torch.outer(timestep, freqs).float()
326
+ freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to(torch.complex64) # complex64
327
+ freqs_cis.append(freqs_cis_i)
328
+
329
+ return freqs_cis
330
+
331
+ def __call__(self, ids: torch.Tensor):
332
+ assert ids.ndim == 2
333
+ assert ids.shape[-1] == len(self.axes_dims)
334
+ device = ids.device
335
+
336
+ if self.freqs_cis is None:
337
+ self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta)
338
+ self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
339
+ else:
340
+ # Ensure freqs_cis are on the same device as ids
341
+ if self.freqs_cis[0].device != device:
342
+ self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
343
+
344
+ result = []
345
+ for i in range(len(self.axes_dims)):
346
+ index = ids[:, i]
347
+ result.append(self.freqs_cis[i][index])
348
+ return torch.cat(result, dim=-1)
349
+
350
+
351
+ class ZImageControlTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
352
+ _supports_gradient_checkpointing = True
353
+ _no_split_modules = ["ZImageTransformerBlock", "ZImageControlTransformerBlock"]
354
+ _repeated_blocks = ["ZImageTransformerBlock", "ZImageControlTransformerBlock"]
355
+ _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"] # precision sensitive layers
356
+
357
+ @register_to_config
358
+ def __init__(
359
+ self,
360
+ all_patch_size=(2,),
361
+ all_f_patch_size=(1,),
362
+ in_channels=16,
363
+ dim=3840,
364
+ n_layers=30,
365
+ n_refiner_layers=2,
366
+ n_heads=30,
367
+ n_kv_heads=30,
368
+ norm_eps=1e-5,
369
+ qk_norm=True,
370
+ cap_feat_dim=2560,
371
+ rope_theta=256.0,
372
+ t_scale=1000.0,
373
+ axes_dims=[32, 48, 48],
374
+ axes_lens=[1024, 512, 512],
375
+ control_layers_places: List[int] = [0, 5, 10, 15, 20, 25],
376
+ control_in_dim=16,
377
+ ) -> None:
378
+ super().__init__()
379
+
380
+ self.in_channels = in_channels
381
+ self.out_channels = in_channels
382
+ self.all_patch_size = all_patch_size
383
+ self.all_f_patch_size = all_f_patch_size
384
+ self.dim = dim
385
+ self.n_heads = n_heads
386
+
387
+ self.rope_theta = rope_theta
388
+ self.t_scale = t_scale
389
+ self.gradient_checkpointing = False
390
+
391
+ assert len(all_patch_size) == len(all_f_patch_size)
392
+
393
+ all_x_embedder = {}
394
+ all_final_layer = {}
395
+ for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
396
+ x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * in_channels, dim, bias=True)
397
+ all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
398
+
399
+ final_layer = FinalLayer(dim, patch_size * patch_size * f_patch_size * self.out_channels)
400
+ all_final_layer[f"{patch_size}-{f_patch_size}"] = final_layer
401
+
402
+ self.all_x_embedder = nn.ModuleDict(all_x_embedder)
403
+ self.all_final_layer = nn.ModuleDict(all_final_layer)
404
+ self.noise_refiner = nn.ModuleList(
405
+ [
406
+ ZImageTransformerBlock(
407
+ 1000 + layer_id,
408
+ dim,
409
+ n_heads,
410
+ n_kv_heads,
411
+ norm_eps,
412
+ qk_norm,
413
+ modulation=True,
414
+ )
415
+ for layer_id in range(n_refiner_layers)
416
+ ]
417
+ )
418
+ self.context_refiner = nn.ModuleList(
419
+ [
420
+ ZImageTransformerBlock(
421
+ layer_id,
422
+ dim,
423
+ n_heads,
424
+ n_kv_heads,
425
+ norm_eps,
426
+ qk_norm,
427
+ modulation=False,
428
+ )
429
+ for layer_id in range(n_refiner_layers)
430
+ ]
431
+ )
432
+ self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
433
+ self.cap_embedder = nn.Sequential(RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, dim, bias=True))
434
+
435
+ self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
436
+ self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))
437
+
438
+ self.layers = nn.ModuleList(
439
+ [
440
+ ZImageTransformerBlock(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm)
441
+ for layer_id in range(n_layers)
442
+ ]
443
+ )
444
+ head_dim = dim // n_heads
445
+ assert head_dim == sum(axes_dims)
446
+ self.axes_dims = axes_dims
447
+ self.axes_lens = axes_lens
448
+
449
+ self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=axes_dims, axes_lens=axes_lens)
450
+
451
+ self.control_layers_places = [i for i in range(0, self.n_layers, 2)] if control_layers_places is None else control_layers_places
452
+ self.control_in_dim = self.dim if control_in_dim is None else control_in_dim
453
+
454
+ assert 0 in self.control_layers_places
455
+
456
+ # control blocks
457
+ self.control_layers = nn.ModuleList(
458
+ [
459
+ ZImageControlTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=i)
460
+ for i in self.control_layers_places
461
+ ]
462
+ )
463
+
464
+ # control patch embeddings
465
+ all_x_embedder = {}
466
+ for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
467
+ x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True)
468
+ all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
469
+
470
+ self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
471
+ self.control_noise_refiner = nn.ModuleList(
472
+ [
473
+ ZImageTransformerBlock(
474
+ 1000 + layer_id,
475
+ dim,
476
+ n_heads,
477
+ n_kv_heads,
478
+ norm_eps,
479
+ qk_norm,
480
+ modulation=True,
481
+ )
482
+ for layer_id in range(n_refiner_layers)
483
+ ]
484
+ )
485
+
486
+ def unpatchify(self, x: List[torch.Tensor], size: List[Tuple], patch_size, f_patch_size) -> List[torch.Tensor]:
487
+ pH = pW = patch_size
488
+ pF = f_patch_size
489
+ bsz = len(x)
490
+ assert len(size) == bsz
491
+ for i in range(bsz):
492
+ F, H, W = size[i]
493
+ ori_len = (F // pF) * (H // pH) * (W // pW)
494
+ # "f h w pf ph pw c -> c (f pf) (h ph) (w pw)"
495
+ x[i] = (
496
+ x[i][:ori_len]
497
+ .view(F // pF, H // pH, W // pW, pF, pH, pW, self.out_channels)
498
+ .permute(6, 0, 3, 1, 4, 2, 5)
499
+ .reshape(self.out_channels, F, H, W)
500
+ )
501
+ return x
502
+
503
+ @staticmethod
504
+ def create_coordinate_grid(size, start=None, device=None):
505
+ if start is None:
506
+ start = (0 for _ in size)
507
+
508
+ axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
509
+ grids = torch.meshgrid(axes, indexing="ij")
510
+ return torch.stack(grids, dim=-1)
511
+
512
+ def patchify_and_embed(
513
+ self,
514
+ all_image: List[torch.Tensor],
515
+ all_cap_feats: List[torch.Tensor],
516
+ patch_size: int,
517
+ f_patch_size: int,
518
+ ):
519
+ pH = pW = patch_size
520
+ pF = f_patch_size
521
+ device = all_image[0].device
522
+
523
+ all_image_out = []
524
+ all_image_size = []
525
+ all_image_pos_ids = []
526
+ all_image_pad_mask = []
527
+ all_cap_pos_ids = []
528
+ all_cap_pad_mask = []
529
+ all_cap_feats_out = []
530
+
531
+ for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)):
532
+ ### Process Caption
533
+ cap_ori_len = len(cap_feat)
534
+ cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
535
+ # padded position ids
536
+ cap_padded_pos_ids = self.create_coordinate_grid(
537
+ size=(cap_ori_len + cap_padding_len, 1, 1),
538
+ start=(1, 0, 0),
539
+ device=device,
540
+ ).flatten(0, 2)
541
+ all_cap_pos_ids.append(cap_padded_pos_ids)
542
+ # pad mask
543
+ cap_pad_mask = torch.cat(
544
+ [
545
+ torch.zeros((cap_ori_len,), dtype=torch.bool, device=device),
546
+ torch.ones((cap_padding_len,), dtype=torch.bool, device=device),
547
+ ],
548
+ dim=0,
549
+ )
550
+ all_cap_pad_mask.append(
551
+ cap_pad_mask if cap_padding_len > 0 else torch.zeros((cap_ori_len,), dtype=torch.bool, device=device)
552
+ )
553
+
554
+ # padded feature
555
+ cap_padded_feat = torch.cat([cap_feat, cap_feat[-1:].repeat(cap_padding_len, 1)], dim=0)
556
+ all_cap_feats_out.append(cap_padded_feat)
557
+
558
+ ### Process Image
559
+ C, F, H, W = image.size()
560
+ all_image_size.append((F, H, W))
561
+ F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
562
+
563
+ image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
564
+ # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
565
+ image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
566
+
567
+ image_ori_len = len(image)
568
+ image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
569
+
570
+ image_ori_pos_ids = self.create_coordinate_grid(
571
+ size=(F_tokens, H_tokens, W_tokens),
572
+ start=(cap_ori_len + cap_padding_len + 1, 0, 0),
573
+ device=device,
574
+ ).flatten(0, 2)
575
+ image_padded_pos_ids = torch.cat(
576
+ [
577
+ image_ori_pos_ids,
578
+ self.create_coordinate_grid(size=(1, 1, 1), start=(0, 0, 0), device=device)
579
+ .flatten(0, 2)
580
+ .repeat(image_padding_len, 1),
581
+ ],
582
+ dim=0,
583
+ )
584
+ all_image_pos_ids.append(image_padded_pos_ids if image_padding_len > 0 else image_ori_pos_ids)
585
+ # pad mask
586
+ image_pad_mask = torch.cat(
587
+ [
588
+ torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
589
+ torch.ones((image_padding_len,), dtype=torch.bool, device=device),
590
+ ],
591
+ dim=0,
592
+ )
593
+ all_image_pad_mask.append(
594
+ image_pad_mask
595
+ if image_padding_len > 0
596
+ else torch.zeros((image_ori_len,), dtype=torch.bool, device=device)
597
+ )
598
+ # padded feature
599
+ image_padded_feat = torch.cat(
600
+ [image, image[-1:].repeat(image_padding_len, 1)],
601
+ dim=0,
602
+ )
603
+ all_image_out.append(image_padded_feat if image_padding_len > 0 else image)
604
+
605
+ return (
606
+ all_image_out,
607
+ all_cap_feats_out,
608
+ all_image_size,
609
+ all_image_pos_ids,
610
+ all_cap_pos_ids,
611
+ all_image_pad_mask,
612
+ all_cap_pad_mask,
613
+ )
614
+
615
+ def patchify(
616
+ self,
617
+ all_image: List[torch.Tensor],
618
+ patch_size: int,
619
+ f_patch_size: int,
620
+ ):
621
+ pH = pW = patch_size
622
+ pF = f_patch_size
623
+ all_image_out = []
624
+
625
+ for i, image in enumerate(all_image):
626
+ ### Process Image
627
+ C, F, H, W = image.size()
628
+ F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
629
+
630
+ image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
631
+ # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
632
+ image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
633
+
634
+ image_ori_len = len(image)
635
+ image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
636
+
637
+ # padded feature
638
+ image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
639
+ all_image_out.append(image_padded_feat)
640
+
641
+ return all_image_out
642
+
643
+ def forward(
644
+ self,
645
+ x: List[torch.Tensor],
646
+ t,
647
+ cap_feats: List[torch.Tensor],
648
+ patch_size=2,
649
+ f_patch_size=1,
650
+ control_context: Optional[List[torch.Tensor]] = None,
651
+ conditioning_scale: float = 1.0,
652
+ return_dict: bool = True,
653
+ ):
654
+ assert patch_size in self.all_patch_size
655
+ assert f_patch_size in self.all_f_patch_size
656
+
657
+ bsz = len(x)
658
+ device = x[0].device
659
+ t = t * self.t_scale
660
+ t = self.t_embedder(t)
661
+
662
+ (
663
+ x,
664
+ cap_feats,
665
+ x_size,
666
+ x_pos_ids,
667
+ cap_pos_ids,
668
+ x_inner_pad_mask,
669
+ cap_inner_pad_mask,
670
+ ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size)
671
+
672
+ # x embed & refine
673
+ x_item_seqlens = [len(_) for _ in x]
674
+ assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens)
675
+ x_max_item_seqlen = max(x_item_seqlens)
676
+
677
+ x = torch.cat(x, dim=0)
678
+ x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
679
+
680
+ # Match t_embedder output dtype to x for layerwise casting compatibility
681
+ adaln_input = t.type_as(x)
682
+ x[torch.cat(x_inner_pad_mask)] = self.x_pad_token.to(x.dtype)
683
+ x = list(x.split(x_item_seqlens, dim=0))
684
+ x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split([len(_) for _ in x_pos_ids], dim=0))
685
+
686
+ x = pad_sequence(x, batch_first=True, padding_value=0.0)
687
+ x_freqs_cis = pad_sequence(x_freqs_cis, batch_first=True, padding_value=0.0)
688
+ # Make the lengths match explicitly so Dynamo's symbolic shape inference does not raise compilation errors
689
+ x_freqs_cis = x_freqs_cis[:, : x.shape[1]]
690
+
691
+ x_attn_mask = torch.zeros((bsz, x_max_item_seqlen), dtype=torch.bool, device=device)
692
+ for i, seq_len in enumerate(x_item_seqlens):
693
+ x_attn_mask[i, :seq_len] = 1
694
+
695
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
696
+ for layer in self.noise_refiner:
697
+ x = self._gradient_checkpointing_func(layer, x, x_attn_mask, x_freqs_cis, adaln_input)
698
+ else:
699
+ for layer in self.noise_refiner:
700
+ x = layer(x, x_attn_mask, x_freqs_cis, adaln_input)
701
+
702
+ # cap embed & refine
703
+ cap_item_seqlens = [len(_) for _ in cap_feats]
704
+ cap_max_item_seqlen = max(cap_item_seqlens)
705
+
706
+ cap_feats = torch.cat(cap_feats, dim=0)
707
+ cap_feats = self.cap_embedder(cap_feats)
708
+ cap_feats[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token.to(dtype=cap_feats.dtype)
709
+ cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
710
+ cap_freqs_cis = list(
711
+ self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split([len(_) for _ in cap_pos_ids], dim=0)
712
+ )
713
+
714
+ cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0)
715
+ cap_freqs_cis = pad_sequence(cap_freqs_cis, batch_first=True, padding_value=0.0)
716
+ # Make the lengths match explicitly so Dynamo's symbolic shape inference does not raise compilation errors
717
+ cap_freqs_cis = cap_freqs_cis[:, : cap_feats.shape[1]]
718
+
719
+ cap_attn_mask = torch.zeros((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device)
720
+ for i, seq_len in enumerate(cap_item_seqlens):
721
+ cap_attn_mask[i, :seq_len] = 1
722
+
723
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
724
+ for layer in self.context_refiner:
725
+ cap_feats = self._gradient_checkpointing_func(layer, cap_feats, cap_attn_mask, cap_freqs_cis)
726
+ else:
727
+ for layer in self.context_refiner:
728
+ cap_feats = layer(cap_feats, cap_attn_mask, cap_freqs_cis)
729
+
730
+ # unified
731
+ unified = []
732
+ unified_freqs_cis = []
733
+ for i in range(bsz):
734
+ x_len = x_item_seqlens[i]
735
+ cap_len = cap_item_seqlens[i]
736
+ unified.append(torch.cat([x[i][:x_len], cap_feats[i][:cap_len]]))
737
+ unified_freqs_cis.append(torch.cat([x_freqs_cis[i][:x_len], cap_freqs_cis[i][:cap_len]]))
738
+ unified_item_seqlens = [a + b for a, b in zip(cap_item_seqlens, x_item_seqlens)]
739
+ assert unified_item_seqlens == [len(_) for _ in unified]
740
+ unified_max_item_seqlen = max(unified_item_seqlens)
741
+
742
+ unified = pad_sequence(unified, batch_first=True, padding_value=0.0)
743
+ unified_freqs_cis = pad_sequence(unified_freqs_cis, batch_first=True, padding_value=0.0)
744
+ unified_attn_mask = torch.zeros((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device)
745
+ for i, seq_len in enumerate(unified_item_seqlens):
746
+ unified_attn_mask[i, :seq_len] = 1
747
+
748
+ ## ControlNet start
749
+
750
+ controlnet_block_samples = None
751
+ if control_context is not None:
752
+ control_context = self.patchify(control_context, patch_size, f_patch_size)
753
+ control_context = torch.cat(control_context, dim=0)
754
+ control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context)
755
+
756
+ control_context[torch.cat(x_inner_pad_mask)] = self.x_pad_token
757
+ control_context = list(control_context.split(x_item_seqlens, dim=0))
758
+
759
+ control_context = pad_sequence(control_context, batch_first=True, padding_value=0.0)
760
+
761
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
762
+ for layer in self.control_noise_refiner:
763
+ control_context = self._gradient_checkpointing_func(
764
+ layer, control_context, x_attn_mask, x_freqs_cis, adaln_input
765
+ )
766
+ else:
767
+ for layer in self.control_noise_refiner:
768
+ control_context = layer(control_context, x_attn_mask, x_freqs_cis, adaln_input)
769
+
770
+ # unified
771
+ control_context_unified = []
772
+ for i in range(bsz):
773
+ x_len = x_item_seqlens[i]
774
+ cap_len = cap_item_seqlens[i]
775
+ control_context_unified.append(torch.cat([control_context[i][:x_len], cap_feats[i][:cap_len]]))
776
+ control_context_unified = pad_sequence(control_context_unified, batch_first=True, padding_value=0.0)
777
+
778
+ for layer in self.control_layers:
779
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
780
+ control_context_unified = self._gradient_checkpointing_func(
781
+ layer, control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input
782
+ )
783
+ else:
784
+ control_context_unified = layer(
785
+ control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input
786
+ )
787
+
788
+ hints = torch.unbind(control_context_unified)[:-1]
789
+ controlnet_block_samples = {
790
+ layer_idx: hints[idx] * conditioning_scale for idx, layer_idx in enumerate(self.control_layers_places)
791
+ }
792
+
793
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
794
+ for layer_idx, layer in enumerate(self.layers):
795
+ unified = self._gradient_checkpointing_func(
796
+ layer, unified, unified_attn_mask, unified_freqs_cis, adaln_input
797
+ )
798
+ if controlnet_block_samples is not None:
799
+ if layer_idx in controlnet_block_samples:
800
+ unified = unified + controlnet_block_samples[layer_idx]
801
+ else:
802
+ for layer_idx, layer in enumerate(self.layers):
803
+ unified = layer(unified, unified_attn_mask, unified_freqs_cis, adaln_input)
804
+ if controlnet_block_samples is not None:
805
+ if layer_idx in controlnet_block_samples:
806
+ unified = unified + controlnet_block_samples[layer_idx]
807
+
808
+ unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
809
+ unified = list(unified.unbind(dim=0))
810
+ x = self.unpatchify(unified, x_size, patch_size, f_patch_size)
811
+
812
+ if not return_dict:
813
+ return (x,)
814
+
815
+ return Transformer2DModelOutput(sample=x)
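
The forward pass above concatenates image and caption tokens into one unified sequence and keeps the two streams disjoint in RoPE space: caption tokens occupy frame indices 1..cap_len, while image tokens start at frame cap_len + 1. A minimal sketch of that position-id layout (PyTorch only, ignoring the SEQ_MULTI_OF padding, with made-up sizes):

import torch

def coordinate_grid(size, start=(0, 0, 0)):
    # Same idea as create_coordinate_grid above: one int32 axis per (frame, height, width) dim.
    axes = [torch.arange(s, s + span, dtype=torch.int32) for s, span in zip(start, size)]
    return torch.stack(torch.meshgrid(*axes, indexing="ij"), dim=-1)

cap_len = 6  # hypothetical caption length
cap_ids = coordinate_grid((cap_len, 1, 1), start=(1, 0, 0)).flatten(0, 2)
img_ids = coordinate_grid((1, 4, 4), start=(cap_len + 1, 0, 0)).flatten(0, 2)

print(cap_ids[:, 0].tolist())  # [1, 2, 3, 4, 5, 6] -> caption tokens advance along the frame axis
print(img_ids[0].tolist())     # [7, 0, 0] -> image tokens begin one frame after the caption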
example/canny_man.png ADDED
example/depth_cat.png ADDED

Git LFS Details

  • SHA256: 2088b40e100f912183036763a08de02b62e7ea26bc413448f15977452a7dd0b2
  • Pointer size: 131 Bytes
  • Size of remote file: 294 kB
example/depth_man.png ADDED

Git LFS Details

  • SHA256: 05924ff275319bee258e44dca63184c22d7006bfa18b71d94e387feab8cf9625
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
example/teed_man.png ADDED
infer_gguf.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import time
+
+ import torch
+ from PIL import Image
+
+ # 1. Import all necessary components
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
+ from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel
+ import diffusers_local.patch  # apply required patches
+
+
+ def main():
+     # 2. Set params
+     BASE_MODEL_ID = "."
+     GGUF_FILENAME = "z_image_turbo_control_unified_q4_k_m.gguf"
+     prompt = "a cat"
+     negative_prompt = None  # unused when guidance_scale=0
+
+     target_height, target_width = 1024, 1024
+     num_inference_steps = 9
+     guidance_scale = 0.0  # correct setting for Turbo models
+     controlnet_conditioning_scale = 0.7
+     seed = 42
+     shift = 3.0
+     generator = torch.Generator("cuda").manual_seed(seed)
+
+     print("Loading Model Components...")
+     vae = AutoencoderKL.from_pretrained(BASE_MODEL_ID, subfolder="vae", torch_dtype=torch.bfloat16)
+     text_encoder = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, subfolder="tokenizer")
+     scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)
+
+     print("Loading Main Transformer from GGUF...")
+     transformer = ZImageControlTransformer2DModel.from_single_file(
+         os.path.join(BASE_MODEL_ID, GGUF_FILENAME),
+         quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
+         low_cpu_mem_usage=True,
+         torch_dtype=torch.bfloat16,
+     )
+
+     print("Loading Pipeline...")
+     pipe = ZImageControlUnifiedPipeline(
+         vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
+         scheduler=scheduler, transformer=transformer,
+     )
+
+     # Apply optimization (optional)
+     pipe.enable_group_offload(
+         onload_device="cuda",
+         offload_device="cpu",
+         offload_type="leaf_level",
+         low_cpu_mem_usage=True,
+         use_stream=True,
+     )
+
+     print("\nRunning Inference...")
+     input_image = Image.open("example/depth_cat.png").convert("RGB")
+     input_image_resized = input_image.resize((target_width, target_height), Image.Resampling.LANCZOS)
+
+     start_inference_time = time.time()
+
+     generated_image = pipe(
+         prompt=prompt,
+         negative_prompt=negative_prompt,
+         image=input_image_resized,
+         height=target_height,
+         width=target_width,
+         num_inference_steps=num_inference_steps,
+         guidance_scale=guidance_scale,
+         controlnet_conditioning_scale=controlnet_conditioning_scale,
+         generator=generator,
+     ).images[0]
+
+     end_inference_time = time.time()
+     print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")
+
+     # Save Output
+     os.makedirs("outputs", exist_ok=True)
+     output_filename = "outputs/z_image_controlnet_result_gguf.png"
+     generated_image.save(output_filename)
+     print(f"Image successfully saved as '{output_filename}'")
+     generated_image.show()
+
+
+ if __name__ == "__main__":
+     main()
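
If enable_group_offload is unavailable or too aggressive on a given setup, a simpler fallback is the standard diffusers model offload hook. A sketch, assuming ZImageControlUnifiedPipeline inherits the usual DiffusionPipeline helpers (requires accelerate):

import torch
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
import diffusers_local.patch  # apply required patches

pipe = ZImageControlUnifiedPipeline.from_pretrained(".", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # keeps only the active sub-module on the GPU; slower, but lower VRAM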
infer_pretrained.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ import time
+ import os
+ from PIL import Image
+
+ # 1. Import all necessary components
+ from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
+ from diffusers import FlowMatchEulerDiscreteScheduler
+
+ import diffusers_local.patch  # apply required patches
+
+
+ def main():
+     # 2. Set params
+     BASE_MODEL_ID = "."
+     prompt = "a man"
+     negative_prompt = None  # unused when guidance_scale=0
+
+     target_height, target_width = 1024, 1024
+     num_inference_steps = 9
+     guidance_scale = 0.0  # correct setting for Turbo models
+     controlnet_conditioning_scale = 0.7
+     seed = 42
+     shift = 3.0
+     generator = torch.Generator("cuda").manual_seed(seed)
+
+     print("Loading Pipeline...")
+     scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)
+
+     pipe = ZImageControlUnifiedPipeline.from_pretrained(
+         BASE_MODEL_ID
+     )
+     pipe.scheduler = scheduler
+
+     # Apply optimization (optional)
+     pipe.enable_group_offload(
+         onload_device="cuda",
+         offload_device="cpu",
+         offload_type="leaf_level",
+         low_cpu_mem_usage=True,
+         use_stream=True,
+     )
+
+     print("\nRunning Inference...")
+
+     input_image = Image.open("example/depth_man.png").convert("RGB")
+     input_image_resized = input_image.resize((target_width, target_height), Image.Resampling.LANCZOS)
+     start_inference_time = time.time()
+
+     with torch.inference_mode():
+         generated_image = pipe(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             image=input_image_resized,
+             height=target_height,
+             width=target_width,
+             num_inference_steps=num_inference_steps,
+             guidance_scale=guidance_scale,
+             controlnet_conditioning_scale=controlnet_conditioning_scale,
+             generator=generator,
+         ).images[0]
+
+     end_inference_time = time.time()
+     print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")
+
+     # Save Output
+     os.makedirs("outputs", exist_ok=True)
+     output_filename = "outputs/z_image_controlnet_result.png"
+     generated_image.save(output_filename)
+     print(f"Image successfully saved as '{output_filename}'")
+     generated_image.show()
+
+
+ if __name__ == "__main__":
+     main()
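
Both scripts resize the control image to a fixed 1024x1024, which distorts non-square inputs. A hypothetical helper (not part of this repo) that preserves aspect ratio while snapping both sides to a multiple of 16 (the multiple is an assumption based on the patch size of 2 and the VAE's 8x spatial downsampling):

from PIL import Image

def resize_control_image(image: Image.Image, long_side: int = 1024, multiple: int = 16) -> Image.Image:
    # Scale so the longer side is about long_side, then round both sides to the nearest multiple.
    w, h = image.size
    scale = long_side / max(w, h)
    new_w = max(multiple, round(w * scale / multiple) * multiple)
    new_h = max(multiple, round(h * scale / multiple) * multiple)
    return image.resize((new_w, new_h), Image.Resampling.LANCZOS)

control = resize_control_image(Image.open("example/depth_man.png").convert("RGB"))
print(control.size)  # pass control.width / control.height as width / height to the pipeline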
model_index.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_class_name": "ZImagePipeline",
+   "_diffusers_version": "0.36.0.dev0",
+   "scheduler": [
+     "diffusers",
+     "FlowMatchEulerDiscreteScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "Qwen3Model"
+   ],
+   "tokenizer": [
+     "transformers",
+     "Qwen2Tokenizer"
+   ],
+   "transformer": [
+     "diffusers",
+     "ZImageControlTransformer2DModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ --extra-index-url https://download.pytorch.org/whl/cu126
+
+ diffusers
+ torch==2.8.0+cu126
+ torchvision==0.23.0+cu126
+ torchaudio==2.8.0+cu126
+ transformers==4.56.0
+ bitsandbytes==0.48.1
+ xformers==0.0.32.post2
+ hf_xet
+ gguf
+ accelerate
+ protobuf
+ sacremoses
+ sentencepiece
+ scipy
+ triton-windows<3.5; sys_platform == 'win32'
+ triton==3.4.0; sys_platform != 'win32'
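
A quick environment check (a sketch, not shipped with this repo) to confirm the pinned stack resolved before running the scripts:

import torch
import torchvision
import transformers
import diffusers

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("torchvision:", torchvision.__version__)
print("transformers:", transformers.__version__)  # requirements.txt pins 4.56.0
print("diffusers:", diffusers.__version__)        # unpinned; any recent release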
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_class_name": "FlowMatchEulerDiscreteScheduler",
+   "_diffusers_version": "0.36.0.dev0",
+   "num_train_timesteps": 1000,
+   "use_dynamic_shifting": false,
+   "shift": 3.0
+ }
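
The inference scripts rebuild this scheduler by hand with shift=3.0; loading it from this config should give the same object (a sketch using the standard from_pretrained path):

from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(".", subfolder="scheduler")
print(scheduler.config.num_train_timesteps, scheduler.config.shift)  # 1000 3.0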
text_encoder/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 9728,
+   "max_position_embeddings": 40960,
+   "max_window_layers": 36,
+   "model_type": "qwen3",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 36,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
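
The hidden_size of 2560 here is, presumably, the caption feature dimension the transformer's cap_embedder consumes; the exact layer and prompt template the pipeline uses are not shown in this file, so the check below is only a rough sketch:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".", subfolder="tokenizer")
text_encoder = AutoModelForCausalLM.from_pretrained(".", subfolder="text_encoder", torch_dtype=torch.bfloat16)

tokens = tokenizer("a cat", return_tensors="pt")
with torch.no_grad():
    hidden = text_encoder(**tokens, output_hidden_states=True).hidden_states[-1]
print(hidden.shape)  # expected (1, seq_len, 2560)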
text_encoder/generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.6,
+   "top_k": 20,
+   "top_p": 0.95,
+   "transformers_version": "4.51.0"
+ }
text_encoder/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:328a91d3122359d5547f9d79521205bc0a46e1f79a792dfe650e99fc2d651223
+ size 3957900840
text_encoder/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cd087b316306a68c562436b5492edbcf6e16c6dba3a1308279caa5a58e21ca5
+ size 3987450520
text_encoder/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ca841ee75b9c61267c0c6148fd8d096d3d21b6d3e161256a9b878154f91fc52
+ size 99630640
text_encoder/model.safetensors.index.json ADDED
@@ -0,0 +1,405 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 8044936192
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
53
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
54
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
55
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
56
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
57
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
58
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
59
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
60
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
61
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
62
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
64
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
65
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
69
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
72
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors",
74
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
76
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
77
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
78
+ "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
79
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
80
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
81
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
82
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
83
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
84
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
87
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
88
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
90
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
91
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
92
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
93
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
94
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
95
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
122
+ "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
125
+ "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
140
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
141
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
142
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
143
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
144
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
145
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
146
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
147
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
148
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
149
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
150
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
153
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
154
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
157
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
158
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
162
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
163
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
164
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
170
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
185
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
197
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
199
+ "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
200
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
202
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
206
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
208
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
209
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
210
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
211
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
212
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
214
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
217
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
218
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
220
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
221
+ "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
222
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
223
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
224
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
226
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
230
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
242
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
245
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
246
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
247
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
254
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
256
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
257
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
258
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
259
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors",
272
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
273
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
274
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
275
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
276
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
277
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
278
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
279
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
280
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
281
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
282
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors",
283
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
284
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
285
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
286
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
287
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
288
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
289
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
290
+ "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
291
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
292
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
293
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00003.safetensors",
294
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
295
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
296
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
297
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
302
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
304
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00003.safetensors",
305
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
306
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
307
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
312
+ "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
313
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
314
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
315
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00003.safetensors",
316
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
317
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
318
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
319
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
320
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
321
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
322
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
323
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
324
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
325
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
326
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
328
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
329
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
331
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
332
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
333
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
334
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
335
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
336
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
337
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
338
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
340
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
342
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
343
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
344
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
345
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
346
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
347
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
348
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
349
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
350
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
351
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
352
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
353
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
354
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
355
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
356
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
357
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
358
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
359
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
360
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
361
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
362
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
363
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
364
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
365
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
366
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
367
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
368
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
369
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
370
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
371
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
372
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
373
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
374
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
375
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
376
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
377
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
378
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
379
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
386
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
388
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
392
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
393
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
394
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
395
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
396
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
397
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
398
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
400
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
401
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
402
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
403
+ "model.norm.weight": "model-00003-of-00003.safetensors"
404
+ }
405
+ }
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if 
enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
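
This tokenizer config declares a `Qwen2Tokenizer` with ChatML-style special tokens (`<|im_start|>` / `<|im_end|>`) and a chat template that supports tool calls and an optional `<think>` block. A minimal sketch of how it might be exercised with `transformers`, assuming the repo has been downloaded locally (the `enable_thinking` flag is an extra keyword that is forwarded to the template):

```python
from transformers import AutoTokenizer

# The tokenizer/ folder of this repo holds the config shown above.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

messages = [{"role": "user", "content": "A watercolor fox in a snowy forest"}]

# Render the ChatML prompt without tokenizing; enable_thinking=False inserts an empty <think> block.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(prompt)
```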
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_class_name": "ZImageControlTransformer2DModel",
+ "_diffusers_version": "0.36.0.dev0",
+ "all_f_patch_size": [
+ 1
+ ],
+ "all_patch_size": [
+ 2
+ ],
+ "axes_dims": [
+ 32,
+ 48,
+ 48
+ ],
+ "axes_lens": [
+ 1536,
+ 512,
+ 512
+ ],
+ "cap_feat_dim": 2560,
+ "dim": 3840,
+ "in_channels": 16,
+ "n_heads": 30,
+ "n_kv_heads": 30,
+ "n_layers": 30,
+ "n_refiner_layers": 2,
+ "norm_eps": 1e-05,
+ "qk_norm": true,
+ "rope_theta": 256.0,
+ "t_scale": 1000.0
+ }
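
This config instantiates the custom `ZImageControlTransformer2DModel` (a 30-layer, 3840-dim diffusion transformer with 2 refiner layers) shipped in `diffusers_local/z_image_control_transformer_2d.py`. A minimal sketch of loading it directly, assuming the class is importable from that module and follows the usual diffusers `ModelMixin` conventions:

```python
import torch
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel

# Load the sharded bf16 weights from the transformer/ subfolder of this repo (downloaded locally).
transformer = ZImageControlTransformer2DModel.from_pretrained(
    ".",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
print(sum(p.numel() for p in transformer.parameters()) / 1e9, "B parameters")
```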
transformer/diffusion_pytorch_model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95facd593e2549e8252acb571c653d57f7ddb7f1060d4e81712f152555a88804
+ size 9973693184
transformer/diffusion_pytorch_model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4bbe43ee184a1fb5af4b412d27555f532893bdc3165b1149e304ed82b5d7015
+ size 9973714824
transformer/diffusion_pytorch_model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aba4e37a590e63210878160a718d916d80398f4e1f78ab6c9b2b2a00d92769fa
+ size 4672282880
transformer/diffusion_pytorch_model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86c085c0d7853f12ce5183499934b54d08371c60f549c5a6b20615cd23989388
+ size 3101572408
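
Each of the four shard entries above is a Git LFS pointer (`version` / `oid sha256:...` / `size` in bytes) rather than the weights themselves; the roughly 27.7 GB of actual tensors are fetched through LFS. A small sketch, not part of the repo, for checking a downloaded shard against its pointer:

```python
import hashlib
import os

def verify_lfs_pointer(shard_path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a downloaded file against the oid/size recorded in its LFS pointer."""
    if os.path.getsize(shard_path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(shard_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values copied from the pointer for shard 4 above.
ok = verify_lfs_pointer(
    "transformer/diffusion_pytorch_model-00004-of-00004.safetensors",
    "86c085c0d7853f12ce5183499934b54d08371c60f549c5a6b20615cd23989388",
    3101572408,
)
print("shard 4 OK:", ok)
```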
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
@@ -0,0 +1,664 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 27721207352
4
+ },
5
+ "weight_map": {
6
+ "all_final_layer.2-1.adaLN_modulation.1.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
7
+ "all_final_layer.2-1.adaLN_modulation.1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
8
+ "all_final_layer.2-1.linear.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
9
+ "all_final_layer.2-1.linear.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
10
+ "all_x_embedder.2-1.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
11
+ "all_x_embedder.2-1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
12
+ "cap_embedder.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
13
+ "cap_embedder.1.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
14
+ "cap_embedder.1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
15
+ "cap_pad_token": "diffusion_pytorch_model-00001-of-00004.safetensors",
16
+ "context_refiner.0.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
17
+ "context_refiner.0.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
18
+ "context_refiner.0.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
19
+ "context_refiner.0.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
20
+ "context_refiner.0.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
21
+ "context_refiner.0.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
22
+ "context_refiner.0.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
23
+ "context_refiner.0.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
24
+ "context_refiner.0.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
25
+ "context_refiner.0.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
26
+ "context_refiner.0.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
27
+ "context_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
28
+ "context_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
29
+ "context_refiner.1.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
30
+ "context_refiner.1.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
31
+ "context_refiner.1.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
32
+ "context_refiner.1.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
33
+ "context_refiner.1.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
34
+ "context_refiner.1.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
35
+ "context_refiner.1.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
36
+ "context_refiner.1.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
37
+ "context_refiner.1.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
38
+ "context_refiner.1.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
39
+ "context_refiner.1.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
40
+ "context_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
41
+ "context_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
42
+ "layers.0.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
43
+ "layers.0.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
44
+ "layers.0.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
45
+ "layers.0.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
46
+ "layers.0.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
47
+ "layers.0.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
48
+ "layers.0.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
49
+ "layers.0.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
50
+ "layers.0.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
51
+ "layers.0.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
52
+ "layers.0.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
53
+ "layers.0.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
54
+ "layers.0.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
55
+ "layers.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
56
+ "layers.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
57
+ "layers.1.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
58
+ "layers.1.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
59
+ "layers.1.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
60
+ "layers.1.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
61
+ "layers.1.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
62
+ "layers.1.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
63
+ "layers.1.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
64
+ "layers.1.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
65
+ "layers.1.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
66
+ "layers.1.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
67
+ "layers.1.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
68
+ "layers.1.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
69
+ "layers.1.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
70
+ "layers.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
71
+ "layers.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
72
+ "layers.10.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
73
+ "layers.10.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
74
+ "layers.10.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
75
+ "layers.10.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
76
+ "layers.10.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
77
+ "layers.10.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
78
+ "layers.10.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
79
+ "layers.10.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
80
+ "layers.10.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
81
+ "layers.10.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
82
+ "layers.10.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
83
+ "layers.10.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
84
+ "layers.10.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
85
+ "layers.10.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
86
+ "layers.10.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
87
+ "layers.11.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
88
+ "layers.11.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
89
+ "layers.11.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
90
+ "layers.11.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
91
+ "layers.11.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
92
+ "layers.11.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
93
+ "layers.11.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
94
+ "layers.11.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
95
+ "layers.11.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
96
+ "layers.11.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
97
+ "layers.11.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
98
+ "layers.11.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
99
+ "layers.11.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
100
+ "layers.11.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
101
+ "layers.11.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
102
+ "layers.12.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
103
+ "layers.12.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
104
+ "layers.12.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
105
+ "layers.12.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
106
+ "layers.12.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
107
+ "layers.12.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
108
+ "layers.12.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
109
+ "layers.12.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
110
+ "layers.12.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
111
+ "layers.12.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
112
+ "layers.12.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
113
+ "layers.12.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
114
+ "layers.12.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
115
+ "layers.12.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
116
+ "layers.12.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
117
+ "layers.13.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
118
+ "layers.13.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
119
+ "layers.13.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
120
+ "layers.13.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
121
+ "layers.13.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
122
+ "layers.13.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
123
+ "layers.13.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
124
+ "layers.13.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
125
+ "layers.13.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
126
+ "layers.13.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
127
+ "layers.13.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
128
+ "layers.13.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
129
+ "layers.13.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
130
+ "layers.13.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
131
+ "layers.13.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
132
+ "layers.14.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
133
+ "layers.14.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
134
+ "layers.14.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
135
+ "layers.14.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
136
+ "layers.14.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
137
+ "layers.14.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
138
+ "layers.14.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
139
+ "layers.14.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
140
+ "layers.14.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
141
+ "layers.14.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
142
+ "layers.14.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
143
+ "layers.14.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
144
+ "layers.14.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
145
+ "layers.14.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
146
+ "layers.14.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
147
+ "layers.15.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
148
+ "layers.15.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
149
+ "layers.15.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
150
+ "layers.15.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
151
+ "layers.15.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
152
+ "layers.15.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
153
+ "layers.15.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
154
+ "layers.15.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
155
+ "layers.15.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
156
+ "layers.15.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
157
+ "layers.15.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
158
+ "layers.15.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
159
+ "layers.15.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
160
+ "layers.15.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
161
+ "layers.15.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
162
+ "layers.16.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
163
+ "layers.16.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
164
+ "layers.16.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
165
+ "layers.16.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
166
+ "layers.16.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
167
+ "layers.16.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
168
+ "layers.16.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
169
+ "layers.16.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
170
+ "layers.16.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
171
+ "layers.16.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
172
+ "layers.16.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
173
+ "layers.16.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
174
+ "layers.16.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
175
+ "layers.16.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
176
+ "layers.16.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
177
+ "layers.17.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
178
+ "layers.17.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
179
+ "layers.17.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
180
+ "layers.17.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
181
+ "layers.17.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
182
+ "layers.17.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
183
+ "layers.17.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
184
+ "layers.17.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
185
+ "layers.17.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
186
+ "layers.17.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
187
+ "layers.17.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
188
+ "layers.17.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
189
+ "layers.17.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
190
+ "layers.17.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
191
+ "layers.17.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
192
+ "layers.18.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
193
+ "layers.18.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
194
+ "layers.18.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
195
+ "layers.18.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
196
+ "layers.18.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
197
+ "layers.18.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
198
+ "layers.18.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
199
+ "layers.18.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
200
+ "layers.18.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
201
+ "layers.18.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
202
+ "layers.18.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
203
+ "layers.18.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
204
+ "layers.18.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
205
+ "layers.18.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
206
+ "layers.18.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
207
+ "layers.19.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
208
+ "layers.19.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
209
+ "layers.19.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
210
+ "layers.19.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
211
+ "layers.19.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
212
+ "layers.19.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
213
+ "layers.19.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
214
+ "layers.19.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
215
+ "layers.19.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
216
+ "layers.19.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
217
+ "layers.19.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
218
+ "layers.19.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
219
+ "layers.19.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
220
+ "layers.19.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
221
+ "layers.19.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
222
+ "layers.2.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
223
+ "layers.2.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
224
+ "layers.2.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
225
+ "layers.2.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
226
+ "layers.2.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
227
+ "layers.2.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
228
+ "layers.2.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
229
+ "layers.2.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
230
+ "layers.2.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
231
+ "layers.2.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
232
+ "layers.2.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
233
+ "layers.2.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
234
+ "layers.2.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
235
+ "layers.2.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
236
+ "layers.2.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
237
+ "layers.20.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
238
+ "layers.20.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
239
+ "layers.20.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
240
+ "layers.20.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
241
+ "layers.20.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
242
+ "layers.20.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
243
+ "layers.20.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
244
+ "layers.20.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
245
+ "layers.20.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
246
+ "layers.20.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
247
+ "layers.20.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
248
+ "layers.20.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
249
+ "layers.20.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
250
+ "layers.20.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
251
+ "layers.20.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
252
+ "layers.21.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
253
+ "layers.21.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
254
+ "layers.21.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
255
+ "layers.21.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
256
+ "layers.21.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
257
+ "layers.21.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
258
+ "layers.21.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
259
+ "layers.21.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
260
+ "layers.21.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
261
+ "layers.21.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
262
+ "layers.21.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
263
+ "layers.21.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
264
+ "layers.21.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
265
+ "layers.21.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
266
+ "layers.21.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
267
+ "layers.22.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
268
+ "layers.22.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
269
+ "layers.22.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
270
+ "layers.22.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
271
+ "layers.22.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
272
+ "layers.22.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
273
+ "layers.22.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
274
+ "layers.22.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
275
+ "layers.22.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
276
+ "layers.22.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
277
+ "layers.22.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
278
+ "layers.22.feed_forward.w2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
279
+ "layers.22.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
280
+ "layers.22.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
281
+ "layers.22.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
282
+ "layers.23.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
283
+ "layers.23.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
284
+ "layers.23.attention.norm_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
285
+ "layers.23.attention.norm_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
286
+ "layers.23.attention.to_k.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
287
+ "layers.23.attention.to_out.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
288
+ "layers.23.attention.to_q.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
289
+ "layers.23.attention.to_v.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
290
+ "layers.23.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
291
+ "layers.23.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
292
+ "layers.23.feed_forward.w1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
293
+ "layers.23.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
294
+ "layers.23.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
295
+ "layers.23.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
296
+ "layers.23.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
297
+ "layers.24.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
298
+ "layers.24.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
299
+ "layers.24.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
300
+ "layers.24.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
301
+ "layers.24.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
302
+ "layers.24.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
303
+ "layers.24.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
304
+ "layers.24.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
305
+ "layers.24.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
306
+ "layers.24.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
307
+ "layers.24.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
308
+ "layers.24.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
309
+ "layers.24.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
310
+ "layers.24.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
311
+ "layers.24.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
312
+ "layers.25.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
313
+ "layers.25.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
314
+ "layers.25.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
315
+ "layers.25.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
316
+ "layers.25.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
317
+ "layers.25.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
318
+ "layers.25.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
319
+ "layers.25.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
320
+ "layers.25.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
321
+ "layers.25.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
322
+ "layers.25.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
323
+ "layers.25.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
324
+ "layers.25.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
325
+ "layers.25.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
326
+ "layers.25.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
327
+ "layers.26.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
328
+ "layers.26.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
329
+ "layers.26.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
330
+ "layers.26.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
331
+ "layers.26.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
332
+ "layers.26.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
333
+ "layers.26.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
334
+ "layers.26.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
335
+ "layers.26.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
336
+ "layers.26.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
337
+ "layers.26.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
338
+ "layers.26.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
339
+ "layers.26.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
340
+ "layers.26.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
341
+ "layers.26.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
342
+ "layers.27.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
343
+ "layers.27.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
344
+ "layers.27.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
345
+ "layers.27.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
346
+ "layers.27.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
347
+ "layers.27.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
348
+ "layers.27.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
349
+ "layers.27.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
350
+ "layers.27.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
351
+ "layers.27.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
352
+ "layers.27.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
353
+ "layers.27.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
354
+ "layers.27.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
355
+ "layers.27.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
356
+ "layers.27.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
357
+ "layers.28.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
358
+ "layers.28.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
359
+ "layers.28.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
360
+ "layers.28.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
361
+ "layers.28.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
362
+ "layers.28.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
363
+ "layers.28.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
364
+ "layers.28.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
365
+ "layers.28.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
366
+ "layers.28.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
367
+ "layers.28.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
368
+ "layers.28.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
369
+ "layers.28.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
370
+ "layers.28.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
371
+ "layers.28.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
372
+ "layers.29.adaLN_modulation.0.bias": "diffusion_pytorch_model-00003-of-00004.safetensors",
373
+ "layers.29.adaLN_modulation.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
374
+ "layers.29.attention.norm_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
375
+ "layers.29.attention.norm_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
376
+ "layers.29.attention.to_k.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
377
+ "layers.29.attention.to_out.0.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
378
+ "layers.29.attention.to_q.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
379
+ "layers.29.attention.to_v.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
380
+ "layers.29.attention_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
381
+ "layers.29.attention_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
382
+ "layers.29.feed_forward.w1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
383
+ "layers.29.feed_forward.w2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
384
+ "layers.29.feed_forward.w3.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
385
+ "layers.29.ffn_norm1.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
386
+ "layers.29.ffn_norm2.weight": "diffusion_pytorch_model-00003-of-00004.safetensors",
387
+ "layers.3.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
388
+ "layers.3.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
389
+ "layers.3.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
390
+ "layers.3.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
391
+ "layers.3.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
392
+ "layers.3.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
393
+ "layers.3.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
394
+ "layers.3.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
395
+ "layers.3.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
396
+ "layers.3.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
397
+ "layers.3.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
398
+ "layers.3.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
399
+ "layers.3.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
400
+ "layers.3.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
401
+ "layers.3.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
402
+ "layers.4.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
403
+ "layers.4.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
404
+ "layers.4.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
405
+ "layers.4.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
406
+ "layers.4.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
407
+ "layers.4.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
408
+ "layers.4.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
409
+ "layers.4.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
410
+ "layers.4.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
411
+ "layers.4.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
412
+ "layers.4.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
413
+ "layers.4.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
414
+ "layers.4.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
415
+ "layers.4.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
416
+ "layers.4.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
417
+ "layers.5.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
418
+ "layers.5.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
419
+ "layers.5.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
420
+ "layers.5.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
421
+ "layers.5.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
422
+ "layers.5.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
423
+ "layers.5.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
424
+ "layers.5.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
425
+ "layers.5.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
426
+ "layers.5.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
427
+ "layers.5.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
428
+ "layers.5.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
429
+ "layers.5.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
430
+ "layers.5.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
431
+ "layers.5.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
432
+ "layers.6.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
433
+ "layers.6.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
434
+ "layers.6.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
435
+ "layers.6.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
436
+ "layers.6.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
437
+ "layers.6.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
438
+ "layers.6.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
439
+ "layers.6.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
440
+ "layers.6.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
441
+ "layers.6.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
442
+ "layers.6.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
443
+ "layers.6.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
444
+ "layers.6.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
445
+ "layers.6.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
446
+ "layers.6.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
447
+ "layers.7.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
448
+ "layers.7.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
449
+ "layers.7.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
450
+ "layers.7.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
451
+ "layers.7.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
452
+ "layers.7.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
453
+ "layers.7.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
454
+ "layers.7.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
455
+ "layers.7.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
456
+ "layers.7.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
457
+ "layers.7.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
458
+ "layers.7.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
459
+ "layers.7.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
460
+ "layers.7.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
461
+ "layers.7.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
462
+ "layers.8.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
463
+ "layers.8.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
464
+ "layers.8.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
465
+ "layers.8.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
466
+ "layers.8.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
467
+ "layers.8.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
468
+ "layers.8.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
469
+ "layers.8.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
470
+ "layers.8.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
471
+ "layers.8.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
472
+ "layers.8.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
473
+ "layers.8.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
474
+ "layers.8.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
475
+ "layers.8.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
476
+ "layers.8.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
477
+ "layers.9.adaLN_modulation.0.bias": "diffusion_pytorch_model-00002-of-00004.safetensors",
478
+ "layers.9.adaLN_modulation.0.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
479
+ "layers.9.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
480
+ "layers.9.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
481
+ "layers.9.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
482
+ "layers.9.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
483
+ "layers.9.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
484
+ "layers.9.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
485
+ "layers.9.attention_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
486
+ "layers.9.attention_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
487
+ "layers.9.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
488
+ "layers.9.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
489
+ "layers.9.feed_forward.w3.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
490
+ "layers.9.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
491
+ "layers.9.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00004.safetensors",
492
+ "noise_refiner.0.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
493
+ "noise_refiner.0.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
494
+ "noise_refiner.0.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
495
+ "noise_refiner.0.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
496
+ "noise_refiner.0.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
497
+ "noise_refiner.0.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
498
+ "noise_refiner.0.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
499
+ "noise_refiner.0.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
500
+ "noise_refiner.0.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
501
+ "noise_refiner.0.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
502
+ "noise_refiner.0.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
503
+ "noise_refiner.0.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
504
+ "noise_refiner.0.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
505
+ "noise_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
506
+ "noise_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
507
+ "noise_refiner.1.adaLN_modulation.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
508
+ "noise_refiner.1.adaLN_modulation.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
509
+ "noise_refiner.1.attention.norm_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
510
+ "noise_refiner.1.attention.norm_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
511
+ "noise_refiner.1.attention.to_k.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
512
+ "noise_refiner.1.attention.to_out.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
513
+ "noise_refiner.1.attention.to_q.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
514
+ "noise_refiner.1.attention.to_v.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
515
+ "noise_refiner.1.attention_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
516
+ "noise_refiner.1.attention_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
517
+ "noise_refiner.1.feed_forward.w1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
518
+ "noise_refiner.1.feed_forward.w2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
519
+ "noise_refiner.1.feed_forward.w3.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
520
+ "noise_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
521
+ "noise_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
522
+ "t_embedder.mlp.0.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
523
+ "t_embedder.mlp.0.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
524
+ "t_embedder.mlp.2.bias": "diffusion_pytorch_model-00001-of-00004.safetensors",
525
+ "t_embedder.mlp.2.weight": "diffusion_pytorch_model-00001-of-00004.safetensors",
526
+ "x_pad_token": "diffusion_pytorch_model-00001-of-00004.safetensors",
527
+ "control_all_x_embedder.2-1.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
528
+ "control_all_x_embedder.2-1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
529
+ "control_layers.0.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
530
+ "control_layers.0.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
531
+ "control_layers.0.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
532
+ "control_layers.0.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
533
+ "control_layers.0.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
534
+ "control_layers.0.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
535
+ "control_layers.0.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
536
+ "control_layers.0.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
537
+ "control_layers.0.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
538
+ "control_layers.0.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
539
+ "control_layers.0.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
540
+ "control_layers.0.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
541
+ "control_layers.0.before_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
542
+ "control_layers.0.before_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
543
+ "control_layers.0.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
544
+ "control_layers.0.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
545
+ "control_layers.0.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
546
+ "control_layers.0.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
547
+ "control_layers.0.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
548
+ "control_layers.1.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
549
+ "control_layers.1.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
550
+ "control_layers.1.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
551
+ "control_layers.1.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
552
+ "control_layers.1.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
553
+ "control_layers.1.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
554
+ "control_layers.1.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
555
+ "control_layers.1.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
556
+ "control_layers.1.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
557
+ "control_layers.1.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
558
+ "control_layers.1.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
559
+ "control_layers.1.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
560
+ "control_layers.1.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
561
+ "control_layers.1.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
562
+ "control_layers.1.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
563
+ "control_layers.1.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
564
+ "control_layers.1.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
565
+ "control_layers.2.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
566
+ "control_layers.2.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
567
+ "control_layers.2.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
568
+ "control_layers.2.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
569
+ "control_layers.2.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
570
+ "control_layers.2.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
571
+ "control_layers.2.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
572
+ "control_layers.2.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
573
+ "control_layers.2.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
574
+ "control_layers.2.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
575
+ "control_layers.2.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
576
+ "control_layers.2.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
577
+ "control_layers.2.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
578
+ "control_layers.2.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
579
+ "control_layers.2.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
580
+ "control_layers.2.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
581
+ "control_layers.2.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
582
+ "control_layers.3.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
583
+ "control_layers.3.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
584
+ "control_layers.3.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
585
+ "control_layers.3.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
586
+ "control_layers.3.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
587
+ "control_layers.3.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
588
+ "control_layers.3.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
589
+ "control_layers.3.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
590
+ "control_layers.3.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
591
+ "control_layers.3.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
592
+ "control_layers.3.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
593
+ "control_layers.3.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
594
+ "control_layers.3.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
595
+ "control_layers.3.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
596
+ "control_layers.3.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
597
+ "control_layers.3.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
598
+ "control_layers.3.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
599
+ "control_layers.4.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
600
+ "control_layers.4.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
601
+ "control_layers.4.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
602
+ "control_layers.4.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
603
+ "control_layers.4.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
604
+ "control_layers.4.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
605
+ "control_layers.4.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
606
+ "control_layers.4.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
607
+ "control_layers.4.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
608
+ "control_layers.4.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
609
+ "control_layers.4.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
610
+ "control_layers.4.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
611
+ "control_layers.4.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
612
+ "control_layers.4.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
613
+ "control_layers.4.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
614
+ "control_layers.4.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
615
+ "control_layers.4.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
616
+ "control_layers.5.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
617
+ "control_layers.5.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
618
+ "control_layers.5.after_proj.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
619
+ "control_layers.5.after_proj.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
620
+ "control_layers.5.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
621
+ "control_layers.5.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
622
+ "control_layers.5.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
623
+ "control_layers.5.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
624
+ "control_layers.5.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
625
+ "control_layers.5.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
626
+ "control_layers.5.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
627
+ "control_layers.5.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
628
+ "control_layers.5.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
629
+ "control_layers.5.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
630
+ "control_layers.5.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
631
+ "control_layers.5.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
632
+ "control_layers.5.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
633
+ "control_noise_refiner.0.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
634
+ "control_noise_refiner.0.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
635
+ "control_noise_refiner.0.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
636
+ "control_noise_refiner.0.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
637
+ "control_noise_refiner.0.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
638
+ "control_noise_refiner.0.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
639
+ "control_noise_refiner.0.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
640
+ "control_noise_refiner.0.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
641
+ "control_noise_refiner.0.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
642
+ "control_noise_refiner.0.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
643
+ "control_noise_refiner.0.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
644
+ "control_noise_refiner.0.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
645
+ "control_noise_refiner.0.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
646
+ "control_noise_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
647
+ "control_noise_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
648
+ "control_noise_refiner.1.adaLN_modulation.0.bias": "diffusion_pytorch_model-00004-of-00004.safetensors",
649
+ "control_noise_refiner.1.adaLN_modulation.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
650
+ "control_noise_refiner.1.attention.norm_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
651
+ "control_noise_refiner.1.attention.norm_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
652
+ "control_noise_refiner.1.attention.to_k.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
653
+ "control_noise_refiner.1.attention.to_out.0.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
654
+ "control_noise_refiner.1.attention.to_q.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
655
+ "control_noise_refiner.1.attention.to_v.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
656
+ "control_noise_refiner.1.attention_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
657
+ "control_noise_refiner.1.attention_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
658
+ "control_noise_refiner.1.feed_forward.w1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
659
+ "control_noise_refiner.1.feed_forward.w2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
660
+ "control_noise_refiner.1.feed_forward.w3.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
661
+ "control_noise_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00004-of-00004.safetensors",
662
+ "control_noise_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00004-of-00004.safetensors"
663
+ }
664
+ }
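
The index above follows the standard sharded-safetensors layout: a `weight_map` object keyed by tensor name, with each value naming the shard file that holds that tensor (the control-branch tensors shown above all live in shard 00004). A minimal sketch of how that map can be used to pull a single tensor from the right shard; the local paths assume the repo has already been downloaded, and the key is taken from the map above:

```python
# Minimal sketch (not part of the repo): resolve a tensor to its shard via the
# safetensors index and load only that tensor. Local paths are assumptions.
import json
from safetensors import safe_open

with open("transformer/diffusion_pytorch_model.safetensors.index.json") as f:
    index = json.load(f)

name = "control_layers.0.after_proj.weight"   # key taken from the weight_map above
shard = index["weight_map"][name]             # -> "diffusion_pytorch_model-00004-of-00004.safetensors"

with safe_open(f"transformer/{shard}", framework="pt", device="cpu") as st:
    tensor = st.get_tensor(name)
print(shard, tuple(tensor.shape))
```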
vae/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "_class_name": "AutoencoderKL",
+ "_diffusers_version": "0.36.0.dev0",
+ "_name_or_path": "flux-dev",
+ "act_fn": "silu",
+ "block_out_channels": [
+ 128,
+ 256,
+ 512,
+ 512
+ ],
+ "down_block_types": [
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D"
+ ],
+ "force_upcast": true,
+ "in_channels": 3,
+ "latent_channels": 16,
+ "latents_mean": null,
+ "latents_std": null,
+ "layers_per_block": 2,
+ "mid_block_add_attention": true,
+ "norm_num_groups": 32,
+ "out_channels": 3,
+ "sample_size": 1024,
+ "scaling_factor": 0.3611,
+ "shift_factor": 0.1159,
+ "up_block_types": [
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D"
+ ],
+ "use_post_quant_conv": false,
+ "use_quant_conv": false
+ }
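
This is a Flux-style 16-latent-channel `AutoencoderKL` config (note `_name_or_path: "flux-dev"`); `scaling_factor` and `shift_factor` are the latent normalization constants a pipeline undoes before decoding. A minimal sketch of loading just the VAE with diffusers, with the repo id left as a placeholder and random latents standing in for real pipeline output:

```python
# Minimal sketch: load the VAE subfolder and decode latents using the
# scaling/shift factors declared in vae/config.json. Repo id is a placeholder.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(
    "your-namespace/z-image-turbo-control-unified",  # placeholder repo id
    subfolder="vae",
    torch_dtype=torch.bfloat16,
).to("cuda")

latents = torch.randn(1, 16, 128, 128, dtype=torch.bfloat16, device="cuda")  # 16 latent channels
with torch.no_grad():
    # Undo the latent normalization before decoding, as configured above.
    image = vae.decode(latents / vae.config.scaling_factor + vae.config.shift_factor).sample
```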
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5b59a26851551b67ae1fe58d32e76486e1e812def4696a4bea97f16604d40a3
+ size 167666902
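
As with the other binary files in this commit, only a Git LFS pointer is versioned; the `oid` is the SHA-256 of the actual ~168 MB safetensors payload and `size` is its byte count. A minimal sketch for verifying a downloaded copy against the pointer, with the local path assumed:

```python
# Minimal sketch: check a downloaded LFS object against the oid in its pointer file.
import hashlib, os

path = "vae/diffusion_pytorch_model.safetensors"  # assumed local path
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(os.path.getsize(path))  # should equal the size recorded in the pointer
print(h.hexdigest())          # should equal the sha256 oid recorded in the pointer
```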
z_image_turbo_control_unified_q4_k_m.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:783fb687e6a1e05711ca595a8f118474fc7f93f30252a25eec4d5ce2a87786b7
+ size 6469181248
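
This ~6.5 GB GGUF file carries the transformer weights quantized to Q4_K_M (per the filename). A minimal, hedged sketch for inspecting its metadata and tensor table with the `gguf` Python package; the package install and local filename are assumptions:

```python
# Minimal sketch (assumes `pip install gguf` and a local copy of the file):
# print a few metadata keys and the first few tensors with their quant types.
from gguf import GGUFReader

reader = GGUFReader("z_image_turbo_control_unified_q4_k_m.gguf")

for key in list(reader.fields)[:8]:      # metadata keys, e.g. architecture / quantization info
    print(key)

for t in reader.tensors[:8]:             # tensor name, shape, GGML quantization type
    print(t.name, list(t.shape), t.tensor_type.name)
```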