Commit 8e2dc9a
Parent(s): 9ab6494

add 4k inference
Files changed:

- NoiseTransformer.py +26 -0
- README.md +5 -5
- SVDNoiseUnet.py +430 -0
- __pycache__/NoiseTransformer.cpython-39.pyc +0 -0
- __pycache__/SVDNoiseUnet.cpython-39.pyc +0 -0
- __pycache__/customed_unipc_scheduler.cpython-39.pyc +0 -0
- __pycache__/dpm_solver_v3.cpython-39.pyc +0 -0
- __pycache__/free_lunch_utils.cpython-39.pyc +0 -0
- __pycache__/sampler.cpython-39.pyc +0 -0
- __pycache__/uni_pc.cpython-39.pyc +0 -0
- app.py +411 -9
- customed_unipc_scheduler.py +997 -0
- dpm_solver_v3.py +904 -0
- free_lunch_utils.py +303 -0
- requirements.txt +14 -0
- sampler.py +315 -0
- uni_pc.py +757 -0
NoiseTransformer.py
ADDED
@@ -0,0 +1,26 @@
import torch.nn as nn

from torch.nn import functional as F
from timm import create_model


__all__ = ['NoiseTransformer']

class NoiseTransformer(nn.Module):
    def __init__(self, resolution=(128, 96)):
        super().__init__()
        self.upsample = lambda x: F.interpolate(x, [224, 224])
        self.downsample = lambda x: F.interpolate(x, [resolution[0], resolution[1]])
        self.upconv = nn.Conv2d(7, 4, (1, 1), (1, 1), (0, 0))
        self.downconv = nn.Conv2d(4, 3, (1, 1), (1, 1), (0, 0))
        # self.upconv = nn.Conv2d(7,4,(1,1),(1,1),(0,0))
        self.swin = create_model("swin_tiny_patch4_window7_224", pretrained=True)


    def forward(self, x, residual=False):
        if residual:
            x = self.upconv(self.downsample(self.swin.forward_features(self.downconv(self.upsample(x))))) + x
        else:
            x = self.upconv(self.downsample(self.swin.forward_features(self.downconv(self.upsample(x)))))

        return x
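For orientation, a minimal smoke test of the class above (a sketch, not part of the commit). The input shape is an assumption inferred from the layer definitions, and it further assumes a timm version whose Swin `forward_features` returns a `(B, 7, 7, 768)` feature map, which `upconv` then treats as 7 channels; running it downloads the pretrained Swin-Tiny weights.

```python
# Hypothetical smoke test for NoiseTransformer (not in the commit).
import torch
from NoiseTransformer import NoiseTransformer

model = NoiseTransformer(resolution=(128, 96)).eval()
latent = torch.randn(1, 4, 128, 96)  # assumed: a 4-channel latent at size // 8
with torch.no_grad():
    out = model(latent, residual=True)  # residual=True adds the input back
print(out.shape)  # expected: torch.Size([1, 4, 128, 96])
```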
README.md
CHANGED
@@ -1,14 +1,14 @@

 ---
 title: Hyperparameters Are All You Need 4k
-emoji:
-colorFrom:
-colorTo:
+emoji: 🦀
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
-sdk_version: 6.0.
+sdk_version: 6.0.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description:
+short_description: A few-step UniPC solver with customed hyperparameters
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SVDNoiseUnet.py
ADDED
@@ -0,0 +1,430 @@
import torch
import torch.nn as nn
import einops

from torch.nn import functional as F
from torch.jit import Final
from timm.layers import use_fused_attn
from timm.models.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_, get_act_layer
from abc import abstractmethod
from NoiseTransformer import NoiseTransformer
from einops import rearrange

__all__ = ['SVDNoiseUnet', 'SVDNoiseUnet_Concise']


class Attention(nn.Module):
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SVDNoiseUnet(nn.Module):
    def __init__(self, in_channels=4, out_channels=4, resolution=(128, 96)):  # resolution = size // 8
        super(SVDNoiseUnet, self).__init__()

        _in_1 = int(resolution[0] * in_channels // 2)
        _out_1 = int(resolution[0] * out_channels // 2)

        _in_2 = int(resolution[1] * in_channels // 2)
        _out_2 = int(resolution[1] * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in_1, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out_1),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in_2, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out_2),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in_2, _out_2),
        )

        self.attention = Attention(_out_2)

        self.bn = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(192)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out_2, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out_2),
        )
        self.ffn = nn.Sequential(
            nn.Linear(256, 384),  # Expand
            nn.ReLU(inplace=True),
            nn.Linear(384, 192)  # Reduce to target size
        )
        self.ffn2 = nn.Sequential(
            nn.Linear(256, 384),  # Expand
            nn.ReLU(inplace=True),
            nn.Linear(384, 192)  # Reduce to target size
        )
        # self.adaptive_pool = nn.AdaptiveAvgPool2d((256, 192))

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        # Fold the 4 latent channels into one (2h x 2w) matrix per sample, e.g. (b, 256, 192) for resolution (128, 96)
        x = einops.rearrange(x, "b (a c) h w -> b (a h) (c w)", a=2, c=2)
        U, s, V = torch.linalg.svd(x)  # U -> [b, 256, 256], s -> [b, 192], V -> [b, 192, 192]
        U_T = U.permute(0, 2, 1)
        U_out = self.ffn(self.mlp1(U_T))
        U_out = self.bn(U_out)
        U_out = U_out.transpose(1, 2)
        U_out = self.ffn2(U_out)  # [b, 256, 256] -> [b, 192, 192] after both FFNs
        U_out = self.bn2(U_out)
        U_out = U_out.transpose(1, 2)
        # U_out = self.bn(U_out)
        V_out = self.mlp2(V)
        s_out = self.mlp3(s).unsqueeze(1)  # s -> [b, 1, 192], broadcast over rows
        out = U_out + V_out + s_out
        # print(out.size())
        out = out.squeeze(1)
        out = self.attention(out).mean(1)
        out = self.mlp4(out) + s  # residual update of the singular values
        diagonal_out = torch.diag_embed(out)
        padded_diag = F.pad(diagonal_out, (0, 0, 0, 64), mode='constant', value=0)  # [b, 192, 192] -> [b, 256, 192]
        pred = U @ padded_diag @ V  # reassemble the matrix from the modified spectrum
        return einops.rearrange(pred, "b (a h) (c w) -> b (a c) h w", a=2, c=2)


class SVDNoiseUnet64(nn.Module):
    def __init__(self, in_channels=4, out_channels=4, resolution=64):  # resolution = size // 8
        super(SVDNoiseUnet64, self).__init__()

        _in = int(resolution * in_channels // 2)
        _out = int(resolution * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in, _out),
        )

        self.attention = Attention(_out)

        self.bn = nn.BatchNorm2d(_out)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out),
        )

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        x = einops.rearrange(x, "b (a c) h w -> b (a h) (c w)", a=2, c=2)  # x -> [b, 128, 128]
        U, s, V = torch.linalg.svd(x)  # U -> [b, 128, 128], s -> [b, 128], V -> [b, 128, 128]
        U_T = U.permute(0, 2, 1)
        out = self.mlp1(U_T) + self.mlp2(V) + self.mlp3(s).unsqueeze(1)  # s -> [b, 1, 128], broadcast
        out = self.attention(out).mean(1)
        out = self.mlp4(out) + s
        pred = U @ torch.diag_embed(out) @ V
        return einops.rearrange(pred, "b (a h) (c w) -> b (a c) h w", a=2, c=2)



class SVDNoiseUnet128(nn.Module):
    def __init__(self, in_channels=4, out_channels=4, resolution=128):  # resolution = size // 8
        super(SVDNoiseUnet128, self).__init__()

        _in = int(resolution * in_channels // 2)
        _out = int(resolution * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in, _out),
        )

        self.attention = Attention(_out)

        self.bn = nn.BatchNorm2d(_out)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out),
        )

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        x = einops.rearrange(x, "b (a c) h w -> b (a h) (c w)", a=2, c=2)  # x -> [b, 256, 256]
        U, s, V = torch.linalg.svd(x)  # U -> [b, 256, 256], s -> [b, 256], V -> [b, 256, 256]
        U_T = U.permute(0, 2, 1)
        out = self.mlp1(U_T) + self.mlp2(V) + self.mlp3(s).unsqueeze(1)  # s -> [b, 1, 256], broadcast
        out = self.attention(out).mean(1)
        out = self.mlp4(out) + s
        pred = U @ torch.diag_embed(out) @ V
        return einops.rearrange(pred, "b (a h) (c w) -> b (a c) h w", a=2, c=2)



class SVDNoiseUnet_Concise(nn.Module):
    def __init__(self, in_channels=4, out_channels=4, resolution=64):
        super(SVDNoiseUnet_Concise, self).__init__()


from diffusers.models.normalization import AdaGroupNorm

class NPNet(nn.Module):
    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        super(NPNet, self).__init__()

        assert model_id in ['SD1.5', 'DreamShaper', 'DiT']

        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta
        ) = self.get_model()

    def save_model(self, save_path: str):
        """
        Save this NPNet so that get_model() can later reload it.
        """
        torch.save({
            "unet_svd": self.unet_svd.state_dict(),
            "unet_embedding": self.unet_embedding.state_dict(),
            "embeeding": self.text_embedding.state_dict(),  # matches get_model's key
            "alpha": self._alpha,
            "beta": self._beta,
        }, save_path)
        print(f"NPNet saved to {save_path}")

    def get_model(self):

        unet_embedding = NoiseTransformer(resolution=(128, 96)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet(resolution=(128, 96)).to(self.device).to(torch.float32)

        if self.model_id == 'DiT':
            text_embedding = AdaGroupNorm(768 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)
        else:
            text_embedding = AdaGroupNorm(768 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)

        # initialize random _alpha and _beta when no checkpoint is provided
        _alpha = torch.randn(1, device=self.device)
        _beta = torch.randn(1, device=self.device)

        if '.pth' in self.pretrained_path:
            gloden_unet = torch.load(self.pretrained_path)
            unet_svd.load_state_dict(gloden_unet["unet_svd"], strict=True)
            unet_embedding.load_state_dict(gloden_unet["unet_embedding"], strict=True)
            text_embedding.load_state_dict(gloden_unet["embeeding"], strict=True)
            _alpha = gloden_unet["alpha"]
            _beta = gloden_unet["beta"]

            print("Load Successfully!")

            return unet_svd, unet_embedding, text_embedding, _alpha, _beta

        else:
            return unet_svd, unet_embedding, text_embedding, _alpha, _beta


    def forward(self, initial_noise, prompt_embeds):

        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        # golden noise = SVD branch + gated text embedding + scaled transformer branch
        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
                2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise


class NPNet64(nn.Module):
    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        super(NPNet64, self).__init__()
        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta
        ) = self.get_model()

    def save_model(self, save_path: str):
        """
        Save this NPNet so that get_model() can later reload it.
        """
        torch.save({
            "unet_svd": self.unet_svd.state_dict(),
            "unet_embedding": self.unet_embedding.state_dict(),
            "embeeding": self.text_embedding.state_dict(),  # matches get_model's key
            "alpha": self._alpha,
            "beta": self._beta,
        }, save_path)
        print(f"NPNet saved to {save_path}")

    def get_model(self):

        unet_embedding = NoiseTransformer(resolution=(64, 64)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet64(resolution=64).to(self.device).to(torch.float32)
        _alpha = torch.randn(1, device=self.device)
        _beta = torch.randn(1, device=self.device)

        text_embedding = AdaGroupNorm(768 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)


        if '.pth' in self.pretrained_path:
            gloden_unet = torch.load(self.pretrained_path)
            unet_svd.load_state_dict(gloden_unet["unet_svd"])
            unet_embedding.load_state_dict(gloden_unet["unet_embedding"])
            text_embedding.load_state_dict(gloden_unet["embeeding"])
            _alpha = gloden_unet["alpha"]
            _beta = gloden_unet["beta"]

            print("Load Successfully!")

        return unet_svd, unet_embedding, text_embedding, _alpha, _beta


    def forward(self, initial_noise, prompt_embeds):

        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
                2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise

class NPNet128(nn.Module):
    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        super(NPNet128, self).__init__()

        assert model_id in ['SDXL', 'DreamShaper', 'DiT']

        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta
        ) = self.get_model()

    def get_model(self):

        unet_embedding = NoiseTransformer(resolution=(128, 128)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet128(resolution=128).to(self.device).to(torch.float32)

        if self.model_id == 'DiT':
            text_embedding = AdaGroupNorm(1024 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)
        else:
            text_embedding = AdaGroupNorm(2048 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)


        if '.pth' in self.pretrained_path:
            gloden_unet = torch.load(self.pretrained_path)
            unet_svd.load_state_dict(gloden_unet["unet_svd"])
            unet_embedding.load_state_dict(gloden_unet["unet_embedding"])
            text_embedding.load_state_dict(gloden_unet["embeeding"])
            _alpha = gloden_unet["alpha"]
            _beta = gloden_unet["beta"]

            print("Load Successfully!")

            return unet_svd, unet_embedding, text_embedding, _alpha, _beta

        else:
            raise FileNotFoundError("No Pretrained Weights Found!")


    def forward(self, initial_noise, prompt_embeds):

        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
                2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise
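All three NPNet variants share the same forward blend. A minimal sketch of driving `NPNet64` with random inputs (not part of the commit): the prompt-embedding shape is an assumption chosen to match `AdaGroupNorm(768 * 77, 4, 1)` above; with no `.pth` checkpoint, `_alpha` and `_beta` stay randomly initialized, and building the internal `NoiseTransformer` downloads the pretrained Swin weights.

```python
# Hypothetical usage sketch for NPNet64 (not in the commit).
import torch
from SVDNoiseUnet import NPNet64

npnet = NPNet64(model_id='SD1.5', device='cpu').eval()  # CPU just for the sketch
initial_noise = torch.randn(1, 4, 64, 64)  # folded internally into a 128x128 matrix for the SVD
prompt_embeds = torch.randn(1, 77, 768)    # flattened to (1, 77*768) inside forward
with torch.no_grad():
    golden = npnet(initial_noise, prompt_embeds)
print(golden.shape)  # expected: torch.Size([1, 4, 64, 64])
```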
__pycache__/NoiseTransformer.cpython-39.pyc
ADDED
Binary file (1.45 kB)

__pycache__/SVDNoiseUnet.cpython-39.pyc
ADDED
Binary file (11.2 kB)

__pycache__/customed_unipc_scheduler.cpython-39.pyc
ADDED
Binary file (28.8 kB)

__pycache__/dpm_solver_v3.cpython-39.pyc
ADDED
Binary file (32.2 kB)

__pycache__/free_lunch_utils.cpython-39.pyc
ADDED
Binary file (7.78 kB)

__pycache__/sampler.cpython-39.pyc
ADDED
Binary file (7.12 kB)

__pycache__/uni_pc.cpython-39.pyc
ADDED
Binary file (18.4 kB)
app.py
CHANGED
@@ -1,14 +1,416 @@

 import gradio as gr
-import …
 import torch
-print(zero.device) # <-- 'cpu' 🤔
-def …

The new app.py in full:
import gradio as gr
import numpy as np
import random
import json
import spaces  # [uncomment to use ZeroGPU]
from diffusers import (
    AutoencoderKL,
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler
)
from huggingface_hub import login, hf_hub_download
from PIL import Image
# from huggingface_hub import login
from SVDNoiseUnet import NPNet64
import functools
import random
from free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d
import torch
import torch.nn as nn
from einops import rearrange
from torchvision.utils import make_grid
import time
from pytorch_lightning import seed_everything
from torch import autocast
from contextlib import contextmanager, nullcontext
import accelerate
import torchsde
from SVDNoiseUnet import NPNet128
from tqdm import tqdm, trange
from itertools import islice

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "Lykon/dreamshaper-xl-1-0"  # Replace with the model you would like to use
from sampler import UniPCSampler
from customed_unipc_scheduler import CustomedUniPCMultistepScheduler
from spandrel import ModelLoader

precision_scope = autocast

# 1. Define image conversion functions
def pil_image_to_torch_bgr(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image (RGB) to a torch tensor (BGR, uint8 -> float)."""
    img = np.array(img.convert("RGB"))
    img = img[:, :, ::-1]  # Flip RGB to BGR
    img = img.astype(np.float32) / 255.0  # Normalize to [0, 1]
    img = np.transpose(img, (2, 0, 1))  # HWC to CHW
    return torch.from_numpy(img.copy()).unsqueeze(0)  # Add batch dimension

def torch_bgr_to_pil_image(tensor: torch.Tensor) -> Image.Image:
    """Convert a torch tensor (BGR, float) to a PIL image (RGB)."""
    tensor = tensor.squeeze(0).clamp(0, 1)  # Remove batch dimension and clamp
    img = tensor.detach().cpu().numpy()
    img = np.transpose(img, (1, 2, 0))  # CHW to HWC
    img = img[:, :, ::-1]  # Flip BGR to RGB
    img = (img * 255.0).astype(np.uint8)
    return Image.fromarray(img)


def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))


def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])

def prepare_sdxl_pipeline_step_parameter(pipe: StableDiffusionXLPipeline,
                                         prompts,
                                         need_cfg,
                                         device,
                                         negative_prompt=None,
                                         W=1024,
                                         H=1024):  # need to correct the format
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(
        prompt=prompts,
        negative_prompt=negative_prompt,
        device=device,
        do_classifier_free_guidance=need_cfg,
    )
    # timesteps = pipe.scheduler.timesteps

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = pooled_prompt_embeds.to(device)
    original_size = (W, H)
    crops_coords_top_left = (0, 0)
    target_size = (W, H)
    text_encoder_projection_dim = None
    add_time_ids = list(original_size + crops_coords_top_left + target_size)
    if pipe.text_encoder_2 is None:
        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
    else:
        text_encoder_projection_dim = pipe.text_encoder_2.config.projection_dim
    passed_add_embed_dim = (
        pipe.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
    )
    expected_add_embed_dim = pipe.unet.add_embedding.linear_1.in_features
    if expected_add_embed_dim != passed_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )
    add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
    add_time_ids = add_time_ids.to(device)
    negative_add_time_ids = add_time_ids

    if need_cfg:
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
    ret_dict = {
        "text_embeds": add_text_embeds,
        "time_ids": add_time_ids
    }
    return prompt_embeds, ret_dict


# New helper to load a list-of-dicts preference JSON
# JSON schema: [ { 'human_preference': [int], 'prompt': str, 'file_path': [str] }, ... ]
def load_preference_json(json_path: str) -> list[dict]:
    """Load records from a JSON file formatted as a list of preference dicts."""
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data

# New helper to extract just the prompts from the preference JSON
# Returns a flat list of all 'prompt' values

def extract_prompts_from_pref_json(json_path: str) -> list[str]:
    """Load a JSON of preference records and return only the prompts."""
    records = load_preference_json(json_path)
    return [rec['prompt'] for rec in records]

# Example usage:
# prompts = extract_prompts_from_pref_json("path/to/preference.json")
# print(prompts)

def get_sigmas_karras(n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
    """Constructs the noise schedule of Karras et al. (2022)."""
    ramp = torch.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
    return append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)

def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))

def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])

def append_dims(x, target_dims):
    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
    return x[(...,) + (None,) * dims_to_append]


def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())

def convert_caption_json_to_str(json):
    caption = json["caption"]
    return caption


DTYPE = torch.float16  # torch.float16 works as well, but pictures seem to be a bit worse
device = "cuda"
cyberreal_repo = "cyberdelia/CyberRealisticXL"
cyberreal_filename = "CyberRealisticXLPlay_V7.0_FP16.safetensors"
cyberreal_path = hf_hub_download(
    repo_id=cyberreal_repo,
    filename=cyberreal_filename,
    cache_dir="."
)

pipe = StableDiffusionXLPipeline.from_single_file(
    cyberreal_path,
    torch_dtype=DTYPE,
)

up_repo = "uwg/upscaler"
up_filename = "ESRGAN/4x_NMKD-Siax_200k.pth"
up_path = hf_hub_download(
    repo_id=up_repo,
    filename=up_filename,
    cache_dir="."
)
upscaler = ModelLoader().load_from_file(up_path)
upscaler.to(device).eval()



MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

accelerator = accelerate.Accelerator()

def generate_image_with_steps(prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps):
    """Helper function to generate an image with a specific number of steps."""
    scheduler = CustomedUniPCMultistepScheduler.from_config(
        pipe.scheduler.config,
        solver_order=2 if num_inference_steps == 8 else 1,
        denoise_to_zero=False,
        use_afs=True,
        use_free_predictor=False,
    )
    start_free_at_step = 4
    pipe.scheduler = scheduler
    pipe.to('cuda')
    with torch.no_grad():
        with precision_scope("cuda"):
            prompts = [prompt]

            latents = torch.randn(
                (1, pipe.unet.config.in_channels, height // 8, width // 8),
                device=device,
            )
            latents = latents * pipe.scheduler.init_noise_sigma

            pipe.scheduler.set_timesteps(num_inference_steps)
            idx = 0
            register_free_upblock2d(pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
            register_free_crossattn_upblock2d(pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
            for t in tqdm(pipe.scheduler.timesteps):
                # Still not enough. I will tell you what the best implementation is, although not via the following code.

                # if idx == len(pipe.scheduler.timesteps) - 1:
                #     break
                if idx == start_free_at_step:  # (6 if num_inference_steps == 8 else 4):
                    # switch FreeU on for the remaining steps
                    register_free_upblock2d(pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.9)
                    register_free_crossattn_upblock2d(pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.9)
                latent_model_input = torch.cat([latents] * 2)

                latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, timestep=t)
                negative_prompts = 'lowres, bad anatomy, bad hands, watermark'
                negative_prompts = 1 * [negative_prompts]
                use_afs = True
                use_free_predictor = False
                prompt_embeds, cond_kwargs = prepare_sdxl_pipeline_step_parameter(
                    pipe,
                    prompts,
                    need_cfg=True,
                    device=pipe.device,
                    negative_prompt=negative_prompts,
                    W=width,
                    H=height,
                )
                if idx == 0 and use_afs:
                    # AFS: skip the first UNet call and approximate its output from the latents
                    noise_pred = latent_model_input * 0.98
                elif idx == len(pipe.scheduler.timesteps) - 1 and use_free_predictor:
                    noise_pred = None
                else:
                    noise_pred = pipe.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds.to(device=latents.device, dtype=latents.dtype),
                        added_cond_kwargs=cond_kwargs,
                    ).sample
                if noise_pred is not None:
                    uncond, cond = noise_pred.chunk(2)
                    noise_pred = uncond + (cond - uncond) * guidance_scale
                latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
                idx += 1

            x_samples_ddim = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample
            x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
            if True:
                for x_sample in x_samples_ddim:
                    x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
                    img = Image.fromarray(x_sample.astype(np.uint8))  # .save(os.path.join(sample_path, f"{base_count:05}.png"))
                    input_image_tensor = pil_image_to_torch_bgr(img).to(device)
                    output_tensor = upscaler(input_image_tensor)
                    output_image_pil = torch_bgr_to_pil_image(output_tensor)
                    return output_image_pil

@spaces.GPU  # [uncomment to use ZeroGPU]
def infer(
    prompt,
    negative_prompt,
    seed,
    randomize_seed,
    resolution,
    guidance_scale,
    num_inference_steps,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Parse resolution string into width and height
    width, height = map(int, resolution.split('x'))

    # Generate image with selected steps
    image_quick = generate_image_with_steps(prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config,
        final_sigmas_type="sigma_min",
        algorithm_type="sde-dpmsolver++",
        use_karras_sigmas=True,
    )
    # Generate image with 30 steps for high quality
    negative_prompts = 'lowres, bad anatomy, bad hands, watermark'
    negative_prompts = 1 * [negative_prompts]
    image_50_steps = pipe(prompt=[prompt],
                          negative_prompt=negative_prompts,
                          num_inference_steps=30,
                          guidance_scale=4.0,
                          height=height,
                          width=width).images
    for x_sample in image_50_steps:
        input_image_tensor = pil_image_to_torch_bgr(x_sample).to(device)
        output_tensor = upscaler(input_image_tensor)
        img_4k_org = torch_bgr_to_pil_image(output_tensor)
    return image_quick, img_4k_org, seed


examples = [
    "ultra-realistic 8k RAW portrait of a serious Black man in 1920s Harlem, standing on a bustling vintage city street, wearing a textured vintage wool suit, striped dress shirt, bold colorful tie, and a brown felt fedora, cinematic lighting with soft shadows on his deeply expressive face, timeless and melancholic mood, blurred storefronts and pedestrians in background, analog film grain, slightly desaturated color palette, medium format lens capturing fine skin texture, worn fabric, and atmospheric detail, Harlem Renaissance style, captured in natural light, shallow depth of field",
    "An ultra-realistic 8k HDR editorial photograph of a soft-featured young woman with auburn hair tucked under a linen bonnet, pale freckled skin and downcast eyes filled with quiet resilience, dressed in a modest 1875 working-class Victorian dress with worn shawl, standing near a bustling street market in London, surrounded by wooden carts, hanging meats, and soot-stained brick buildings, soft overcast light and rising chimney smoke blending into a hazy amber atmosphere, cinematic lens depth with visible film grain and rich Kodak Portra-style color grading, historical fashion editorial with immersive composition and a contemplative, narrative mood",
    "A weathered Victorian house surrounded by lush autumn foliage and overgrown garden paths, its deep teal-painted wood faded and peeling, orange leaves scattering across the stone steps and tangled in the railings of the ornate wooden porch, delicate orange wildflowers growing from cracks in the stairs, arched twin doors with stained glass glowing faintly from within, warm golden light filtering through dusted windows, a few butterflies fluttering through the crisp autumn air, the scene bathed in soft daylight with painterly shadows, magical realism meets gothic nostalgia, cinematic composition with high detail and storybook charm, photorealistic yet slightly stylized, peaceful and enchanted with a hint of mystery",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks() as demo:
    gr.HTML(f"<style>{css}</style>")
    with gr.Column(elem_id="col-container"):
        gr.Markdown(" # Hyperparameters are all you need")

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )

            run_button = gr.Button("Run", scale=0, variant="primary")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Our fast inference result, using AFS to get one free step")
                result = gr.Image(label="Quick Result", show_label=False)
            with gr.Column():
                gr.Markdown("### Official 30-step result")
                result_30_steps = gr.Image(label="30 Steps Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=False,
            )

            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            resolution = gr.Dropdown(
                choices=[
                    "1024x1024",
                    "1216x832",
                    "832x1216"
                ],
                value="832x1216",
                label="Resolution",
            )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=5.0,
                    step=0.1,
                    value=5.0,  # Replace with defaults that work for your model
                )

                num_inference_steps = gr.Dropdown(
                    choices=[6, 7, 8],
                    value=8,
                    label="Number of inference steps",
                )

        gr.Examples(examples=examples, inputs=[prompt])
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            negative_prompt,
            seed,
            randomize_seed,
            resolution,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, result_30_steps, seed],
    )

if __name__ == "__main__":
    demo.launch()
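The Karras schedule helper defined in app.py is easy to sanity-check in isolation. A sketch, restated here so it runs on its own; the `sigma_min`/`sigma_max` values are illustrative assumptions, not values read from the pipeline:

```python
# Standalone sanity check of the Karras et al. (2022) rho-schedule used above.
import torch

def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])

def get_sigmas_karras(n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
    """Constructs the noise schedule of Karras et al. (2022)."""
    ramp = torch.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
    return append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)

sigmas = get_sigmas_karras(n=8, sigma_min=0.03, sigma_max=14.6)  # assumed sigma range
assert torch.all(sigmas[:-1] > sigmas[1:])           # strictly decreasing schedule
assert torch.isclose(sigmas[0], torch.tensor(14.6))  # starts at sigma_max
assert sigmas[-1] == 0                               # append_zero terminates the schedule
```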
customed_unipc_scheduler.py
ADDED
@@ -0,0 +1,997 @@
| 1 |
+
# Copyright 2025 TSAIL Team and The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# DISCLAIMER: check https://huggingface.co/papers/2302.04867 and https://github.com/wl-zhao/UniPC for more info
|
| 16 |
+
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
| 17 |
+
|
| 18 |
+
import math
|
| 19 |
+
from typing import List, Optional, Tuple, Union
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
import copy
|
| 24 |
+
|
| 25 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 26 |
+
from diffusers.utils import deprecate, is_scipy_available
|
| 27 |
+
from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
|
| 28 |
+
|
| 29 |
+
if is_scipy_available():
|
| 30 |
+
import scipy.stats
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
| 34 |
+
def betas_for_alpha_bar(
|
| 35 |
+
num_diffusion_timesteps,
|
| 36 |
+
max_beta=0.999,
|
| 37 |
+
alpha_transform_type="cosine",
|
| 38 |
+
):
|
| 39 |
+
"""
|
| 40 |
+
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
| 41 |
+
(1-beta) over time from t = [0,1].
|
| 42 |
+
|
| 43 |
+
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
| 44 |
+
to that part of the diffusion process.
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
num_diffusion_timesteps (`int`): the number of betas to produce.
|
| 49 |
+
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
| 50 |
+
prevent singularities.
|
| 51 |
+
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
| 52 |
+
Choose from `cosine` or `exp`
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
| 56 |
+
"""
|
| 57 |
+
if alpha_transform_type == "cosine":
|
| 58 |
+
|
| 59 |
+
def alpha_bar_fn(t):
|
| 60 |
+
return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
|
| 61 |
+
|
| 62 |
+
elif alpha_transform_type == "exp":
|
| 63 |
+
|
| 64 |
+
def alpha_bar_fn(t):
|
| 65 |
+
return math.exp(t * -12.0)
|
| 66 |
+
|
| 67 |
+
else:
|
| 68 |
+
raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
|
| 69 |
+
|
| 70 |
+
betas = []
|
| 71 |
+
for i in range(num_diffusion_timesteps):
|
| 72 |
+
t1 = i / num_diffusion_timesteps
|
| 73 |
+
t2 = (i + 1) / num_diffusion_timesteps
|
| 74 |
+
betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
|
| 75 |
+
return torch.tensor(betas, dtype=torch.float32)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
|
| 80 |
+
def rescale_zero_terminal_snr(betas):
|
| 81 |
+
"""
|
| 82 |
+
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
betas (`torch.Tensor`):
|
| 87 |
+
the betas that the scheduler is being initialized with.
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
`torch.Tensor`: rescaled betas with zero terminal SNR
|
| 91 |
+
"""
|
| 92 |
+
# Convert betas to alphas_bar_sqrt
|
| 93 |
+
alphas = 1.0 - betas
|
| 94 |
+
alphas_cumprod = torch.cumprod(alphas, dim=0)
|
| 95 |
+
alphas_bar_sqrt = alphas_cumprod.sqrt()
|
| 96 |
+
|
| 97 |
+
# Store old values.
|
| 98 |
+
alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
|
| 99 |
+
alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
|
| 100 |
+
|
| 101 |
+
# Shift so the last timestep is zero.
|
| 102 |
+
alphas_bar_sqrt -= alphas_bar_sqrt_T
|
| 103 |
+
|
| 104 |
+
# Scale so the first timestep is back to the old value.
|
| 105 |
+
alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
|
| 106 |
+
|
| 107 |
+
# Convert alphas_bar_sqrt to betas
|
| 108 |
+
alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
|
| 109 |
+
alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
|
| 110 |
+
alphas = torch.cat([alphas_bar[0:1], alphas])
|
| 111 |
+
betas = 1 - alphas
|
| 112 |
+
|
| 113 |
+
return betas
|
| 114 |
+
|
| 115 |
+
|
class CustomedUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
    """
    `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        beta_start (`float`, defaults to 0.0001):
            The starting `beta` value of inference.
        beta_end (`float`, defaults to 0.02):
            The final `beta` value.
        beta_schedule (`str`, defaults to `"linear"`):
            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
        solver_order (`int`, default `2`):
            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
            unconditional sampling.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
            Video](https://imagen.research.google/video/paper.pdf) paper).
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
        predict_x0 (`bool`, defaults to `True`):
            Whether to use the updating algorithm on the predicted x0.
        solver_type (`str`, default `bh2`):
            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
            otherwise.
        lower_order_final (`bool`, default `True`):
            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
        disable_corrector (`list`, default `[]`):
            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
            usually disabled during the first few steps.
        solver_p (`SchedulerMixin`, default `None`):
            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
            the sigmas are determined according to a sequence of noise levels {σi}.
        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
        use_beta_sigmas (`bool`, *optional*, defaults to `False`):
            Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
            Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
        timestep_spacing (`str`, defaults to `"linspace"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps, as required by some model families.
        final_sigmas_type (`str`, defaults to `"zero"`):
            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
    """
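# [Illustrative sketch -- standalone usage; assumes this module is importable]
from customed_unipc_scheduler import CustomedUniPCMultistepScheduler

scheduler = CustomedUniPCMultistepScheduler(beta_schedule="scaled_linear")  # SD latent schedule
scheduler.set_timesteps(5)      # one of the tuned step counts handled below (5-9)
print(scheduler.timesteps)      # five customed, possibly fractional, time points
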

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        solver_order: int = 2,
        prediction_type: str = "epsilon",
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        sample_max_value: float = 1.0,
        predict_x0: bool = True,
        solver_type: str = "bh2",
        lower_order_final: bool = True,
        disable_corrector: List[int] = [],
        solver_p: SchedulerMixin = None,
        use_karras_sigmas: Optional[bool] = False,
        use_exponential_sigmas: Optional[bool] = False,
        use_beta_sigmas: Optional[bool] = False,
        use_flow_sigmas: Optional[bool] = False,
        flow_shift: Optional[float] = 1.0,
        timestep_spacing: str = "linspace",
        steps_offset: int = 0,
        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
        skip_type: str = "customed_time_karras",
        denoise_to_zero: bool = False,
        rescale_betas_zero_snr: bool = False,
        use_afs: bool = False,
        use_free_predictor=False,
    ):

        if self.config.use_beta_sigmas and not is_scipy_available():
            raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
        if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
            raise ValueError(
                "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
            )
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model.
            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
        elif beta_schedule == "squaredcos_cap_v2":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

        self.skip_type = skip_type
        self.use_free_predictor = use_free_predictor
        self.use_afs = use_afs
        self.denoise_to_zero = denoise_to_zero
        if rescale_betas_zero_snr:
            self.betas = rescale_zero_terminal_snr(self.betas)

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

        if rescale_betas_zero_snr:
            # Close to 0 without being 0 so first sigma is not inf
            # FP16 smallest positive subnormal works well here
            self.alphas_cumprod[-1] = 2**-24

        # Currently we only support VP-type noise schedule
        self.alpha_t = torch.sqrt(self.alphas_cumprod)
        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        if solver_type not in ["bh1", "bh2"]:
            if solver_type in ["midpoint", "heun", "logrho"]:
                self.register_to_config(solver_type="bh2")
            else:
                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")

        self.predict_x0 = predict_x0
        # setable values
        self.num_inference_steps = None
        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
        self.timesteps = torch.from_numpy(timesteps)
        self.model_outputs = [None] * solver_order
        self.timestep_list = [None] * solver_order
        self.solver_order = solver_order
        self.lower_order_nums = 0
        self.disable_corrector = disable_corrector
        self.solver_p = solver_p
        self.last_sample = None
        self._step_index = None
        self._begin_index = None
        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication

    @property
    def step_index(self):
        """
        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

    @property
    def begin_index(self):
        """
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
        """
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

        Args:
            begin_index (`int`):
                The begin index for the scheduler.
        """
        self._begin_index = begin_index

    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        """
        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://huggingface.co/papers/2305.08891

        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
        if self.skip_type == "customed_time_karras":
            # Two-stage discretization: a coarse Karras ramp in sigma space picks the start/end
            # noise levels, then the actual steps are laid out with a second Karras-style ramp
            # (rho=1.2) applied directly in the (fractional) time domain.
            sigma_T = sigmas[-1]
            sigma_0 = sigmas[0]
            N = num_inference_steps
            if N == 9:
                log_sigmas = np.log(sigmas)
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0)
                ct_start = self._sigma_to_t(sigmas[0], log_sigmas)
                ct_end = self._sigma_to_t(sigmas[9], log_sigmas)
                if self.denoise_to_zero:
                    ct_real_end = self._sigma_to_t(sigmas[-1], log_sigmas)
                timesteps = self.get_sigmas_karras(
                    9 + (1 if self.use_free_predictor else 0),
                    ct_end,
                    ct_start,
                    rho=1.2,
                    customed_final_sigma=ct_real_end if self.denoise_to_zero else None,
                )
            elif N == 5:
                log_sigmas = np.log(sigmas)
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0)
                ct_start = self._sigma_to_t(sigmas[0], log_sigmas)
                ct_end = self._sigma_to_t(sigmas[6], log_sigmas)
                if self.denoise_to_zero:
                    ct_real_end = self._sigma_to_t(sigmas[-1], log_sigmas)
                timesteps = self.get_sigmas_karras(
                    5 + (1 if self.use_afs else 0) + (1 if self.use_free_predictor else 0),
                    ct_end,
                    ct_start,
                    rho=1.2,
                    customed_final_sigma=ct_real_end if self.denoise_to_zero else None,
                )
            elif N == 6:
                log_sigmas = np.log(sigmas)
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0)
                ct_start = self._sigma_to_t(sigmas[0], log_sigmas)
                ct_end = self._sigma_to_t(sigmas[6], log_sigmas)
                if self.denoise_to_zero:
                    ct_real_end = self._sigma_to_t(sigmas[-1], log_sigmas)
                timesteps = self.get_sigmas_karras(
                    6 + (1 if self.use_afs else 0) + (1 if self.use_free_predictor else 0),
                    ct_end,
                    ct_start,
                    rho=1.2,
                    customed_final_sigma=ct_real_end if self.denoise_to_zero else None,
                )
            elif N == 7:
                log_sigmas = np.log(sigmas)
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0)
                ct_start = self._sigma_to_t(sigmas[0], log_sigmas)
                ct_end = self._sigma_to_t(sigmas[6], log_sigmas)
                if self.denoise_to_zero:
                    ct_real_end = self._sigma_to_t(sigmas[-1], log_sigmas)
                timesteps = self.get_sigmas_karras(
                    7 + (1 if self.use_free_predictor else 0),
                    ct_end,
                    ct_start,
                    rho=1.2,
                    customed_final_sigma=ct_real_end if self.denoise_to_zero else None,
                )
            elif N == 8:
                log_sigmas = np.log(sigmas).copy()
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0)
                ct_start = self._sigma_to_t(sigmas[0], log_sigmas)
                ct_end = self._sigma_to_t(sigmas[6], log_sigmas)
                if self.denoise_to_zero:
                    ct_real_end = self._sigma_to_t(sigmas[-1], log_sigmas)
                timesteps = self.get_sigmas_karras(
                    8 + (1 if self.use_free_predictor else 0),
                    ct_end,
                    ct_start,
                    rho=1.2,
                    customed_final_sigma=ct_real_end if self.denoise_to_zero else None,
                )

            if self.use_afs and N > 6:
                timesteps = np.insert(timesteps, 1, (timesteps[0] + timesteps[1]) / 2)

            timesteps_tmp = copy.deepcopy(timesteps)
            timesteps_tmp = np.append(timesteps_tmp, self._sigma_to_t(sigmas[-1], log_sigmas))
            sigmas = np.array([self._t_to_sigma(t, log_sigmas) for t in timesteps_tmp])
            self.sigmas = torch.from_numpy(sigmas)
            self.timesteps = torch.from_numpy(timesteps).to(device=device)

        self.num_inference_steps = len(timesteps)

        self.model_outputs = [
            None,
        ] * self.solver_order
        self.lower_order_nums = 0
        self.last_sample = None
        if self.solver_p:
            self.solver_p.set_timesteps(self.num_inference_steps, device=device)

        # add an index counter for schedulers that allow duplicated timesteps
        self._step_index = None
        self._begin_index = None
        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication

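# [Illustrative sketch -- standalone; endpoint values are hypothetical] The
# two-stage discretization above in miniature: a Karras ramp with a given rho.
# rho=5-7 clusters steps near the low end; the rho=1.2 used in the time domain
# is close to uniform.
import numpy as np

def karras_ramp(n, lo, hi, rho):
    ramp = np.linspace(0, 1, n)
    return (hi ** (1 / rho) + ramp * (lo ** (1 / rho) - hi ** (1 / rho))) ** rho

coarse_sigmas = karras_ramp(8, 0.03, 150.0, rho=5.0)  # stage 1: pick endpoints in sigma
time_points = karras_ramp(5, 10.0, 999.0, rho=1.2)    # stage 2: lay out steps in time
print(coarse_sigmas[0], coarse_sigmas[6], time_points)
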
    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://huggingface.co/papers/2205.11487
        """
        dtype = sample.dtype
        batch_size, channels, *remaining_dims = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        sample = sample.reshape(batch_size, channels, *remaining_dims)
        sample = sample.to(dtype)

        return sample

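# [Illustrative sketch -- standalone] Dynamic thresholding on a random batch,
# mirroring _threshold_sample with ratio=0.995 (s is left uncapped above 1 so
# the dynamic range actually shows):
import torch

x0 = torch.randn(2, 3, 8, 8) * 2.0                       # exaggerated x0 prediction
flat = x0.reshape(2, -1)
s = torch.quantile(flat.abs(), 0.995, dim=1).clamp(min=1.0).unsqueeze(1)
x0_clipped = (flat.clamp(-s, s) / s).reshape_as(x0)
assert x0_clipped.abs().max() <= 1.0                     # saturated pixels pushed inwards
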
    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
    def _sigma_to_t(self, sigma, log_sigmas):
        # get log sigma
        log_sigma = np.log(np.maximum(sigma, 1e-10))

        # get distribution
        dists = log_sigma - log_sigmas[:, np.newaxis]

        # get sigmas range
        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1

        low = log_sigmas[low_idx]
        high = log_sigmas[high_idx]

        # interpolate sigmas
        w = (low - log_sigma) / (low - high)
        w = np.clip(w, 0, 1)

        # transform interpolation to time range
        t = (1 - w) * low_idx + w * high_idx
        t = t.reshape(sigma.shape)
        return t

    def _t_to_sigma(self, t, log_sigmas):
        low_idx, high_idx, w = np.int64(np.floor(t)), np.clip(np.int64(np.ceil(t)), a_min=0, a_max=999), t - np.floor(t)
        log_sigma = (1 - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx]
        return np.exp(log_sigma)

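# [Illustrative sketch -- standalone] _t_to_sigma is piecewise-linear
# interpolation of log-sigma between integer training steps; _sigma_to_t
# inverts it, so fractional timesteps round-trip through sigma:
import numpy as np

log_sigmas = np.log(np.linspace(0.01, 150.0, 1000))      # stand-in training sigmas

t = 421.37
lo, hi, w = int(np.floor(t)), int(np.ceil(t)), t - np.floor(t)
sigma = np.exp((1 - w) * log_sigmas[lo] + w * log_sigmas[hi])
print(sigma)   # mapping this sigma back through _sigma_to_t recovers t ~= 421.37
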
    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
    def _sigma_to_alpha_sigma_t(self, sigma):
        if self.config.use_flow_sigmas:
            alpha_t = 1 - sigma
            sigma_t = sigma
        else:
            alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
            sigma_t = sigma * alpha_t

        return alpha_t, sigma_t

    def get_sigmas_karras(self, n, in_sigma_min: torch.Tensor, in_sigma_max: torch.Tensor, rho=7.0, customed_final_sigma=None) -> torch.Tensor:
        """Constructs the noise schedule of Karras et al. (2022)."""
        if hasattr(self.config, "sigma_min"):
            sigma_min = self.config.sigma_min
        else:
            sigma_min = in_sigma_min.item()

        if hasattr(self.config, "sigma_max"):
            sigma_max = self.config.sigma_max
        else:
            sigma_max = in_sigma_max.item()

        ramp = np.linspace(0, 1, n)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        if customed_final_sigma is not None:
            sigmas[-1] = customed_final_sigma
        return sigmas

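# [Illustrative sketch -- standalone; endpoint values are hypothetical] The same
# construction as get_sigmas_karras, plus the customed_final_sigma override used
# for denoise_to_zero:
import numpy as np

rho, n = 7.0, 8
ramp = np.linspace(0, 1, n)
sigmas = (150.0 ** (1 / rho) + ramp * (0.03 ** (1 / rho) - 150.0 ** (1 / rho))) ** rho
sigmas[-1] = 0.002            # hypothetical customed final sigma
assert sigmas[0] > sigmas[-1] and np.all(np.diff(sigmas[:-1]) < 0)
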
    def convert_model_output(
        self,
        model_output: torch.Tensor,
        *args,
        sample: torch.Tensor = None,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Convert the model output to the corresponding type the UniPC algorithm needs.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.

        Returns:
            `torch.Tensor`:
                The converted model output.
        """
        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError("missing `sample` as a required keyword argument")
        if timestep is not None:
            deprecate(
                "timesteps",
                "1.0.0",
                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        sigma = self.sigmas[self.step_index]
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)

        if model_output is None:
            return None

        if self.predict_x0:
            if self.config.prediction_type == "epsilon":
                x0_pred = (sample - sigma_t * model_output) / alpha_t
            elif self.config.prediction_type == "sample":
                x0_pred = model_output
            elif self.config.prediction_type == "v_prediction":
                x0_pred = alpha_t * sample - sigma_t * model_output
            elif self.config.prediction_type == "flow_prediction":
                sigma_t = self.sigmas[self.step_index]
                x0_pred = sample - sigma_t * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
                    "`v_prediction`, or `flow_prediction` for the UniPCMultistepScheduler."
                )

            if self.config.thresholding:
                x0_pred = self._threshold_sample(x0_pred)

            return x0_pred
        else:
            if self.config.prediction_type == "epsilon":
                return model_output
            elif self.config.prediction_type == "sample":
                epsilon = (sample - alpha_t * model_output) / sigma_t
                return epsilon
            elif self.config.prediction_type == "v_prediction":
                epsilon = alpha_t * model_output + sigma_t * sample
                return epsilon
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                    " `v_prediction` for the UniPCMultistepScheduler."
                )

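# [Illustrative sketch -- standalone] The parameterizations above agree on x0.
# With x_t = alpha_t * x0 + sigma_t * eps and v = alpha_t * eps - sigma_t * x0
# (and alpha_t**2 + sigma_t**2 == 1 for the VP schedule):
import torch

alpha_t, sigma_t = torch.tensor(0.8), torch.tensor(0.6)
x0, eps = torch.randn(4), torch.randn(4)
x_t = alpha_t * x0 + sigma_t * eps
v = alpha_t * eps - sigma_t * x0
assert torch.allclose((x_t - sigma_t * eps) / alpha_t, x0, atol=1e-6)   # epsilon
assert torch.allclose(alpha_t * x_t - sigma_t * v, x0, atol=1e-6)       # v_prediction
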
    def multistep_uni_p_bh_update(
        self,
        model_output: torch.Tensor = None,
        *args,
        sample: torch.Tensor = None,
        order: int = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if it is specified.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model at the current timestep.
            prev_timestep (`int`):
                The previous discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            order (`int`):
                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).

        Returns:
            `torch.Tensor`:
                The sample tensor at the previous timestep.
        """
        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError("missing `sample` as a required keyword argument")
        if order is None:
            if len(args) > 2:
                order = args[2]
            else:
                raise ValueError("missing `order` as a required keyword argument")
        if prev_timestep is not None:
            deprecate(
                "prev_timestep",
                "1.0.0",
                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )
        model_output_list = self.model_outputs

        s0 = self.timestep_list[-1]
        m0 = model_output_list[-1]
        x = sample

        if self.solver_p:
            x_t = self.solver_p.step(model_output, s0, x).prev_sample
            return x_t

        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)

        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = sample.device

        rks = []
        D1s = []
        for i in range(1, order):
            si = self.step_index - i
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)  # (B, K)
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)
        else:
            D1s = None

        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
            else:
                pred_res = 0
            x_t = x_t_ - alpha_t * B_h * pred_res
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
            else:
                pred_res = 0
            x_t = x_t_ - sigma_t * B_h * pred_res

        x_t = x_t.to(x.dtype)
        return x_t

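# [Illustrative sketch -- standalone; values are hypothetical] The UniP/UniC
# weights come from the small Vandermonde-style system R @ rhos = b built
# above; for an order-3 step with lambda-ratios r_k (the last entry is 1):
import torch

rks = torch.tensor([0.7, 1.3, 1.0])
R = torch.stack([rks ** 0, rks ** 1, rks ** 2])     # rows R_i = rks**(i-1)
b = torch.tensor([0.5, 0.16, 0.04])                 # stand-in h_phi_k-based rhs
rhos = torch.linalg.solve(R, b)
assert torch.allclose(R @ rhos, b, atol=1e-6)
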
    def multistep_uni_c_bh_update(
        self,
        this_model_output: torch.Tensor,
        *args,
        last_sample: torch.Tensor = None,
        this_sample: torch.Tensor = None,
        order: int = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniC (B(h) version).

        Args:
            this_model_output (`torch.Tensor`):
                The model outputs at `x_t`.
            this_timestep (`int`):
                The current timestep `t`.
            last_sample (`torch.Tensor`):
                The generated sample before the last predictor `x_{t-1}`.
            this_sample (`torch.Tensor`):
                The generated sample after the last predictor `x_{t}`.
            order (`int`):
                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.

        Returns:
            `torch.Tensor`:
                The corrected sample tensor at the current timestep.
        """
        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
        if last_sample is None:
            if len(args) > 1:
                last_sample = args[1]
            else:
                raise ValueError("missing `last_sample` as a required keyword argument")
        if this_sample is None:
            if len(args) > 2:
                this_sample = args[2]
            else:
                raise ValueError("missing `this_sample` as a required keyword argument")
        if order is None:
            if len(args) > 3:
                order = args[3]
            else:
                raise ValueError("missing `order` as a required keyword argument")
        if this_timestep is not None:
            deprecate(
                "this_timestep",
                "1.0.0",
                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        model_output_list = self.model_outputs

        m0 = model_output_list[-1]
        x = last_sample
        x_t = this_sample
        model_t = this_model_output

        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)

        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = this_sample.device

        rks = []
        D1s = []
        for i in range(1, order):
            si = self.step_index - (i + 1)
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)
        else:
            D1s = None

        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
        else:
            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)

        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        x_t = x_t.to(x.dtype)
        return x_t

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
    def index_for_timestep(self, timestep, schedule_timesteps=None):
        if schedule_timesteps is None:
            schedule_timesteps = self.timesteps

        index_candidates = (schedule_timesteps == timestep).nonzero()

        if len(index_candidates) == 0:
            step_index = len(self.timesteps) - 1
        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        elif len(index_candidates) > 1:
            step_index = index_candidates[1].item()
        else:
            step_index = index_candidates[0].item()

        return step_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
    def _init_step_index(self, timestep):
        """
        Initialize the step_index counter for the scheduler.
        """

        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
            self._step_index = self.index_for_timestep(timestep)
        else:
            self._step_index = self._begin_index

    def step(
        self,
        model_output: torch.Tensor,
        timestep: Union[int, torch.Tensor],
        sample: torch.Tensor,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
        the multistep UniPC.

        Args:
            model_output (`torch.Tensor`):
                The direct output from learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.

        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if self.step_index is None:
            self._init_step_index(timestep)  # note: this timestep-to-index lookup is what keeps us from customizing the discretization directly here

        use_corrector = (
            self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None
        )

        model_output_convert = self.convert_model_output(model_output, sample=sample)
        if use_corrector and model_output_convert is not None:
            sample = self.multistep_uni_c_bh_update(
                this_model_output=model_output_convert,
                last_sample=self.last_sample,
                this_sample=sample,
                order=self.this_order,
            )

        for i in range(self.solver_order - 1):
            self.model_outputs[i] = self.model_outputs[i + 1]
            self.timestep_list[i] = self.timestep_list[i + 1]
        if model_output_convert is not None:
            self.model_outputs[-1] = model_output_convert
            self.timestep_list[-1] = timestep

        if self.config.lower_order_final:
            this_order = min(self.solver_order, len(self.timesteps) - self.step_index)
        else:
            this_order = self.solver_order

        self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
        assert self.this_order > 0

        self.last_sample = sample
        prev_sample = self.multistep_uni_p_bh_update(
            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
            sample=sample,
            order=self.this_order,
        )

        if self.lower_order_nums < self.solver_order:
            self.lower_order_nums += 1

        # upon completion increase step index by one
        self._step_index += 1

        if not return_dict:
            return (prev_sample,)

        return SchedulerOutput(prev_sample=prev_sample)

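# [Illustrative sketch -- standalone] How a pipeline drives step(); `unet` is a
# hypothetical stand-in, so random noise replaces the real model output here:
import torch

scheduler = CustomedUniPCMultistepScheduler(beta_schedule="scaled_linear")
scheduler.set_timesteps(5)
latents = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
for t in scheduler.timesteps:
    noise_pred = torch.randn_like(latents)   # stand-in for unet(latents, t).sample
    latents = scheduler.step(noise_pred, t, latents).prev_sample
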
    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.

        Args:
            sample (`torch.Tensor`):
                The input sample.

        Returns:
            `torch.Tensor`:
                A scaled input sample.
        """
        return sample

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
    def add_noise(
        self,
        original_samples: torch.Tensor,
        noise: torch.Tensor,
        timesteps: torch.IntTensor,
    ) -> torch.Tensor:
        # Make sure sigmas and timesteps have the same device and dtype as original_samples
        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
            # mps does not support float64
            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
        else:
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
        elif self.step_index is not None:
            # add_noise is called after first denoising step (for inpainting)
            step_indices = [self.step_index] * timesteps.shape[0]
        else:
            # add_noise is called before first denoising step to create initial latent (img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < len(original_samples.shape):
            sigma = sigma.unsqueeze(-1)

        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
        noisy_samples = alpha_t * original_samples + sigma_t * noise
        return noisy_samples

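# [Illustrative sketch -- standalone] add_noise uses the VP parameterization, so
# the noised latents keep unit variance: alpha_t**2 + sigma_t**2 == 1.
import torch

sigma = torch.tensor(2.0)
alpha_t = 1 / (sigma**2 + 1) ** 0.5
sigma_t = sigma * alpha_t
assert torch.allclose(alpha_t**2 + sigma_t**2, torch.tensor(1.0))
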
    def __len__(self):
        return self.config.num_train_timesteps

dpm_solver_v3.py
ADDED
@@ -0,0 +1,904 @@
import torch
import torch.nn.functional as F
import math
import numpy as np
import os


class NoiseScheduleVP:
    def __init__(
        self,
        schedule="discrete",
        betas=None,
        alphas_cumprod=None,
        continuous_beta_0=0.1,
        continuous_beta_1=20.0,
    ):
        """Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
        We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
        ***

        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:

            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)

        Moreover, as lambda(t) is an invertible function, we also support its inverse function:

            t = self.inverse_lambda(lambda_t)

        ===============================================================

        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).

        1. For discrete-time DPMs:

            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.

            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)

            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.

            **Important**: Please pay special attention for the args for `alphas_cumprod`:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).


        2. For continuous-time DPMs:

            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
            schedule are the default settings in DDPM and improved-DDPM:

            Args:
                beta_min: A `float` number. The smallest beta for the linear schedule.
                beta_max: A `float` number. The largest beta for the linear schedule.
                cosine_s: A `float` number. The hyperparameter in the cosine schedule.
                cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
                T: A `float` number. The ending time of the forward process.

        ===============================================================

        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' or 'cosine' for continuous-time DPMs.
        Returns:
            A wrapper object of the forward SDE (VP type).

        ===============================================================

        Example:

        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)

        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)

        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)

        """

        if schedule not in ["discrete", "linear", "cosine"]:
            raise ValueError(
                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
                    schedule
                )
            )
        self.alphas_cumprod = alphas_cumprod
        self.sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
        self.log_sigmas = self.sigmas.log()
        self.schedule = schedule
        if schedule == "discrete":
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.total_N = len(log_alphas)
            self.T = 1.0
            self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1))
            self.log_alpha_array = log_alphas.reshape(
                (
                    1,
                    -1,
                )
            )
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1
            self.cosine_s = 0.008
            self.cosine_beta_max = 999.0
            self.cosine_t_max = (
                math.atan(self.cosine_beta_max * (1.0 + self.cosine_s) / math.pi)
                * 2.0
                * (1.0 + self.cosine_s)
                / math.pi
                - self.cosine_s
            )
            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1.0 + self.cosine_s) * math.pi / 2.0))
            self.schedule = schedule
            if schedule == "cosine":
                # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
                # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
                self.T = 0.9946
            else:
                self.T = 1.0

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == "discrete":
            return interpolate_fn(
                t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)
            ).reshape((-1))
        elif self.schedule == "linear":
            return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == "cosine":
            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
            return log_alpha_t

    def sigma_to_t(self, sigma, quantize=None):
        quantize = None
        log_sigma = sigma.log()
        dists = log_sigma - self.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.log_sigmas[low_idx], self.log_sigmas[high_idx]
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)

    def get_special_sigmas_with_timesteps(self, timesteps):
        # fractional timesteps: linearly interpolate alphas_cumprod between the
        # neighbouring integer indices (clipped to the 1000 training steps); the
        # indices are cast to int64 so they can be used for tensor indexing
        low_idx = np.minimum(np.floor(timesteps), 999).astype(np.int64)
        high_idx = np.minimum(np.ceil(timesteps), 999).astype(np.int64)
        w = torch.from_numpy(timesteps - np.floor(timesteps))
        self.alphas_cumprod = self.alphas_cumprod.to("cpu")
        alphas = (1 - w) * self.alphas_cumprod[low_idx] + w * self.alphas_cumprod[high_idx]
        return ((1 - alphas) / alphas) ** 0.5

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == "linear":
            tmp = 2.0 * (self.beta_1 - self.beta_0) * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0**2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == "discrete":
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2.0 * lamb)
            t = interpolate_fn(
                log_alpha.reshape((-1, 1)),
                torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                torch.flip(self.t_array.to(lamb.device), [1]),
            )
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
            t_fn = (
                lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0))
                * 2.0
                * (1.0 + self.cosine_s)
                / math.pi
                - self.cosine_s
            )
            t = t_fn(log_alpha)
            return t

| 228 |
+
def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs={},
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.0,
    classifier_fn=None,
    classifier_kwargs={},
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
    first wrap the model function into a noise prediction model that accepts the continuous time as the input.

    We support four types of diffusion model by setting `model_type`:

    1. "noise": noise prediction model. (Trained by predicting noise).

    2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).

    3. "v": velocity prediction model. (Trained by predicting the velocity).
        The "v" prediction is derived in Appendix D of [1], and is used in Imagen-Video [2].

    [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
        arXiv preprint arXiv:2202.00512 (2022).
    [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
        arXiv preprint arXiv:2210.02303 (2022).

    4. "score": marginal score function. (Trained by denoising score matching).
        Note that the score function and the noise prediction model follow a simple relationship:
        ```
        noise(x_t, t) = -sigma_t * score(x_t, t)
        ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
    1. "uncond": unconditional sampling by DPMs.
        The input `model` has the following format:
        ``
        model(x, t_input, **model_kwargs) -> noise | x_start | v | score
        ``

    2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
        The input `model` has the following format:
        ``
        model(x, t_input, **model_kwargs) -> noise | x_start | v | score
        ``

        The input `classifier_fn` has the following format:
        ``
        classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
        ``

        [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
            in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.

    3. "classifier-free": classifier-free guidance sampling [4] by conditional DPMs.
        The input `model` has the following format:
        ``
        model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
        ``
        And if cond == `unconditional_condition`, the model output is the unconditional DPM output.

        [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
            arXiv preprint arXiv:2207.12598 (2022).


    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept only `x` and `t_continuous` as inputs, and output the predicted noise:
    ``
    def model_fn(x, t_continuous) -> noise:
        t_input = get_model_input_time(t_continuous)
        return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
            "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
        guidance_type: A `str`. The type of the guidance for sampling.
            "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
            Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
            Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    """

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == "discrete":
            return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, None, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return -expand_dims(sigma_t, dims) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1.0 or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            else:
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t_continuous] * 2)
                if isinstance(condition, torch.Tensor) and (isinstance(unconditional_condition, torch.Tensor) or unconditional_condition is None):
                    c_in = torch.cat([unconditional_condition, condition])
                else:
                    c_in = [condition, unconditional_condition]
                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
                return noise_uncond + guidance_scale * (noise - noise_uncond)

    assert model_type in ["noise", "x_start", "v", "score"]  # "score" is also handled by noise_pred_fn above
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    return model_fn
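
# A minimal usage sketch (assumptions: a diffusers pipeline `pipe` whose UNet takes
# `encoder_hidden_states`, plus text embeddings `cond` / `uncond`; these names and the
# guidance scale are illustrative, not part of this module). This mirrors how the
# samplers in sampler.py call model_wrapper:
#
#   ns = NoiseScheduleVP("discrete", alphas_cumprod=pipe.scheduler.alphas_cumprod)
#   model_fn = model_wrapper(
#       lambda x, t, c: pipe.unet(x, t, encoder_hidden_states=c).sample,
#       ns,
#       model_type="noise",
#       guidance_type="classifier-free",
#       condition=cond,
#       unconditional_condition=uncond,
#       guidance_scale=7.5,
#   )
#   # model_fn(x, t_continuous) now returns the guided noise prediction.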


def weighted_cumsumexp_trapezoid(a, x, b, cumsum=True):
    # ∫ b*e^a dx
    # Input: a, x, b: shape (N+1, ...)
    # Output: y: shape (N+1, ...)
    # y_0 = 0
    # y_n = sum_{i=1}^{n} 0.5*(x_i - x_{i-1})*(b_i*e^{a_i} + b_{i-1}*e^{a_{i-1}})  (n from 1 to N)

    assert x.shape[0] == a.shape[0] and x.ndim == a.ndim
    if b is not None:
        assert a.shape[0] == b.shape[0] and a.ndim == b.ndim

    a_max = np.amax(a, axis=0, keepdims=True)

    if b is not None:
        b = np.asarray(b)
        tmp = b * np.exp(a - a_max)
    else:
        tmp = np.exp(a - a_max)

    out = 0.5 * (x[1:] - x[:-1]) * (tmp[1:] + tmp[:-1])
    if not cumsum:
        return np.sum(out, axis=0) * np.exp(a_max)
    out = np.cumsum(out, axis=0)
    out *= np.exp(a_max)
    return np.concatenate([np.zeros_like(out[[0]]), out], axis=0)
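
# Quick sanity check (illustrative, not part of the solver): with a = x and b = 1 on a
# fine grid, the non-cumulative result should approach the closed form
# ∫_0^1 e^x dx = e - 1:
#
#   xs = np.linspace(0.0, 1.0, 1001)[:, None]
#   val = weighted_cumsumexp_trapezoid(xs, xs, np.ones_like(xs), cumsum=False)
#   # val ≈ np.e - 1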


def weighted_cumsumexp_trapezoid_torch(a, x, b, cumsum=True):
    assert x.shape[0] == a.shape[0] and x.ndim == a.ndim
    if b is not None:
        assert a.shape[0] == b.shape[0] and a.ndim == b.ndim

    a_max = torch.amax(a, dim=0, keepdims=True)

    if b is not None:
        tmp = b * torch.exp(a - a_max)
    else:
        tmp = torch.exp(a - a_max)

    out = 0.5 * (x[1:] - x[:-1]) * (tmp[1:] + tmp[:-1])
    if not cumsum:
        return torch.sum(out, dim=0) * torch.exp(a_max)
    out = torch.cumsum(out, dim=0)
    out *= torch.exp(a_max)
    return torch.concat([torch.zeros_like(out[[0]]), out], dim=0)


def index_list(lst, index):
    new_lst = []
    for i in index:
        new_lst.append(lst[i])
    return new_lst


class DPM_Solver_v3:
    def __init__(
        self,
        statistics_dir,
        noise_schedule,
        steps=10,
        t_start=None,
        t_end=None,
        skip_type="time_uniform",
        degenerated=False,
        device="cuda",
    ):
        self.device = device
        self.model = None
        self.noise_schedule = noise_schedule
        self.steps = steps
        t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

        l = np.load(os.path.join(statistics_dir, "l.npz"))["l"]
        sb = np.load(os.path.join(statistics_dir, "sb.npz"))
        s, b = sb["s"], sb["b"]
        if degenerated:
            l = np.ones_like(l)
            s = np.zeros_like(s)
            b = np.zeros_like(b)
        self.statistics_steps = l.shape[0] - 1
        ts = noise_schedule.marginal_lambda(
            self.get_time_steps("logSNR", t_T, t_0, self.statistics_steps, "cpu")
        ).numpy()[:, None, None, None]
        # NOTE: the precomputed statistics are moved to CUDA unconditionally; the
        # `device` argument is only used for the sampling timesteps below.
        self.ts = torch.from_numpy(ts).cuda()
        self.lambda_T = self.ts[0].cpu().item()
        self.lambda_0 = self.ts[-1].cpu().item()
        z = np.zeros_like(l)
        o = np.ones_like(l)
        L = weighted_cumsumexp_trapezoid(z, ts, l)
        S = weighted_cumsumexp_trapezoid(z, ts, s)

        I = weighted_cumsumexp_trapezoid(L + S, ts, o)
        B = weighted_cumsumexp_trapezoid(-S, ts, b)
        C = weighted_cumsumexp_trapezoid(L + S, ts, B)
        self.l = torch.from_numpy(l).cuda()
        self.s = torch.from_numpy(s).cuda()
        self.b = torch.from_numpy(b).cuda()
        self.L = torch.from_numpy(L).cuda()
        self.S = torch.from_numpy(S).cuda()
        self.I = torch.from_numpy(I).cuda()
        self.B = torch.from_numpy(B).cuda()
        self.C = torch.from_numpy(C).cuda()

        # precompute timesteps
        if skip_type == "logSNR" or skip_type == "time_uniform" or skip_type == "time_quadratic" or skip_type == "customed_time_karras":
            self.timesteps = self.get_time_steps(skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
            self.indexes = self.convert_to_indexes(self.timesteps)
            self.timesteps = self.convert_to_timesteps(self.indexes, device)
        elif skip_type == "edm":
            self.indexes, self.timesteps = self.get_timesteps_edm(N=steps, device=device)
            self.timesteps = self.convert_to_timesteps(self.indexes, device)
        else:
            raise ValueError(f"Unsupported timestep strategy {skip_type}")

        print("Indexes", self.indexes)
        print("Time steps", self.timesteps)
        print("LogSNR steps", self.noise_schedule.marginal_lambda(self.timesteps))

        # store high-order exponential coefficients (lazy)
        self.exp_coeffs = {}

    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        return self.model(x, t)

    def convert_to_indexes(self, timesteps):
        logSNR_steps = self.noise_schedule.marginal_lambda(timesteps)
        indexes = list(
            (self.statistics_steps * (logSNR_steps - self.lambda_T) / (self.lambda_0 - self.lambda_T))
            .round()
            .cpu()
            .numpy()
            .astype(np.int64)
        )
        return indexes

    def convert_to_timesteps(self, indexes, device):
        logSNR_steps = (
            self.lambda_T + (self.lambda_0 - self.lambda_T) * torch.Tensor(indexes).to(device) / self.statistics_steps
        )
        return self.noise_schedule.inverse_lambda(logSNR_steps)

    def append_zero(self, x):
        return torch.cat([x, x.new_zeros([1])])

    def get_sigmas_karras(self, n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
        """Constructs the noise schedule of Karras et al. (2022)."""
        ramp = torch.linspace(0, 1, n)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        return self.append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)
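
    # Illustrative note (values are made up): rho controls the spacing of the ramp, e.g.
    #   self.get_sigmas_karras(8, 0.03, 14.6, rho=7.0)
    # returns 9 sigmas (appended final 0 included) clustered toward sigma_min, the
    # standard Karras spacing; a smaller rho (e.g. the 1.2 used below) spaces them
    # more evenly.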

    def sigma_to_t(self, sigma, quantize=None):
        # Duplicate of NoiseScheduleVP.sigma_to_t; quantization is likewise always disabled.
        quantize = False
        log_sigma = sigma.log()
        dists = log_sigma - self.noise_schedule.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.noise_schedule.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.noise_schedule.log_sigmas[low_idx], self.noise_schedule.log_sigmas[high_idx]
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)

    def get_time_steps(self, skip_type, t_T, t_0, N, device):
        """Compute the intermediate time steps for sampling.

        Args:
            skip_type: A `str`. The type for the spacing of the time steps. We support four types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
                - 'customed_time_karras': a two-stage Karras-style spacing tuned for few-step sampling (N in {5, 6, 8}).
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            N: An `int`. The total number of the spacing of the time steps.
            device: A torch device.
        Returns:
            A pytorch tensor of the time steps, with the shape (N + 1,).
        """
        if skip_type == "logSNR":
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == "time_uniform":
            return torch.linspace(t_T, t_0, N + 1).to(device)
        elif skip_type == "time_quadratic":
            t_order = 2
            t = torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
            return t
        elif skip_type == "customed_time_karras":
            sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
            sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
            if N == 8:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
                ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 5:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 6:
                sigmas = self.sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
            return none_k_ct
        else:
            raise ValueError(
                "Unsupported skip_type {}, need to be 'logSNR', 'time_uniform', 'time_quadratic' or 'customed_time_karras'".format(skip_type)
            )
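
    # How 'customed_time_karras' works, in short: first build a Karras ramp in sigma
    # space, convert two of its endpoints to continuous timesteps, build a second
    # Karras-style ramp with rho=1.2 directly in t space between those endpoints, map
    # it back to sigmas via get_special_sigmas_with_timesteps, and normalize to t/999.
    # E.g. for N == 5 (illustrative restatement of the branch above):
    #
    #   sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
    #   ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2,
    #                               device='cpu', need_append_zero=False)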

    def get_timesteps_edm(self, N, device):
        """Constructs the EDM timestep schedule of Karras et al. (2022)."""

        rho = 7.0  # 7.0 is the value used in the paper

        sigma_min: float = np.exp(-self.lambda_0)
        sigma_max: float = np.exp(-self.lambda_T)
        ramp = np.linspace(0, 1, N + 1)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        lambdas = torch.Tensor(-np.log(sigmas)).to(device)
        timesteps = self.noise_schedule.inverse_lambda(lambdas)

        indexes = list(
            (self.statistics_steps * (lambdas - self.lambda_T) / (self.lambda_0 - self.lambda_T))
            .round()
            .cpu()
            .numpy()
            .astype(np.int64)
        )
        return indexes, timesteps

    def get_g(self, f_t, i_s, i_t):
        # g is the model output after the precomputed (l, s, b) statistics transform;
        # f is the l-transformed output (see the multistep updates below).
        return torch.exp(self.S[i_s] - self.S[i_t]) * f_t - torch.exp(self.S[i_s]) * (self.B[i_t] - self.B[i_s])

    def compute_exponential_coefficients_high_order(self, i_s, i_t, order=2):
        key = (i_s, i_t, order)
        if key in self.exp_coeffs.keys():
            coeffs = self.exp_coeffs[key]
        else:
            n = order - 1
            a = self.L[i_s : i_t + 1] + self.S[i_s : i_t + 1] - self.L[i_s] - self.S[i_s]
            x = self.ts[i_s : i_t + 1]
            b = (self.ts[i_s : i_t + 1] - self.ts[i_s]) ** n / math.factorial(n)
            coeffs = weighted_cumsumexp_trapezoid_torch(a, x, b, cumsum=False)
            self.exp_coeffs[key] = coeffs
        return coeffs

    def compute_high_order_derivatives(self, n, lambda_0n, g_0n, pseudo=False):
        # return g^(1), ..., g^(n)
        if pseudo:
            # pseudo-order: Newton-style divided differences
            D = [[] for _ in range(n + 1)]
            D[0] = g_0n
            for i in range(1, n + 1):
                for j in range(n - i + 1):
                    D[i].append((D[i - 1][j] - D[i - 1][j + 1]) / (lambda_0n[j] - lambda_0n[i + j]))

            return [D[i][0] * math.factorial(i) for i in range(1, n + 1)]
        else:
            # full order: solve the Vandermonde-type linear system in the lambda offsets
            R = []
            for i in range(1, n + 1):
                R.append(torch.pow(lambda_0n[1:] - lambda_0n[0], i))
            R = torch.stack(R).t()
            B = (torch.stack(g_0n[1:]) - g_0n[0]).reshape(n, -1)
            shape = g_0n[0].shape
            solution = torch.linalg.inv(R) @ B
            solution = solution.reshape([n] + list(shape))
            return [solution[i - 1] * math.factorial(i) for i in range(1, n + 1)]

    def multistep_predictor_update(self, x_lst, eps_lst, time_lst, index_lst, t, i_t, order=1, pseudo=False):
        # x_lst: [..., x_s]
        # eps_lst: [..., eps_s]
        # time_lst: [..., time_s]
        ns = self.noise_schedule
        n = order - 1
        indexes = [-i - 1 for i in range(n + 1)]
        x_0n = index_list(x_lst, indexes)
        eps_0n = index_list(eps_lst, indexes)
        time_0n = torch.FloatTensor(index_list(time_lst, indexes)).cuda()
        index_0n = index_list(index_lst, indexes)
        lambda_0n = ns.marginal_lambda(time_0n)
        alpha_0n = ns.marginal_alpha(time_0n)
        sigma_0n = ns.marginal_std(time_0n)

        alpha_s, alpha_t = alpha_0n[0], ns.marginal_alpha(t)
        i_s = index_0n[0]
        x_s = x_0n[0]
        g_0n = []
        for i in range(n + 1):
            f_i = (sigma_0n[i] * eps_0n[i] - self.l[index_0n[i]] * x_0n[i]) / alpha_0n[i]
            g_i = self.get_g(f_i, index_0n[0], index_0n[i])
            g_0n.append(g_i)
        g_0 = g_0n[0]
        x_t = (
            alpha_t / alpha_s * torch.exp(self.L[i_s] - self.L[i_t]) * x_s
            - alpha_t * torch.exp(-self.L[i_t] - self.S[i_s]) * (self.I[i_t] - self.I[i_s]) * g_0
            - alpha_t
            * torch.exp(-self.L[i_t])
            * (self.C[i_t] - self.C[i_s] - self.B[i_s] * (self.I[i_t] - self.I[i_s]))
        )
        if order > 1:
            g_d = self.compute_high_order_derivatives(n, lambda_0n, g_0n, pseudo=pseudo)
            for i in range(order - 1):
                x_t = (
                    x_t
                    - alpha_t
                    * torch.exp(self.L[i_s] - self.L[i_t])
                    * self.compute_exponential_coefficients_high_order(i_s, i_t, order=i + 2)
                    * g_d[i]
                )
        return x_t

    def multistep_corrector_update(self, x_lst, eps_lst, time_lst, index_lst, order=1, pseudo=False):
        # x_lst: [..., x_s, x_t]
        # eps_lst: [..., eps_s, eps_t]
        # time_lst: [..., time_s, time_t]
        ns = self.noise_schedule
        n = order - 1
        indexes = [-i - 1 for i in range(n + 1)]
        indexes[0] = -2
        indexes[1] = -1
        x_0n = index_list(x_lst, indexes)
        eps_0n = index_list(eps_lst, indexes)
        time_0n = torch.FloatTensor(index_list(time_lst, indexes)).cuda()
        index_0n = index_list(index_lst, indexes)
        lambda_0n = ns.marginal_lambda(time_0n)
        alpha_0n = ns.marginal_alpha(time_0n)
        sigma_0n = ns.marginal_std(time_0n)

        alpha_s, alpha_t = alpha_0n[0], alpha_0n[1]
        i_s, i_t = index_0n[0], index_0n[1]
        x_s = x_0n[0]
        g_0n = []
        for i in range(n + 1):
            f_i = (sigma_0n[i] * eps_0n[i] - self.l[index_0n[i]] * x_0n[i]) / alpha_0n[i]
            g_i = self.get_g(f_i, index_0n[0], index_0n[i])
            g_0n.append(g_i)
        g_0 = g_0n[0]
        x_t_new = (
            alpha_t / alpha_s * torch.exp(self.L[i_s] - self.L[i_t]) * x_s
            - alpha_t * torch.exp(-self.L[i_t] - self.S[i_s]) * (self.I[i_t] - self.I[i_s]) * g_0
            - alpha_t
            * torch.exp(-self.L[i_t])
            * (self.C[i_t] - self.C[i_s] - self.B[i_s] * (self.I[i_t] - self.I[i_s]))
        )
        if order > 1:
            g_d = self.compute_high_order_derivatives(n, lambda_0n, g_0n, pseudo=pseudo)
            for i in range(order - 1):
                x_t_new = (
                    x_t_new
                    - alpha_t
                    * torch.exp(self.L[i_s] - self.L[i_t])
                    * self.compute_exponential_coefficients_high_order(i_s, i_t, order=i + 2)
                    * g_d[i]
                )
        return x_t_new

    def sample(
        self,
        x,
        model_fn,
        order,
        p_pseudo,
        use_corrector,
        c_pseudo,
        lower_order_final,
        start_free_u_step=None,
        free_u_apply_callback=None,
        free_u_stop_callback=None,
        half=False,
        return_intermediate=False,
    ):
        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        steps = self.steps
        cached_x = []
        cached_model_output = []
        cached_time = []
        cached_index = []
        indexes, timesteps = self.indexes, self.timesteps
        step_p_order = 0
        if free_u_stop_callback is not None:
            free_u_stop_callback()
        for step in range(1, steps + 1):
            if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None:
                free_u_apply_callback()
            cached_x.append(x)
            cached_model_output.append(self.noise_prediction_fn(x, timesteps[step - 1]))
            cached_time.append(timesteps[step - 1])
            cached_index.append(indexes[step - 1])
            if use_corrector and (timesteps[step - 1] > 0.5 or not half):
                step_c_order = step_p_order + c_pseudo
                if step_c_order > 1:
                    x_new = self.multistep_corrector_update(
                        cached_x, cached_model_output, cached_time, cached_index, order=step_c_order, pseudo=c_pseudo
                    )
                    sigma_t = self.noise_schedule.marginal_std(cached_time[-1])
                    l_t = self.l[cached_index[-1]]
                    N_old = sigma_t * cached_model_output[-1] - l_t * cached_x[-1]
                    cached_x[-1] = x_new
                    cached_model_output[-1] = (N_old + l_t * cached_x[-1]) / sigma_t
            if step < order:
                step_p_order = step
            else:
                step_p_order = order
            if lower_order_final:
                step_p_order = min(step_p_order, steps + 1 - step)
            t = timesteps[step]
            i_t = indexes[step]

            x = self.multistep_predictor_update(
                cached_x, cached_model_output, cached_time, cached_index, t, i_t, order=step_p_order, pseudo=p_pseudo
            )

        if return_intermediate:
            return x, cached_x
        else:
            return x


#############################################################
# other utility functions
#############################################################


def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K),
            torch.tensor(K - 2, device=x.device),
            cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K),
            torch.tensor(K - 2, device=x.device),
            cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
    return cand


def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dimension `dims`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    return v[(...,) + (None,) * (dims - 1)]
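
# Example (illustrative): expand_dims(torch.ones(4), 4).shape == (4, 1, 1, 1), which
# lets a per-sample scalar broadcast against a (4, C, H, W) latent batch.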
free_lunch_utils.py
ADDED
@@ -0,0 +1,303 @@
import torch
import torch.fft as fft
from diffusers.utils import is_torch_version
from typing import Any, Dict, List, Optional, Tuple, Union


def isinstance_str(x: object, cls_name: str):
    """
    Checks whether x has any class *named* cls_name in its ancestry.
    Doesn't require access to the class's implementation.

    Useful for patching!
    """
    for _cls in x.__class__.__mro__:
        if _cls.__name__ == cls_name:
            return True

    return False


def Fourier_filter(x, threshold, scale):
    dtype = x.dtype
    x = x.type(torch.float32)
    # FFT
    x_freq = fft.fftn(x, dim=(-2, -1))
    x_freq = fft.fftshift(x_freq, dim=(-2, -1))

    B, C, H, W = x_freq.shape
    mask = torch.ones((B, C, H, W), device=x.device)  # was .cuda(); follow the input's device

    crow, ccol = H // 2, W // 2
    mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = scale
    x_freq = x_freq * mask

    # IFFT
    x_freq = fft.ifftshift(x_freq, dim=(-2, -1))
    x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real

    x_filtered = x_filtered.type(dtype)
    return x_filtered
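
# Example (illustrative values): for a decoder feature map h of shape (B, C, H, W),
# Fourier_filter(h, threshold=1, scale=0.9) keeps the shape and damps only the 2x2
# lowest-frequency band around the spectrum center by 0.9, which is how the FreeU
# registration helpers below use it on skip features.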


def register_upblock2d(model):
    def up_forward(self):
        def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
            for resnet in self.resnets:
                # pop res hidden states
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                #print(f"in upblock2d, hidden states shape: {hidden_states.shape}")
                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)

                        return custom_forward

                    if is_torch_version(">=", "1.11.0"):
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
                        )
                    else:
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb
                        )
                else:
                    hidden_states = resnet(hidden_states, temb)

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "UpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)


def register_free_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2):
    def up_forward(self):
        def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
            for resnet in self.resnets:
                # pop res hidden states
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                #print(f"in free upblock2d, hidden states shape: {hidden_states.shape}")

                # --------------- FreeU code -----------------------
                # Only operate on the first two stages
                if hidden_states.shape[1] == 1280:
                    hidden_states[:, :640] = hidden_states[:, :640] * self.b1
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
                if hidden_states.shape[1] == 640:
                    hidden_states[:, :320] = hidden_states[:, :320] * self.b2
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
                # ---------------------------------------------------------

                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)

                        return custom_forward

                    if is_torch_version(">=", "1.11.0"):
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
                        )
                    else:
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb
                        )
                else:
                    hidden_states = resnet(hidden_states, temb)

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "UpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
            setattr(upsample_block, 'b1', b1)
            setattr(upsample_block, 'b2', b2)
            setattr(upsample_block, 's1', s1)
            setattr(upsample_block, 's2', s2)
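
# A minimal toggle sketch (assuming a diffusers pipeline `pipe` with `pipe.unet`; the
# values below are the FreeU-style settings used by sampler.py in this Space):
#
#   register_free_upblock2d(pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.2)   # enable
#   ...  # run sampling
#   register_free_upblock2d(pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)   # disable (identity)
#
# b1/b2 amplify backbone channels, s1/s2 damp the low-frequency part of the skip
# features; setting all four to 1.0 restores the vanilla forward pass.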


def register_crossattn_upblock2d(model):
    def up_forward(self):
        def forward(
            hidden_states: torch.FloatTensor,
            res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            upsample_size: Optional[int] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            for resnet, attn in zip(self.resnets, self.attentions):
                # pop res hidden states
                #print(f"in crossatten upblock2d, hidden states shape: {hidden_states.shape}")
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        **ckpt_kwargs,
                    )
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(attn, return_dict=False),
                        hidden_states,
                        encoder_hidden_states,
                        None,  # timestep
                        None,  # class_labels
                        cross_attention_kwargs,
                        attention_mask,
                        encoder_attention_mask,
                        **ckpt_kwargs,
                    )[0]
                else:
                    hidden_states = resnet(hidden_states, temb)
                    hidden_states = attn(
                        hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        cross_attention_kwargs=cross_attention_kwargs,
                        attention_mask=attention_mask,
                        encoder_attention_mask=encoder_attention_mask,
                        return_dict=False,
                    )[0]

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)


def register_free_crossattn_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2):
    def up_forward(self):
        def forward(
            hidden_states: torch.FloatTensor,
            res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            upsample_size: Optional[int] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            for resnet, attn in zip(self.resnets, self.attentions):
                # pop res hidden states
                #print(f"in free crossatten upblock2d, hidden states shape: {hidden_states.shape}")
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]

                # --------------- FreeU code -----------------------
                # Only operate on the first two stages
                if hidden_states.shape[1] == 1280:
                    hidden_states[:, :640] = hidden_states[:, :640] * self.b1
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
                if hidden_states.shape[1] == 640:
                    hidden_states[:, :320] = hidden_states[:, :320] * self.b2
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
                # ---------------------------------------------------------

                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        **ckpt_kwargs,
                    )
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(attn, return_dict=False),
                        hidden_states,
                        encoder_hidden_states,
                        None,  # timestep
                        None,  # class_labels
                        cross_attention_kwargs,
                        attention_mask,
                        encoder_attention_mask,
                        **ckpt_kwargs,
                    )[0]
                else:
                    hidden_states = resnet(hidden_states, temb)
                    hidden_states = attn(
                        hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        cross_attention_kwargs=cross_attention_kwargs,
                    )[0]

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
            setattr(upsample_block, 'b1', b1)
            setattr(upsample_block, 'b2', b2)
            setattr(upsample_block, 's1', s1)
            setattr(upsample_block, 's2', s2)
requirements.txt
ADDED
@@ -0,0 +1,14 @@
tqdm
einops
pytorch_lightning
accelerate>=0.20.0
torchsde
pycocotools
diffusers==0.32.2
timm
transformers==4.49
torch>=2.0.0
opencv-python
omegaconf
gradio==3.45.0
spandrel
sampler.py
ADDED
@@ -0,0 +1,315 @@
"""SAMPLING ONLY."""

import torch

from dpm_solver_v3 import NoiseScheduleVP, model_wrapper, DPM_Solver_v3
from uni_pc import UniPC
from free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d


class DPMSolverv3Sampler:
    def __init__(self, stats_dir, pipe, steps, guidance_scale, **kwargs):
        super().__init__()
        self.model = pipe
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(pipe.device)
        DTYPE = torch.float32  # torch.float16 works as well, but pictures seem to be a bit worse
        device = "cuda"
        noise_scheduler = pipe.scheduler
        alpha_schedule = noise_scheduler.alphas_cumprod.to(device=device, dtype=DTYPE)
        self.alphas_cumprod = alpha_schedule  # to_torch(model.alphas_cumprod)
        self.device = device
        self.guidance_scale = guidance_scale

        self.ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod)

        assert stats_dir is not None, f"No statistics file found in {stats_dir}."
        print("Use statistics", stats_dir)
        self.dpm_solver_v3 = DPM_Solver_v3(
            statistics_dir=stats_dir,
            noise_schedule=self.ns,
            steps=steps,
            t_start=None,
            t_end=None,
            skip_type="customed_time_karras",
            degenerated=False,
            device=self.device,
        )
        self.steps = steps

    @torch.no_grad()
    def apply_free_unet(self):
        register_free_upblock2d(self.model, b1=1.1, b2=1.1, s1=0.9, s2=0.2)
        register_free_crossattn_upblock2d(self.model, b1=1.1, b2=1.1, s1=0.9, s2=0.2)

    @torch.no_grad()
    def stop_free_unet(self):
        register_free_upblock2d(self.model, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
        register_free_crossattn_upblock2d(self.model, b1=1.0, b2=1.0, s1=1.0, s2=1.0)

    @torch.no_grad()
    def sample(
        self,
        batch_size,
        shape,
        conditioning=None,
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
        **kwargs,
    ):
        if conditioning is not None:
            cond_in = torch.cat([unconditional_conditioning, conditioning])  # (kept for reference; not used below)
            # extra_args = {'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': self.guidance_scale}
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model.unet(x, t, encoder_hidden_states=c).sample,
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model.unet(x, t, encoder_hidden_states=c).sample,
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning,
                unconditional_condition=unconditional_conditioning,
                guidance_scale=self.guidance_scale,
            )
            if self.steps == 8:
                ORDER = 2
            else:
                ORDER = 1

        x = self.dpm_solver_v3.sample(
            img,
            model_fn,
            order=ORDER,
            p_pseudo=False,
            c_pseudo=True,
            lower_order_final=True,
            use_corrector=use_corrector,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            half=half,
        )

        return x.to(self.device), None
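
# A minimal usage sketch for DPMSolverv3Sampler (assumptions: `pipe` is a loaded
# diffusers pipeline, `stats_dir` contains the DPM-Solver-v3 statistics files
# (l.npz / sb.npz), and `cond` / `uncond` are precomputed text embeddings; all names
# and numbers here are illustrative):
#
#   sampler = DPMSolverv3Sampler(stats_dir, pipe, steps=8, guidance_scale=7.5)
#   latents, _ = sampler.sample(
#       batch_size=1, shape=(4, 96, 96),
#       conditioning=cond, unconditional_conditioning=uncond,
#       use_corrector=True,
#   )
#   # decode `latents` with pipe.vae as usual.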

class UniPCSampler:
    def __init__(
        self,
        pipe,
        model_closure,
        steps,
        guidance_scale,
        denoise_to_zero=False,
        need_fp16_discrete_method=False,
        ultilize_vae_in_fp16=False,
        is_high_resoulution=True,
        skip_type="customed_time_karras",
        force_not_use_afs=False,
        **kwargs,
    ):
        super().__init__()
        self.model = model_closure(pipe)
        self.pipe = pipe
        self.need_fp16_discrete_method = need_fp16_discrete_method
        DTYPE = self.pipe.unet.dtype  # torch.float16 works as well, but pictures seem to be a bit worse
        device = self.pipe.device
        noise_scheduler = pipe.scheduler
        alpha_schedule = noise_scheduler.alphas_cumprod.to(device=device, dtype=DTYPE)
        self.alphas_cumprod = alpha_schedule
        self.device = device
        self.guidance_scale = guidance_scale
        # AFS (analytical first step) is only used for few-step, high-resolution runs
        self.use_afs = steps <= 8 and is_high_resoulution and not force_not_use_afs

        self.ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod)

        self.unipc_solver = UniPC(
            noise_schedule=self.ns,
            steps=steps,
            t_start=None,
            t_end=None,
            skip_type=skip_type,
            degenerated=False,
            use_afs=self.use_afs,
            device=self.device,
            denoise_to_zero=denoise_to_zero,
            need_fp16_discrete_method=self.need_fp16_discrete_method,
            ultilize_vae_in_fp16=ultilize_vae_in_fp16,
            is_high_resoulution=is_high_resoulution,
        )
        self.steps = steps

    @torch.no_grad()
    def apply_free_unet(self):
        register_free_upblock2d(self.pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.2)
        register_free_crossattn_upblock2d(self.pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.2)

    @torch.no_grad()
    def stop_free_unet(self):
        # resetting all FreeU factors to 1.0 restores the vanilla UNet behavior
        register_free_upblock2d(self.pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
        register_free_crossattn_upblock2d(self.pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)

    @torch.no_grad()
    def sample(
        self,
        batch_size,
        shape,
        conditioning=None,  # has to come in the same format as the conditioning, e.g. as encoded tokens
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        xl_preprocess_closure=None,
        npnet=None,
        **kwargs,
    ):
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        new_img = None
        if xl_preprocess_closure is not None:
            prompt_embeds, cond_kwargs = xl_preprocess_closure(
                pipe=self.pipe,
                prompts=conditioning,
                need_cfg=True,
                device=self.device,
                negative_prompts=unconditional_conditioning,
            )
        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T
        if xl_preprocess_closure is not None and npnet is not None:
            c, _ = prompt_embeds
            c = c.unsqueeze(0)  # add dummy dimension for npnet
            new_img = npnet(img, c)

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning if xl_preprocess_closure is None else prompt_embeds,
                unconditional_condition=unconditional_conditioning if xl_preprocess_closure is None else cond_kwargs,
                guidance_scale=self.guidance_scale,
            )
            if self.steps >= 7:
                ORDER = 2
            else:
                ORDER = 1

        x, full_cache = self.unipc_solver.sample(
            x=img,
            model_fn=model_fn,
            order=ORDER,
            use_corrector=use_corrector,
            lower_order_final=True,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            npnet_x=new_img if new_img is not None else None,
            npnet_scale=self.guidance_scale if new_img is not None else None,
            half=half,
        )

        return x.to(self.device), full_cache
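
    # Minimal usage sketch for the method above (illustrative; `pipe`, `closure`,
    # `cond` and `uncond` are placeholders for a diffusers pipeline, the UNet
    # closure passed to __init__, and prompt embeddings):
    #
    #     sampler = UniPCSampler(pipe, closure, steps=8, guidance_scale=7.5)
    #     latents, cache = sampler.sample(
    #         batch_size=1,
    #         shape=(4, 128, 128),  # latent C, H, W, e.g. a 1024x1024 image with VAE factor 8
    #         conditioning=cond,
    #         unconditional_conditioning=uncond,
    #         use_corrector=True,
    #     )
    #     image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample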

    @torch.no_grad()
    def sample_mix(
        self,
        batch_size,
        shape,
        conditioning=None,  # has to come in the same format as the conditioning, e.g. as encoded tokens
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        xl_preprocess_closure=None,
        npnet=None,
        **kwargs,
    ):
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        if xl_preprocess_closure is not None:
            prompt_embeds, cond_kwargs = xl_preprocess_closure(
                pipe=self.pipe,
                prompts=conditioning,
                need_cfg=True,
                device=self.device,
                negative_prompts=unconditional_conditioning,
            )
        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T
        if xl_preprocess_closure is not None and npnet is not None:
            c, _ = prompt_embeds
            c = c.unsqueeze(0)  # add dummy dimension for npnet
            img = npnet(img, c)

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning if xl_preprocess_closure is None else prompt_embeds,
                unconditional_condition=unconditional_conditioning if xl_preprocess_closure is None else cond_kwargs,
                guidance_scale=self.guidance_scale,
            )
            if self.steps >= 8 and not self.need_fp16_discrete_method:
                ORDER = 2
            else:
                ORDER = 1

        x, full_cache = self.unipc_solver.sample_mix(
            x=img,
            model_fn=model_fn,
            order=ORDER,
            use_corrector=use_corrector,
            lower_order_final=True,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            half=half,
        )

        return x.to(self.device), full_cache

uni_pc.py
ADDED
@@ -0,0 +1,757 @@
from dpm_solver_v3 import NoiseScheduleVP, model_wrapper
import torch
import torch.nn.functional as F
import math
import numpy as np
import os


class UniPC:
    def __init__(
        self,
        noise_schedule,
        steps=10,
        t_start=None,
        t_end=None,
        skip_type="customed_time_karras",
        degenerated=False,
        use_afs=False,
        denoise_to_zero=False,
        need_fp16_discrete_method=False,
        ultilize_vae_in_fp16=False,
        is_high_resoulution=True,
        device="cuda",
    ):
        self.device = device
        self.model = None
        self.noise_schedule = noise_schedule
        # AFS inserts an extra (midpoint) timestep, so the solver runs one more transition
        self.steps = steps if not use_afs else steps + 1
        self.use_afs = use_afs
        self.ultilize_vae_in_fp16 = ultilize_vae_in_fp16
        self.need_fp16_discrete_method = need_fp16_discrete_method
        t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        self.is_high_resolution = is_high_resoulution
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of the betas array"

        # precompute timesteps
        if skip_type in ("logSNR", "time_uniform", "time_quadratic", "customed_time_karras"):
            self.timesteps = self.get_time_steps(
                skip_type,
                t_T=t_T,
                t_0=t_0,
                N=steps,
                device=device,
                denoise_to_zero=denoise_to_zero,
                is_high_resolution=self.is_high_resolution,
            )
        else:
            raise ValueError(f"Unsupported timestep strategy {skip_type}")
        self.lambda_T = self.timesteps[0].cpu().item()
        self.lambda_0 = self.timesteps[-1].cpu().item()

        # store high-order exponential coefficients (lazy)
        self.exp_coeffs = {}

    def noise_prediction_fn(self, x, t):
        """Return the noise prediction of the wrapped model."""
        return self.model(x, t)

    def append_zero(self, x):
        return torch.cat([x, x.new_zeros([1])])

    def get_sigmas_karras(self, n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
        """Constructs the noise schedule of Karras et al. (2022)."""
        ramp = torch.linspace(0, 1, n)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        return self.append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)
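
    # The schedule above follows Karras et al. (2022):
    #     sigma_i = (sigma_max^(1/rho) + (i / (n - 1)) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho
    # for i = 0, ..., n - 1. Larger rho packs more of the n points near sigma_min;
    # e.g. n=5, sigma_min=0.1, sigma_max=10, rho=7 gives roughly
    # [10.0, 4.07, 1.45, 0.43, 0.1] (plus the appended trailing zero).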

    def sigma_to_t(self, sigma, quantize=None):
        """Convert a sigma value to a (possibly fractional) timestep index."""
        quantize = False  # the `quantize` argument is deliberately ignored; we always interpolate
        log_sigma = sigma.log()
        dists = log_sigma - self.noise_schedule.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.noise_schedule.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.noise_schedule.log_sigmas[low_idx], self.noise_schedule.log_sigmas[high_idx]
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)
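
    # sigma_to_t interpolates linearly in log-sigma space: with i the largest index
    # such that log_sigmas[i] <= log(sigma) (assuming ascending log_sigmas),
    #     w = clamp((log(sigma) - log_sigmas[i]) / (log_sigmas[i + 1] - log_sigmas[i]), 0, 1)
    #     t = (1 - w) * i + w * (i + 1)
    # so the returned t is a fractional index usable by continuous-time schedules.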

    def get_time_steps(self, skip_type, t_T, t_0, N, device, denoise_to_zero=False, is_high_resolution=True):
        """Compute the intermediate time steps for sampling.

        Args:
            skip_type: A `str`. The type for the spacing of the time steps. We support four types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
                - 'customed_time_karras': a Karras sigma grid re-warped in time by a second,
                  nearly-linear Karras schedule (rho=1.2); the variant used by this solver.
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            N: An `int`. The total number of the spacing of the time steps.
            device: A torch device.
        Returns:
            A pytorch tensor of the time steps, with the shape (N + 1,).
        """
        if skip_type == "logSNR":
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == "time_uniform":
            return torch.linspace(t_T, t_0, N + 1).to(device)
        elif skip_type == "time_quadratic":
            t_order = 2
            t = torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
            return t
        elif skip_type == "customed_time_karras" and is_high_resolution:
            sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
            sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
            if N == 8:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
                if not self.need_fp16_discrete_method:
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[10])
                    ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                else:
                    sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                    ct = self.get_sigmas_karras(8, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    tmp_t = [self.noise_schedule.sigma_to_t(sigma).to('cpu') for sigma in sigmas_ct]
                    real_ct = [t / 999 for t in tmp_t]
            elif N == 5:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                if not self.need_fp16_discrete_method:
                    sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
                    ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                else:
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                    ct = self.get_sigmas_karras(5, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 6:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                if not self.need_fp16_discrete_method:
                    sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[10])
                    ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                else:
                    if denoise_to_zero:
                        ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                        ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                        sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                        real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                        real_ct.append(torch.tensor(t_0).to(dtype=real_ct[-1].dtype, device='cpu'))
                    else:
                        sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
                        ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[7])
                        ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                        sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                        real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 7:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                if not self.need_fp16_discrete_method:
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                    ct = self.get_sigmas_karras(8, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                else:
                    if denoise_to_zero:
                        ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                        ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                        sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                        real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                        real_ct.append(torch.tensor(t_0).to(dtype=real_ct[-1].dtype, device='cpu'))

            if self.use_afs:
                # AFS inserts the midpoint between the first two times as an extra step
                tmp_t = (real_ct[0] + real_ct[1]) / 2
                real_ct.insert(1, tmp_t)
            none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
            return none_k_ct
        elif skip_type == "customed_time_karras" and not is_high_resolution:
            sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
            sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
            if N == 8:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
                ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 5:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 6:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
            return none_k_ct
        else:
            raise ValueError(
                "Unsupported skip_type {}, need to be 'logSNR', 'time_uniform', 'time_quadratic' or 'customed_time_karras'".format(skip_type)
            )
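
    # Illustrative sketch (commented out because it assumes the NoiseScheduleVP
    # instance exposes `sigmas`, `log_sigmas` and `get_special_sigmas_with_timesteps`,
    # as the code above requires): inspecting the custom schedule before sampling.
    #
    #     solver = UniPC(noise_schedule=ns, steps=8, skip_type="customed_time_karras")
    #     print(solver.timesteps)                      # (steps + 1,) times, decreasing towards 0
    #     print(ns.marginal_lambda(solver.timesteps))  # the log-SNR grid the solver steps over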

    def multistep_uni_pc_update(self, x, model_prev_list: list, t_prev_list: list, t, order, **kwargs):
        if len(model_prev_list) == 0 or len(t_prev_list) == 0:
            return None, None
        if len(t.shape) == 0:
            t = t.view(-1)
        # only the B(h) variant is used; multistep_uni_pc_vary_update below is kept for reference
        return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)

    def multistep_uni_pc_sde_update(self, x, model_prev_list: list, t_prev_list: list, t, order, level=1.0, **kwargs):
        if len(model_prev_list) == 0 or len(t_prev_list) == 0:
            return None, None
        if len(t.shape) == 0:
            t = t.view(-1)
        return self.multistep_uni_pc_bh_sde_update(x, model_prev_list, t_prev_list, t, level=level, order=order, **kwargs)

    def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
        # unified predictor-corrector with the given order (solver type: B(h))
        ns = self.noise_schedule
        assert order <= len(model_prev_list)
        dims = x.dim()

        # first compute rks
        t_prev_0 = t_prev_list[-1]
        lambda_prev_0 = ns.marginal_lambda(t_prev_0)
        lambda_t = ns.marginal_lambda(t)
        model_prev_0 = model_prev_list[-1]
        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        alpha_t = torch.exp(log_alpha_t)

        h = lambda_t - lambda_prev_0

        rks = []
        D1s = []
        for i in range(1, order):
            t_prev_i = t_prev_list[-(i + 1)]
            model_prev_i = model_prev_list[-(i + 1)]
            lambda_prev_i = ns.marginal_lambda(t_prev_i)
            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
            rks.append(rk)
            D1s.append((model_prev_i - model_prev_0) / rk)

        rks.append(1.)
        rks = torch.tensor(rks, device=x.device)

        R = []
        b = []

        hh = h[0]
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        B_h = hh  # B(h) = h; torch.expm1(hh) would give the B(h) = e^h - 1 variant

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= (i + 1)
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=x.device)

        # now predictor
        use_predictor = len(D1s) > 0 and x_t is None
        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)  # (B, K, C, H, W)
            if x_t is None:
                # for order 2, we use a simplified version
                if order == 2:
                    rhos_p = torch.tensor([0.5], device=b.device)
                else:
                    rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
        else:
            D1s = None

        if use_corrector:
            # for order 1, we use a simplified version
            if order == 1:
                rhos_c = torch.tensor([0.5], device=b.device)
            else:
                rhos_c = torch.linalg.solve(R, b)

        model_t = None

        x_t_ = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
        )
        if x_t is None:
            if use_predictor:
                pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
            else:
                pred_res = 0
            x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * pred_res

        if use_corrector:
            model_t = self.noise_prediction_fn(x_t, t)
            if D1s is not None:
                corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = (model_t - model_prev_0)
            x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)

        return x_t, model_t
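
    # In epsilon-prediction form, the B(h) update above computes (h = lambda_t - lambda_s)
    #     x_t = (alpha_t / alpha_s) * x_s
    #           - sigma_t * (e^h - 1) * eps(x_s, s)
    #           - sigma_t * B(h) * sum_k rho_k * D_k,        with B(h) = h here,
    # where the D_k are divided differences of previous model outputs and the weights
    # rho solve the small linear system R @ rho = b assembled from the r_k above.
    # The corrector repeats this with one extra row in R, plugging the fresh model
    # evaluation at t in as the last difference D1_t.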

    def multistep_uni_pc_bh_sde_update(self, x, model_prev_list, t_prev_list, t, order, level=0, x_t=None, use_corrector=True):
        # SDE flavor of the B(h) update: same polynomial machinery, plus scaled noise
        ns = self.noise_schedule
        assert order <= len(model_prev_list)
        dims = x.dim()

        # first compute rks
        t_prev_0 = t_prev_list[-1]
        lambda_prev_0 = ns.marginal_lambda(t_prev_0)
        lambda_t = ns.marginal_lambda(t)
        model_prev_0 = model_prev_list[-1]
        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        alpha_t = torch.exp(log_alpha_t)

        h = lambda_t - lambda_prev_0
        z = torch.randn(x.shape, device=self.device)
        z = sigma_t * torch.sqrt(torch.expm1(2.0 * h[0])) * z

        rks = []
        D1s = []
        for i in range(1, order):
            t_prev_i = t_prev_list[-(i + 1)]
            model_prev_i = model_prev_list[-(i + 1)]
            lambda_prev_i = ns.marginal_lambda(t_prev_i)
            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
            rks.append(rk)
            D1s.append((model_prev_i - model_prev_0) / rk)

        rks.append(1.)
        rks = torch.tensor(rks, device=x.device)

        R = []
        b = []

        hh = h[0]
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        B_h = hh  # B(h) = h; torch.expm1(hh) would give the B(h) = e^h - 1 variant

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= (i + 1)
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=x.device)

        # now predictor
        use_predictor = len(D1s) > 0 and x_t is None
        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)  # (B, K, C, H, W)
            if x_t is None:
                # for order 2, we use a simplified version
                if order == 2:
                    rhos_p = torch.tensor([0.5], device=b.device)
                else:
                    rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
        else:
            D1s = None

        if use_corrector:
            # for order 1, we use a simplified version
            if order == 1:
                rhos_c = torch.tensor([0.5], device=b.device)
            else:
                rhos_c = torch.linalg.solve(R, b)

        model_t = None

        x_t_ = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * (1 + level) * model_prev_0
        )
        if x_t is None:
            if use_predictor:
                pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
            else:
                pred_res = 0

            # the predictor itself stays deterministic; `level` only enters the corrector
            x_t_p = (
                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
                - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
            )
            x_t = x_t_p - expand_dims(sigma_t * B_h, dims) * pred_res

        if use_corrector:
            model_t = self.noise_prediction_fn(x_t, t)
            if D1s is not None:
                corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = (model_t - model_prev_0)
            x_t = x_t_ - (1 + level) * expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t) + z * level

        return x_t, model_t
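
    # The SDE variant above injects fresh noise scaled to the step size,
    #     z ~ N(0, I),   sigma_t * sqrt(e^{2h} - 1) * z,
    # added as `z * level`, while the deterministic drift terms are amplified by
    # (1 + level); setting level = 0 recovers the ODE update exactly.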

    def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
        # 'vary_coeff' variant, kept for reference (the dispatchers above always use B(h))
        ns = self.noise_schedule
        assert order <= len(model_prev_list)
        dims = x.dim()
        # first compute rks
        t_prev_0 = t_prev_list[-1]
        lambda_prev_0 = ns.marginal_lambda(t_prev_0)
        lambda_t = ns.marginal_lambda(t)
        model_prev_0 = model_prev_list[-1]
        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
        log_alpha_t = ns.marginal_log_mean_coeff(t)
        alpha_t = torch.exp(log_alpha_t)

        h = lambda_t - lambda_prev_0

        rks = []
        D1s = []
        for i in range(1, order):
            t_prev_i = t_prev_list[-(i + 1)]
            model_prev_i = model_prev_list[-(i + 1)]
            lambda_prev_i = ns.marginal_lambda(t_prev_i)
            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
            rks.append(rk)
            D1s.append((model_prev_i - model_prev_0) / rk)

        rks.append(1.)
        rks = torch.tensor(rks, device=x.device)

        K = len(rks)
        # build C matrix
        C = []

        col = torch.ones_like(rks)
        for k in range(1, K + 1):
            C.append(col)
            col = col * rks / (k + 1)
        C = torch.stack(C, dim=1)

        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)  # (B, K, C, H, W)
            C_inv_p = torch.linalg.inv(C[:-1, :-1])
            A_p = C_inv_p

        if use_corrector:
            C_inv = torch.linalg.inv(C)
            A_c = C_inv

        hh = h
        h_phi_1 = torch.expm1(hh)
        h_phi_ks = []
        factorial_k = 1
        h_phi_k = h_phi_1
        for k in range(1, K + 2):
            h_phi_ks.append(h_phi_k)
            h_phi_k = h_phi_k / hh - 1 / factorial_k
            factorial_k *= (k + 1)

        model_t = None
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        x_t_ = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
        )
        # now predictor
        x_t = x_t_
        if len(D1s) > 0:
            # compute the residuals for the predictor
            for k in range(K - 1):
                x_t = x_t - expand_dims(sigma_t * h_phi_ks[k + 1], dims) * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
        # now corrector
        if use_corrector:
            model_t = self.noise_prediction_fn(x_t, t)
            D1_t = (model_t - model_prev_0)
            x_t = x_t_
            k = 0
            for k in range(K - 1):
                x_t = x_t - expand_dims(sigma_t * h_phi_ks[k + 1], dims) * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
            x_t = x_t - expand_dims(sigma_t * h_phi_ks[K], dims) * (D1_t * A_c[k][-1])
        return x_t, model_t

    def sample(
        self,
        x,
        model_fn,
        order,
        use_corrector,
        lower_order_final,
        start_free_u_step=None,
        free_u_apply_callback=None,
        free_u_stop_callback=None,
        npnet_x=None,
        npnet_scale=None,
        half=False,
        return_intermediate=False,
    ):
        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        steps = self.steps
        vec_t = self.timesteps[0].expand((x.shape[0]))
        has_called_free_u = False
        if free_u_stop_callback is not None:
            free_u_stop_callback()
        if start_free_u_step is not None and start_free_u_step == 0 and free_u_apply_callback is not None:
            free_u_apply_callback()
            has_called_free_u = True
        if not self.use_afs:
            fir_output = self.noise_prediction_fn(x, vec_t)
        else:
            # AFS: reuse the input noise (or the NPNet-refined noise) as the first prediction
            fir_output = x
            if npnet_x is not None and npnet_scale is not None:
                fir_output = npnet_x
                # fir_output = fir_output - npnet_scale * (npnet_out - fir_output)  # guidance_scale * (noise - noise_uncond)
            x = fir_output.clone().detach().to(fir_output.device)

        model_prev_list = [fir_output]
        full_cache = [fir_output]
        t_prev_list = [vec_t]
        for init_order in range(1, order):
            if start_free_u_step is not None and init_order == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
                free_u_apply_callback()
                has_called_free_u = True
            vec_t = self.timesteps[init_order].expand(x.shape[0])
            x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True)
            if model_x is None:
                model_x = self.noise_prediction_fn(x, vec_t)
                x = model_x.clone().detach().to(torch.float32).to(model_x.device)
            full_cache.append(x)
            model_prev_list.append(model_x)
            t_prev_list.append(vec_t)

        for step in range(order, steps + 1):
            if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
                free_u_apply_callback()
                has_called_free_u = True
            vec_t = self.timesteps[step].expand(x.shape[0])
            if lower_order_final:
                step_order = min(order, steps + 1 - step)
            else:
                step_order = order
            if step == steps:
                # do not run the corrector at the last step
                use_corrector = False
            else:
                use_corrector = True
            x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector)
            for i in range(order - 1):
                t_prev_list[i] = t_prev_list[i + 1]
                model_prev_list[i] = model_prev_list[i + 1]
            t_prev_list[-1] = vec_t
            # we do not need to evaluate the final model value
            full_cache.append(x)
            if step < steps:
                if model_x is None:
                    model_x = self.noise_prediction_fn(x, vec_t)
                model_prev_list[-1] = model_x
        return x, full_cache
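
    # AFS bookkeeping note for the loop above (illustrative): with use_afs=True the
    # schedule gains an inserted midpoint and self.steps == requested_steps + 1, but
    # the first network call is replaced by the input noise (or by npnet_x), so the
    # UNet-evaluation budget stays roughly at the requested step count while the
    # solver gets one extra transition. At the highest noise level the true epsilon
    # is close to the noise itself, which is what makes this "analytical first step"
    # a cheap approximation.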

    def sample_mix(
        self,
        x,
        model_fn,
        order,
        use_corrector,
        lower_order_final,
        start_free_u_step=None,
        free_u_apply_callback=None,
        free_u_stop_callback=None,
        noise_level=0.1,
        half=False,
        return_intermediate=False,
    ):
        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        steps = self.steps
        vec_t = self.timesteps[0].expand((x.shape[0]))
        fir_output = self.noise_prediction_fn(x, vec_t)
        model_prev_list = [fir_output]
        full_cache = [fir_output]
        t_prev_list = [vec_t]
        has_called_free_u = False
        if free_u_stop_callback is not None:
            free_u_stop_callback()
        for init_order in range(1, order):
            if start_free_u_step is not None and init_order == start_free_u_step and free_u_apply_callback is not None:
                free_u_apply_callback()
                has_called_free_u = True
            vec_t = self.timesteps[init_order].expand(x.shape[0])
            # inject noise only once the FreeU window has started
            if start_free_u_step is not None and init_order >= start_free_u_step and free_u_apply_callback is not None:
                x, model_x = self.multistep_uni_pc_sde_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True, level=noise_level)
            else:
                x, model_x = self.multistep_uni_pc_sde_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True, level=0.0)
            if model_x is None:
                model_x = self.noise_prediction_fn(x, vec_t)
                x = model_x.clone().detach().to(torch.float32).to(model_x.device)
            full_cache.append(x)
            model_prev_list.append(model_x)
            t_prev_list.append(vec_t)

        if free_u_stop_callback is not None:
            free_u_stop_callback()
        for step in range(order, steps + 1):
            if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
                free_u_apply_callback()
            vec_t = self.timesteps[step].expand(x.shape[0])
            if lower_order_final:
                step_order = min(order, steps + 1 - step)
            else:
                step_order = order
            if step == steps:
                # do not run the corrector at the last step
                use_corrector = False
            else:
                use_corrector = True
            if start_free_u_step is not None and step >= start_free_u_step and free_u_apply_callback is not None:
                x, model_x = self.multistep_uni_pc_sde_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector, level=noise_level)
            else:
                x, model_x = self.multistep_uni_pc_sde_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector, level=0.0)
            for i in range(order - 1):
                t_prev_list[i] = t_prev_list[i + 1]
                model_prev_list[i] = model_prev_list[i + 1]
            t_prev_list[-1] = vec_t
            # we do not need to evaluate the final model value
            full_cache.append(x)
            if step < steps:
                if model_x is None:
                    model_x = self.noise_prediction_fn(x, vec_t)
                model_prev_list[-1] = model_x
        return x, full_cache


#############################################################
# other utility functions
#############################################################

def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
    return cand
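
# Usage sketch for interpolate_fn (self-contained; shapes follow the docstring):
#
#     xp = torch.tensor([[0.0, 1.0, 2.0]])      # [C=1, K=3] keypoint x-values
#     yp = torch.tensor([[0.0, 10.0, 20.0]])    # [C=1, K=3] keypoint y-values
#     x = torch.tensor([[0.5], [1.5], [3.0]])   # [N=3, C=1] query points
#     interpolate_fn(x, xp, yp)                 # -> [[5.0], [15.0], [30.0]]; x=3.0 extrapolates the last segment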

def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dim `dims`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] whose total dimension is `dims`.
    """
    return v[(...,) + (None,) * (dims - 1)]
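
# Example: expand_dims(torch.ones(4), 4) has shape (4, 1, 1, 1), which lets a
# per-sample scalar broadcast against a (N, C, H, W) latent batch.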