[example] diffuser, support quant inference for stable diffusion (#2186)

pull/2189/head^2
BlueRum 2022-12-23 16:06:29 +08:00 committed by GitHub
parent bc0e271e71
commit 1cf6d92d7c
3 changed files with 116 additions and 4 deletions


@@ -22,6 +22,7 @@ from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from utils import replace_module, getModelSize
def chunk(it, size):
@@ -44,7 +45,6 @@ def load_model_from_config(config, ckpt, verbose=False):
print("unexpected keys:")
print(u)
model.cuda()
model.eval()
return model
@@ -183,6 +183,12 @@ def main():
choices=["full", "autocast"],
default="autocast"
)
parser.add_argument(
"--use_int8",
type=bool,
default=False,
help="use int8 for inference",
)
opt = parser.parse_args()
seed_everything(opt.seed)
@@ -193,6 +199,12 @@ def main():
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
# quantize model
if opt.use_int8:
model = replace_module(model)
# # to compute the model size
# getModelSize(model)
sampler = DDIMSampler(model)
os.makedirs(opt.outdir, exist_ok=True)
@@ -280,3 +292,5 @@ def main():
if __name__ == "__main__":
main()
# # to compute the mem allocated
# print(torch.cuda.max_memory_allocated() / 1024 / 1024)
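
A note on reading the new flag: argparse's type=bool simply calls bool() on the command-line string, so any non-empty value, including --use_int8 False, evaluates to True; the default of False only holds when the flag is omitted. A plain on/off switch avoids that surprise. This is a sketch of the alternative, not part of the commit:

import argparse

parser = argparse.ArgumentParser()
# Present -> True, absent -> False; no string-to-bool conversion involved.
parser.add_argument(
    "--use_int8",
    action="store_true",
    help="replace nn.Linear layers with int8 Linear8bit modules for inference",
)
print(parser.parse_args(["--use_int8"]).use_int8)  # True
print(parser.parse_args([]).use_int8)              # False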


@@ -20,6 +20,7 @@ from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler
from ldm.models.diffusion.dpm_solver import DPMSolverSampler
from utils import replace_module, getModelSize
torch.set_grad_enabled(False)
@@ -43,7 +44,6 @@ def load_model_from_config(config, ckpt, verbose=False):
print("unexpected keys:")
print(u)
model.cuda()
model.eval()
return model
@@ -174,6 +174,12 @@ def parse_args():
default=1,
help="repeat each prompt in file this often",
)
parser.add_argument(
"--use_int8",
type=bool,
default=False,
help="use int8 for inference",
)
opt = parser.parse_args()
return opt
@@ -191,10 +197,17 @@ def main(opt):
config = OmegaConf.load(f"{opt.config}")
model = load_model_from_config(config, f"{opt.ckpt}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
model = model.to(device)
# quantize model
if opt.use_int8:
model = replace_module(model)
# # to compute the model size
# getModelSize(model)
if opt.plms:
sampler = PLMSSampler(model)
elif opt.dpm:
@@ -290,3 +303,5 @@ def main(opt):
if __name__ == "__main__":
opt = parse_args()
main(opt)
# # to compute the mem allocated
# print(torch.cuda.max_memory_allocated() / 1024 / 1024)
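
The commented-out getModelSize and torch.cuda.max_memory_allocated calls in both scripts hint at how the footprint of the quantized model can be checked. A minimal standalone sketch of that measurement pattern, assuming a CUDA device; the single fp16 linear layer is only a stand-in for the diffusion model:

import torch
import torch.nn as nn

layer = nn.Linear(4096, 4096).half().cuda()
x = torch.randn(16, 4096, device="cuda", dtype=torch.float16)

torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    _ = layer(x)

# Peak GPU memory allocated during the forward pass, in MiB.
print(torch.cuda.max_memory_allocated() / 1024 / 1024)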


@@ -0,0 +1,83 @@
import bitsandbytes as bnb
import torch.nn as nn
import torch


class Linear8bit(nn.Linear):
    """nn.Linear replacement that stores int8 weights and runs LLM.int8() matmuls via bitsandbytes."""

    def __init__(
        self,
        input_features,
        output_features,
        bias=True,
        has_fp16_weights=False,
        memory_efficient_backward=False,
        threshold=6.0,
        weight_data=None,
        bias_data=None
    ):
        super(Linear8bit, self).__init__(
            input_features, output_features, bias
        )
        self.state = bnb.MatmulLtState()
        # Replace the freshly initialized bias with the original module's bias (may be None).
        self.bias = bias_data
        self.state.threshold = threshold
        self.state.has_fp16_weights = has_fp16_weights
        self.state.memory_efficient_backward = memory_efficient_backward
        if threshold > 0.0 and not has_fp16_weights:
            self.state.use_pool = True

        # SCB holds the per-row quantization scales; quant() fills it in.
        self.register_parameter("SCB", nn.Parameter(torch.empty(0), requires_grad=False))
        self.weight = weight_data
        self.quant()

    def quant(self):
        # Quantize the fp16 weight into int8 values (CB) plus per-row scales (SCB).
        weight = self.weight.data.contiguous().half().cuda()
        CB, _, SCB, _, _ = bnb.functional.double_quant(weight)
        delattr(self, "weight")
        setattr(self, "weight", nn.Parameter(CB, requires_grad=False))
        delattr(self, "SCB")
        setattr(self, "SCB", nn.Parameter(SCB, requires_grad=False))
        del weight

    def forward(self, x):
        self.state.is_training = self.training

        if self.bias is not None and self.bias.dtype != torch.float16:
            self.bias.data = self.bias.data.half()

        # Hand the int8 weight and its scales to bitsandbytes for the matmul.
        self.state.CB = self.weight.data
        self.state.SCB = self.SCB.data
        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
        # Free the layout-transformed weight copy cached by bnb.matmul.
        del self.state.CxB
        return out


def replace_module(model):
    """Recursively swap nn.Linear layers (except those named 'out_proj') for Linear8bit."""
    for name, module in model.named_children():
        if len(list(module.children())) > 0:
            replace_module(module)

        if isinstance(module, nn.Linear) and "out_proj" not in name:
            model._modules[name] = Linear8bit(
                input_features=module.in_features,
                output_features=module.out_features,
                threshold=6.0,
                weight_data=module.weight,
                bias_data=module.bias,
            )
    return model


def getModelSize(model):
    """Print and return the total size of the model's parameters and buffers in MB."""
    param_size = 0
    param_sum = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
        param_sum += param.nelement()
    buffer_size = 0
    buffer_sum = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
        buffer_sum += buffer.nelement()
    all_size = (param_size + buffer_size) / 1024 / 1024
    print('Model Size: {:.3f}MB'.format(all_size))
    return (param_size, param_sum, buffer_size, buffer_sum, all_size)
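
A minimal usage sketch for the two helpers above, assuming a CUDA device, an installed bitsandbytes, and that this new file is importable as utils (as the two scripts do); the toy MLP only stands in for the real UNet and text-encoder modules:

import torch
import torch.nn as nn
from utils import replace_module, getModelSize

toy = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024)).cuda()
getModelSize(toy)            # size with fp32 weights

toy = replace_module(toy)    # nn.Linear layers become Linear8bit
getModelSize(toy)            # roughly a quarter of the fp32 size

x = torch.randn(2, 1024, device="cuda", dtype=torch.float16)
with torch.no_grad():
    out = toy(x)
print(out.dtype, out.shape)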