microsoft · xieofxie · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/examples/stable_diffusion/.gitignore b/examples/stable_diffusion/.gitignore
@@ -1,2 +1,3 @@
 /footprints/
 /result_*.png
+/quantize_data/
diff --git a/examples/stable_diffusion/README.md b/examples/stable_diffusion/README.md
@@ -179,3 +179,27 @@
 Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones:
 - `--image_path <str>`: the input image path for image to image inference.
 - `--img_to_img_example`: image to image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`.
+
+## Stable Diffusion Optimization with QDQ for QNN EP
+
+### Generate data for static quantization
+
+To get better results, we need to generate real data from the original model instead of using random data for static quantization.
+
+First, generate the unoptimized ONNX model (this also generates an optimized model using random data):
+
+`python stable_diffusion.py --model_id stabilityai/stable-diffusion-2-1-base --provider qnn --optimize --use_random_data --data_num 1` 
+
+Then generate the data (repeating this step with different prompts to generate more data will give better results):
+
+`python stable_diffusion.py --model_id stabilityai/stable-diffusion-2-1-base --provider qnn --generate_data --num_inference_steps 5 --seed 0 --test_unoptimized --prompt "hamburger swims in the river"` 
+
+### Optimize
+
+`python stable_diffusion.py --model_id stabilityai/stable-diffusion-2-1-base --provider qnn --optimize --clean_cache`
+
+### Test
+
+Optionally, first run the command below with `--test_unoptimized` added to generate output from the original model for comparison.
+
+`python stable_diffusion.py --model_id stabilityai/stable-diffusion-2-1-base --provider qnn --num_inference_steps 5 --guidance_scale 7.5 --prompt "cat and dog" --seed 0`
diff --git a/examples/stable_diffusion/config_text_encoder.json b/examples/stable_diffusion/config_text_encoder.json
@@ -23,6 +23,12 @@
             "user_script": "user_script.py",
             "load_dataset_config": { "type": "local_dataset" },
             "dataloader_config": { "type": "text_encoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "text_encoder_quantize_data_loader", "batch_size": 1 }
         }
     ],
     "evaluators": {
@@ -38,7 +44,7 @@
         }
     },
     "passes": {
-        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
         "ov_convert": {
             "type": "OpenVINOConversion",
             "user_script": "user_script.py",
@@ -83,6 +89,27 @@
             "float16": true,
             "use_gpu": true,
             "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "batch", "sequence" ],
+            "dim_value": [ 1, 77 ]
+        },
+        "qnn_preprocess": {
+            "type": "QNNPreprocess",
+            "fuse_layernorm": true
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "QUInt16",
+            "weight_type": "QUInt8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true,
+            "prepare_qnn_config": true,
+            "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Reshape", "Transpose", "Mul", "Gather", "Gelu", "Flatten", "ArgMax" ],
+            "append_first_op_types_to_quantize_list": false,
+            "nodes_to_exclude": [ "Add", "Softmax" ]
         }
     },
     "pass_flows": [ [ "convert", "optimize" ] ],

diff --git a/examples/stable_diffusion/config_unet.json b/examples/stable_diffusion/config_unet.json
@@ -32,6 +32,12 @@
             "user_script": "user_script.py",
             "load_dataset_config": { "type": "local_dataset" },
             "dataloader_config": { "type": "unet_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "unet_quantize_data_loader", "batch_size": 1 }
         }
     ],
     "evaluators": {
@@ -49,7 +55,7 @@
     "passes": {
         "convert": {
             "type": "OnnxConversion",
-            "target_opset": 14,
+            "target_opset": 17,
             "save_as_external_data": true,
             "all_tensors_to_one_file": true,
             "external_data_name": "weights.pb"
@@ -98,6 +104,24 @@
             "float16": true,
             "use_gpu": true,
             "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "unet_sample_batch", "unet_sample_channels", "unet_sample_height", "unet_sample_width", "unet_time_batch", "unet_hidden_batch", "unet_hidden_sequence" ],
+            "dim_value": [ 1, 4, 64, 64, 1, 1, 77 ]
+        },
+        "qnn_preprocess": {
+            "type": "QNNPreprocess",
+            "fuse_layernorm": true
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "QUInt16",
+            "weight_type": "QUInt8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true,
+            "prepare_qnn_config": true
         }
     },
     "pass_flows": [ [ "convert", "optimize" ] ],

diff --git a/examples/stable_diffusion/config_vae_decoder.json b/examples/stable_diffusion/config_vae_decoder.json
@@ -30,6 +30,12 @@
             "user_script": "user_script.py",
             "load_dataset_config": { "type": "local_dataset" },
             "dataloader_config": { "type": "vae_decoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_decoder_quantize_data_loader", "batch_size": 1 }
         }
     ],
     "evaluators": {
@@ -45,7 +51,7 @@
         }
     },
     "passes": {
-        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
         "ov_convert": {
             "type": "OpenVINOConversion",
             "user_script": "user_script.py",
@@ -90,6 +96,24 @@
             "float16": true,
             "use_gpu": true,
             "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "decoder_batch", "decoder_channels", "decoder_height", "decoder_width" ],
+            "dim_value": [ 1, 4, 64, 64 ]
+        },
+        "qnn_preprocess": {
+            "type": "QNNPreprocess",
+            "fuse_layernorm": true
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "QUInt16",
+            "weight_type": "QUInt8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true,
+            "prepare_qnn_config": true
         }
     },
     "pass_flows": [ [ "convert", "optimize" ] ],

diff --git a/examples/stable_diffusion/config_vae_encoder.json b/examples/stable_diffusion/config_vae_encoder.json
@@ -25,6 +25,12 @@
             "user_script": "user_script.py",
             "load_dataset_config": { "type": "local_dataset" },
             "dataloader_config": { "type": "vae_encoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_encoder_quantize_data_loader", "batch_size": 1 }
         }
     ],
     "evaluators": {
@@ -40,7 +46,7 @@
         }
     },
     "passes": {
-        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
         "ov_convert": {
             "type": "OpenVINOConversion",
             "user_script": "user_script.py",
@@ -85,6 +91,24 @@
             "float16": true,
             "use_gpu": true,
             "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "encoder_batch", "encoder_channels", "encoder_height", "encoder_width", "Addlatent_sample_dim_0", "Addlatent_sample_dim_1", "Addlatent_sample_dim_2", "Addlatent_sample_dim_3" ],
+            "dim_value": [ 1, 3, 512, 512, 1, 4, 64, 64 ]
+        },
+        "qnn_preprocess": {
+            "type": "QNNPreprocess",
+            "fuse_layernorm": true
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "QUInt16",
+            "weight_type": "QUInt8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true,
+            "prepare_qnn_config": true
         }
     },
     "pass_flows": [ [ "convert", "optimize" ] ],

diff --git a/examples/stable_diffusion/sd_utils/config.py b/examples/stable_diffusion/sd_utils/config.py
@@ -6,3 +6,6 @@
 vae_sample_size = 512
 unet_sample_size = 64
 cross_attention_dim = 768
+rand_data = True
+data_dir = "quantize_data"
+data_num = 10
diff --git a/examples/stable_diffusion/sd_utils/ort.py b/examples/stable_diffusion/sd_utils/ort.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
+
 import json
 import shutil
 import sys
@@ -63,7 +64,7 @@
         for footprint in footprints.values():
             if footprint["from_pass"] == "OnnxConversion":
                 conversion_footprint = footprint
-            elif footprint["from_pass"] == "OrtTransformersOptimization":
+            elif footprint["from_pass"] == "OrtTransformersOptimization" or footprint["from_pass"] == "OnnxStaticQuantization":
                 optimizer_footprint = footprint
 
         assert conversion_footprint
@@ -75,7 +76,7 @@
        model_info[submodel_name] = {
            "unoptimized": {
                "path": Path(unoptimized_olive_model.model_path),
            },
            "optimized": {
                "path": Path(optimized_olive_model.model_path),
            },
@@ -138,7 +139,7 @@
     unet_sample_size = config.unet_sample_size
 
     if static_dims:
-        hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2
+        hidden_batch_size = batch_size if (guidance_scale <= 1.0) else batch_size * 2
         # Not necessary, but helps DML EP further optimize runtime performance.
         # batch_size is doubled for sample & hidden state because of classifier free guidance:
         # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672