
Commit a29573a (authored Jan 8, 2025)

add quantized model save and load for SD & LCM (#2588)

1 parent 0491221

File tree: 10 files changed, +407 −373 lines changed
models_v2/pytorch/LCM/inference/cpu/README.md (+6)

````diff
@@ -62,6 +62,12 @@ bash download_dataset.sh
 | **BATCH_SIZE** (optional) | `export BATCH_SIZE=<set a value for batch size, else it will run with default batch size>` |
 | **TORCH_INDUCTOR** (optional) | `export TORCH_INDUCTOR=< 0 or 1> (Compile model with PyTorch Inductor backend)` |
 
+* NOTE:
+For `compile-inductor` mode, please do calibration to get quantized model before running `INT8-BF16` or `INT8-FP32`.
+```
+bash do_calibration.sh
+```
+
 8. Run `run_model.sh`
 
 ## Output
````
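
Taken together with run_model.sh further down, the new note implies a two-step flow. A minimal sketch, assuming the environment variables defined in this commit's scripts (`DATASET_DIR`, `OUTPUT_DIR`, `INT8_MODEL`, `PRECISION`, `RUN_MODE`); the placeholder paths are illustrative:

```bash
# Sketch of the compile-inductor INT8 flow; paths are placeholders.
export DATASET_DIR=<path to the downloaded dataset>
export OUTPUT_DIR=<path for logs and outputs>
export INT8_MODEL=quantized_model.pt2     # default name used by do_calibration.sh

# Step 1: one-time calibration quantizes the model and saves it to ${INT8_MODEL}.
bash do_calibration.sh

# Step 2: INT8 inference loads the saved quantized model instead of re-quantizing.
export PRECISION=int8-bf16                # or int8-fp32
export RUN_MODE=compile-inductor
bash run_model.sh
```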

models_v2/pytorch/LCM/inference/cpu/diffusers.patch (+23 −14)

```diff
@@ -1,5 +1,5 @@
 diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
-index 24abf54d..3fa7df5f 100644
+index 24abf54d6..3fa7df5f3 100644
 --- a/src/diffusers/models/transformer_2d.py
 +++ b/src/diffusers/models/transformer_2d.py
 @@ -385,7 +385,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
@@ -21,7 +21,7 @@ index 24abf54d..3fa7df5f 100644
  output = hidden_states + residual
  elif self.is_input_vectorized:
 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
-index f248b243..27d4802d 100644
+index f248b243f..7c83d2cf5 100644
 --- a/src/diffusers/models/unet_2d_condition.py
 +++ b/src/diffusers/models/unet_2d_condition.py
 @@ -799,8 +799,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
@@ -34,8 +34,17 @@ index f248b243..27d4802d 100644
  attention_mask: Optional[torch.Tensor] = None,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+@@ -808,7 +808,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+- return_dict: bool = True,
++ return_dict: bool = False,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
+ r"""
+ The [`UNet2DConditionModel`] forward method.
 diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
-index ff5eea2d..10ea4af1 100644
+index ff5eea2d5..8a9461c87 100644
 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 @@ -701,17 +701,33 @@ class LatentConsistencyModelPipeline(
@@ -58,16 +67,16 @@ index ff5eea2d..10ea4af1 100644
 + model_pred = self.traced_unet(
 + latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+ timestep_cond=w_embedding.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision),
++ w_embedding.to(dtype=self.precision)
++ )[0]
 + elif hasattr(self, 'precision'):
 + model_pred = self.unet(
 + latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+ timestep_cond=w_embedding.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision),
++ w_embedding.to(dtype=self.precision)
++ )[0]
 + else:
 + model_pred = self.unet(
 + latents,
@@ -91,7 +100,7 @@ index ff5eea2d..10ea4af1 100644
  if not output_type == "latent":
  image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
 diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
-index 9911cbe7..98c7f2ab 100644
+index 9911cbe75..a4e7101e3 100644
 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 @@ -832,19 +832,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
@@ -116,14 +125,14 @@ index 9911cbe7..98c7f2ab 100644
 + noise_pred = self.traced_unet(
 + latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision)
++ )[0]
 + elif hasattr(self, 'precision'):
 + noise_pred = self.unet(
 + latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision)
++ )[0]
 + else:
 + noise_pred = self.unet(
 + latent_model_input,
```
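
The recurring call-site change in this patch — keyword arguments plus `['sample']` becoming positional arguments plus `[0]` — follows from the `return_dict: bool = False` default it introduces: with `return_dict=False` the diffusers UNet returns a plain tuple instead of a `UNet2DConditionOutput`, and a traced or exported UNet only supports that positional, tuple-returning form. A minimal sketch of the pattern (the helper function is illustrative, not part of the patch):

```python
# Illustrative helper mirroring the patched call sites; the dtype casting and
# channels_last usage follow the patch, the function itself is hypothetical.
import torch

def predict_noise(unet, latents, t, prompt_embeds, precision=torch.bfloat16):
    return unet(
        latents.to(memory_format=torch.channels_last).to(dtype=precision),
        t,
        prompt_embeds.to(dtype=precision),  # positional: traced graphs take no keywords
    )[0]                                    # tuple output, was ['sample'] on the eager dict
```
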
models_v2/pytorch/LCM/inference/cpu/do_calibration.sh (new file, +56)

```bash
#!/usr/bin/env bash
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

MODEL_DIR=${MODEL_DIR-$PWD}

if [ ! -e "${MODEL_DIR}/inference.py" ]; then
    echo "Could not find the script of inference.py. Please set environment variable '\${MODEL_DIR}'."
    echo "From which the inference.py exist at the: \${MODEL_DIR}/inference.py"
    exit 1
fi

if [ ! -d "${DATASET_DIR}" ]; then
    echo "The DATASET_DIR \${DATASET_DIR} does not exist"
    exit 1
fi

if [ -z "${OUTPUT_DIR}" ]; then
    echo "The required environment variable OUTPUT_DIR has not been set"
    exit 1
fi

INT8_MODEL=${INT8_MODEL:-"quantized_model.pt2"}

mkdir -p ${OUTPUT_DIR}

export DNNL_PRIMITIVE_CACHE_CAPACITY=1024
export KMP_BLOCKTIME=200
export KMP_AFFINITY=granularity=fine,compact,1,0

export TORCHINDUCTOR_FREEZING=1
export TORCHINDUCTOR_CPP_ENABLE_TILING_HEURISTIC=0
export TORCHINDUCTOR_ENABLE_LINEAR_BINARY_FOLDING=1

python -m torch.backends.xeon.run_cpu --disable-numactl \
    --log_path ${OUTPUT_DIR} \
    ${MODEL_DIR}/inference.py \
    --model_name_or_path="SimianLuo/LCM_Dreamshaper_v7" \
    --dataset_path=${DATASET_DIR} \
    --quantized_model_path=${INT8_MODEL} \
    --compile_inductor \
    --precision=int8-bf16 \
    --calibration
```
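
The inference.py diffs are not rendered on this page, so for reference, here is a minimal, hypothetical sketch of the standard PT2E (PyTorch 2 export) quantization save path that a `.pt2` calibration artifact like `quantized_model.pt2` implies. The module, example inputs, and calibration loop are placeholders, not the repository's actual code; assumes PyTorch 2.5+:

```python
# Hypothetical sketch of a PT2E calibrate-and-save flow; not the repo's
# inference.py, whose diff is not rendered here.
import torch
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq

def calibrate_and_save(model, example_inputs, calib_batches, path="quantized_model.pt2"):
    # Capture the model as a graph suitable for PT2E quantization.
    gm = torch.export.export_for_training(model, example_inputs).module()

    # Insert observers using the x86 Inductor quantizer.
    quantizer = xiq.X86InductorQuantizer()
    quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())
    prepared = prepare_pt2e(gm, quantizer)

    # Calibration: run representative batches through the observed model.
    for batch in calib_batches:
        prepared(*batch)

    # Convert to a quantized model and save it as a .pt2 ExportedProgram.
    converted = convert_pt2e(prepared)
    ep = torch.export.export(converted, example_inputs)
    torch.export.save(ep, path)
```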

models_v2/pytorch/LCM/inference/cpu/inference.py (+98 −167)

(Large diffs are not rendered by default.)

models_v2/pytorch/LCM/inference/cpu/run_model.sh (+28 −2)

```diff
@@ -64,10 +64,36 @@ elif [ "${PRECISION}" == "fp16" ]; then
     ARGS="$ARGS --precision=fp16"
     echo "### running fp16 datatype"
 elif [ "${PRECISION}" == "int8-bf16" ]; then
-    ARGS="$ARGS --precision=int8-bf16 --configure-dir=conv_and_linear131.json"
+    ARGS="$ARGS --precision=int8-bf16"
+    if [ "${RUN_MODE}" == "ipex-jit" ]; then
+        ARGS="$ARGS --configure-dir=conv_and_linear131.json"
+    elif [ "${RUN_MODE}" == "compile-inductor" ]; then
+        if [ ! -f "${INT8_MODEL}" ]; then
+            echo "The required file INT8_MODEL does not exist"
+            exit 1
+        fi
+        ARGS="$ARGS --quantized_model_path=${INT8_MODEL}"
+    else
+        echo "For int8-bf16 datatype, the specified mode '${RUN_MODE}' is unsupported."
+        echo "Supported mode are: ipex-jit, compile-inductor"
+        exit 1
+    fi
     echo "### running int8-bf16 datatype"
 elif [ "${PRECISION}" == "int8-fp32" ]; then
-    ARGS="$ARGS --precision=int8-fp32 --configure-dir=conv_and_linear131.json"
+    ARGS="$ARGS --precision=int8-fp32"
+    if [ "${RUN_MODE}" == "ipex-jit" ]; then
+        ARGS="$ARGS --configure-dir=conv_and_linear131.json"
+    elif [ "${RUN_MODE}" == "compile-inductor" ]; then
+        if [ ! -f "${INT8_MODEL}" ]; then
+            echo "The required file INT8_MODEL does not exist"
+            exit 1
+        fi
+        ARGS="$ARGS --quantized_model_path=${INT8_MODEL}"
+    else
+        echo "For int8-fp32 datatype, the specified mode '${RUN_MODE}' is unsupported."
+        echo "Supported mode are: ipex-jit, compile-inductor"
+        exit 1
+    fi
     echo "### running int8-fp32 datatype"
 elif [ "${PRECISION}" == "bf32" ]; then
     ARGS="$ARGS --precision=bf32"
```

models_v2/pytorch/stable_diffusion/inference/cpu/README.md (+8 −5)

````diff
@@ -43,10 +43,6 @@ export DATASET_DIR=<directory where the dataset will be saved>
 bash download_dataset.sh
 ```
 
-### **NOTE**:Int8 model
-
-Please get a quant_model.pt before run INT8-BF16 model or INT8-FP32 model. Please refer the [link](https://github.com/intel/intel-extension-for-transformers/blob/v1.5/examples/huggingface/pytorch/text-to-image/quantization/qat/README.md).
-
 # Inference
 1. `git clone https://github.com/IntelAI/models.git`
 2. `cd models/models_v2/pytorch/stable_diffusion/inference/cpu`
@@ -61,7 +57,6 @@ Please get a quant_model.pt before run INT8-BF16 model or INT8-FP32 model. Pleas
 ```
 5. Install the latest CPU versions of [torch, torchvision and intel_extension_for_pytorch](https://intel.github.io/intel-extension-for-pytorch/index.html#installation)
 
-
 6. Setup required environment paramaters
 
 | **Parameter** | **export command** |
@@ -79,6 +74,14 @@ Please get a quant_model.pt before run INT8-BF16 model or INT8-FP32 model. Pleas
 | **LOCAL_BATCH_SIZE** (optional for DISTRIBUTED) | `export LOCAL_BATCH_SIZE=64` |
 7. Run `run_model.sh`
 
+* NOTE:
+Please get quantized model before running `INT8-BF16` or `INT8-FP32`.
+For `ipex-jit` mode, please refer the [link](https://github.com/intel/intel-extension-for-transformers/blob/v1.5/examples/huggingface/pytorch/text-to-image/quantization/qat/README.md).
+For `compile-inductor` mode, please do calibration first:
+```
+bash do_calibration.sh
+```
+
 ## Output
 
 Single-tile output will typically looks like:
````
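
The note distinguishes two INT8 paths, which the run_model.sh diff below selects via `MODE`. An illustrative setup for each; the variable names come from this commit's scripts, the paths are placeholders:

```bash
export PRECISION=int8-fp32                # or int8-bf16

# Option A: IPEX JIT, using a model quantized per the linked QAT recipe.
export MODE=ipex-jit
export INT8_MODEL=<path to the pre-quantized model>

# Option B: Inductor, using the artifact produced by bash do_calibration.sh.
export MODE=compile-inductor
export INT8_MODEL=quantized_model.pt2

bash run_model.sh
```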

models_v2/pytorch/stable_diffusion/inference/cpu/diffusers.patch (+23 −14)

```diff
@@ -1,5 +1,5 @@
 diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
-index 24abf54d..3fa7df5f 100644
+index 24abf54d6..3fa7df5f3 100644
 --- a/src/diffusers/models/transformer_2d.py
 +++ b/src/diffusers/models/transformer_2d.py
 @@ -385,7 +385,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
@@ -21,7 +21,7 @@ index 24abf54d..3fa7df5f 100644
  output = hidden_states + residual
  elif self.is_input_vectorized:
 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
-index f248b243..27d4802d 100644
+index f248b243f..7c83d2cf5 100644
 --- a/src/diffusers/models/unet_2d_condition.py
 +++ b/src/diffusers/models/unet_2d_condition.py
 @@ -799,8 +799,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
@@ -34,8 +34,17 @@ index f248b243..27d4802d 100644
  attention_mask: Optional[torch.Tensor] = None,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+@@ -808,7 +808,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+- return_dict: bool = True,
++ return_dict: bool = False,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
+ r"""
+ The [`UNet2DConditionModel`] forward method.
 diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
-index ff5eea2d..10ea4af1 100644
+index ff5eea2d5..8a9461c87 100644
 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 @@ -701,17 +701,33 @@ class LatentConsistencyModelPipeline(
@@ -58,16 +67,16 @@ index ff5eea2d..10ea4af1 100644
 + model_pred = self.traced_unet(
 + latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+ timestep_cond=w_embedding.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision),
++ w_embedding.to(dtype=self.precision)
++ )[0]
 + elif hasattr(self, 'precision'):
 + model_pred = self.unet(
 + latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+ timestep_cond=w_embedding.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision),
++ w_embedding.to(dtype=self.precision)
++ )[0]
 + else:
 + model_pred = self.unet(
 + latents,
@@ -91,7 +100,7 @@ index ff5eea2d..10ea4af1 100644
  if not output_type == "latent":
  image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
 diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
-index 9911cbe7..98c7f2ab 100644
+index 9911cbe75..a4e7101e3 100644
 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 @@ -832,19 +832,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
@@ -116,14 +125,14 @@ index 9911cbe7..98c7f2ab 100644
 + noise_pred = self.traced_unet(
 + latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision)
++ )[0]
 + elif hasattr(self, 'precision'):
 + noise_pred = self.unet(
 + latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 + t,
-+ encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+ )['sample']
++ prompt_embeds.to(dtype=self.precision)
++ )[0]
 + else:
 + noise_pred = self.unet(
 + latent_model_input,
```
models_v2/pytorch/stable_diffusion/inference/cpu/do_calibration.sh (new file, +55)

```bash
#!/usr/bin/env bash
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

MODEL_DIR=${MODEL_DIR-$PWD}

if [ ! -e "${MODEL_DIR}/inference.py" ]; then
    echo "Could not find the script of inference.py. Please set environment variable '\${MODEL_DIR}'."
    echo "From which the inference.py exist at the: \${MODEL_DIR}/inference.py"
    exit 1
fi

if [ ! -d "${DATASET_DIR}" ]; then
    echo "The DATASET_DIR \${DATASET_DIR} does not exist"
    exit 1
fi

if [ -z "${OUTPUT_DIR}" ]; then
    echo "The required environment variable OUTPUT_DIR has not been set"
    exit 1
fi

INT8_MODEL=${INT8_MODEL:-"quantized_model.pt2"}

mkdir -p ${OUTPUT_DIR}

export DNNL_PRIMITIVE_CACHE_CAPACITY=1024
export KMP_BLOCKTIME=200
export KMP_AFFINITY=granularity=fine,compact,1,0

export TORCHINDUCTOR_FREEZING=1
export TORCHINDUCTOR_CPP_ENABLE_TILING_HEURISTIC=0
export TORCHINDUCTOR_ENABLE_LINEAR_BINARY_FOLDING=1

python -m torch.backends.xeon.run_cpu --disable-numactl \
    --log_path ${OUTPUT_DIR} \
    ${MODEL_DIR}/inference.py \
    --dataset_path=${DATASET_DIR} \
    --quantized_model_path=${INT8_MODEL} \
    --compile_inductor \
    --precision=int8-bf16 \
    --calibration
```

models_v2/pytorch/stable_diffusion/inference/cpu/inference.py (+98 −167)

(Large diffs are not rendered by default.)

models_v2/pytorch/stable_diffusion/inference/cpu/run_model.sh (+12 −4)

```diff
@@ -116,22 +116,30 @@ elif [[ "${PRECISION}" == "fp16" ]]; then
     echo "### running fp16 datatype"
 elif [[ "${PRECISION}" == "int8-bf16" ]]; then
     ARGS="$ARGS --precision=int8-bf16"
-    if [ "${MODE}" == "ipex-jit" ]; then
+    if [[ "${MODE}" == "ipex-jit" || "${MODE}" == "compile-inductor" ]]; then
         if [ ! -f "${INT8_MODEL}" ]; then
             echo "The required file INT8_MODEL does not exist"
             exit 1
         fi
-        ARGS="$ARGS --int8_model_path=${INT8_MODEL}"
+        ARGS="$ARGS --quantized_model_path=${INT8_MODEL}"
+    else
+        echo "For int8-bf16 datatype, the specified mode '${MODE}' is unsupported."
+        echo "Supported mode are: ipex-jit, compile-inductor"
+        exit 1
     fi
     echo "### running int8-bf16 datatype"
 elif [[ "${PRECISION}" == "int8-fp32" ]]; then
     ARGS="$ARGS --precision=int8-fp32"
-    if [ "${MODE}" == "ipex-jit" ]; then
+    if [[ "${MODE}" == "ipex-jit" || "${MODE}" == "compile-inductor" ]]; then
         if [ ! -f "${INT8_MODEL}" ]; then
             echo "The required file INT8_MODEL does not exist"
             exit 1
         fi
-        ARGS="$ARGS --int8_model_path=${INT8_MODEL}"
+        ARGS="$ARGS --quantized_model_path=${INT8_MODEL}"
+    else
+        echo "For int8-fp32 datatype, the specified mode '${MODE}' is unsupported."
+        echo "Supported mode are: ipex-jit, compile-inductor"
+        exit 1
     fi
     echo "### running int8-fp32 datatype"
 elif [[ "${PRECISION}" == "bf32" ]]; then
```
