diff --git a/graph_net/sample_pass/resumable_sample_pass_mixin.py b/graph_net/sample_pass/resumable_sample_pass_mixin.py
index 804005988..7ffb4b2d6 100644
--- a/graph_net/sample_pass/resumable_sample_pass_mixin.py
+++ b/graph_net/sample_pass/resumable_sample_pass_mixin.py
@@ -45,7 +45,7 @@ def resumable_handle_sample(self, rel_model_path: str):
         self._inc_num_handled_models_or_exit()
 
     def _inc_num_handled_models_or_exit(self):
-        if self.config["limits_handled_models"] is None:
+        if self.config.get("limits_handled_models", None) is None:
             return
         self.num_handled_models += 1
         if self.num_handled_models >= self.config["limits_handled_models"]:
diff --git a/graph_net/tools/generate_subgraph_dataset.sh b/graph_net/tools/generate_subgraph_dataset.sh
index 121977f3f..457864db7 100755
--- a/graph_net/tools/generate_subgraph_dataset.sh
+++ b/graph_net/tools/generate_subgraph_dataset.sh
@@ -3,16 +3,18 @@ set -x
 
 MIN_SEQ_OPS=${1:-4}
 MAX_SEQ_OPS=${2:-64}
-GPU_ID=${3:-0}
+GPU_ID=${3:-5}
 OP_RANGE=$MIN_SEQ_OPS-$MAX_SEQ_OPS
 
 export CUDA_VISIBLE_DEVICES="${GPU_ID}"
+export PYTHONPATH=/work/GraphNet:/work/abstract_pass/Athena:$PYTHONPATH
 
 GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
 RESUME="true"
 
-DECOMPOSE_WORKSPACE=/tmp/subgraph_dataset_workspace
+#DECOMPOSE_WORKSPACE=/tmp/subgraph_dataset_workspace
+DECOMPOSE_WORKSPACE=/work/graphnet_test_workspace/subgraph_dataset_20260203
 DEVICE_REWRITED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/01_device_rewrited_samples
 DIMENSION_GENERALIZED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/02_dimension_generalized_samples
 OP_NAMES_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/03_sample_op_names
@@ -26,16 +28,24 @@ GROUPED_FUSIBLE_SUBGRAPH_RANGES_DIR=$DECOMPOSE_WORKSPACE/10_grouped_fusible_subg
 SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/11_dimension_generalized_fusible_subgraphs
 RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/12_renamed_dimension_generalized_fusible_subgraphs
 DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/13_deduplicated_dimension_generalized_fusible_subgraphs
-UNITTESTS_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/14_kernelbench_unittests
+DTYPE_GENERALIZED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/14_dtype_generalized_fusible_subgraphs
+UNITTESTS_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/15_kernelbench_unittests
 
 mkdir -p "$DECOMPOSE_WORKSPACE"
 
-model_list="$GRAPH_NET_ROOT/graph_net/config/small100_torch_samples_list.txt"
+model_list="$GRAPH_NET_ROOT/graph_net/config/torch_samples_list.txt"
 device_rewrited_sample_list=${DECOMPOSE_WORKSPACE}/device_rewrited_sample_list.txt
 range_decomposed_subgraph_list=${DECOMPOSE_WORKSPACE}/range_decomposed_subgraph_sample_list.txt
 deduplicated_subgraph_list=${DECOMPOSE_WORKSPACE}/deduplicated_subgraph_sample_list.txt
 dimension_generalized_subgraph_list=${DECOMPOSE_WORKSPACE}/dimension_generalized_subgraph_sample_list.txt
 deduplicated_fusible_subgraphs_list=${DECOMPOSE_WORKSPACE}/deduplicated_dimension_generalized_subgraph_sample_list.txt
+dtype_generalized_subgraphs_list=${DECOMPOSE_WORKSPACE}/dtype_generalized_subgraphs_sample_list.txt
+
+if [[ "$model_list" == *"torch_samples_list.txt" ]]; then
+    USE_SUBPROCESS_ARGS="--use-subprocess"
+else
+    USE_SUBPROCESS_ARGS=""
+fi
 
 function generate_generalized_subgraph_list() {
     local target_dir="$1"
@@ -84,7 +94,7 @@ EOF
 function dimension_generalizer(){
     echo ">>> [2] Apply dimension generalization for samples under ${device_rewrited_sample_list}."
     echo ">>>"
-    python3 -m graph_net.apply_sample_pass \
+    python3 -m graph_net.apply_sample_pass ${USE_SUBPROCESS_ARGS} \
     --model-path-list $device_rewrited_sample_list \
     --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/dimension_generalizer.py" \
     --sample-pass-class-name "ApplyDimGenPasses" \
@@ -104,7 +114,7 @@ EOF
 function generate_op_names() {
     echo ">>> [3] Generate op_names.txt for samples in ${model_list}."
     echo ">>>"
-    python3 -m graph_net.model_path_handler \
+    python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
     --model-path-list $model_list \
     --handler-config=$(base64 -w 0 <<EOF
[...]
     echo ">>> [5] Decompose according to subgraph_ranges.json for samples in ${device_rewrited_sample_list}."
     echo ">>>"
-    python3 -m graph_net.model_path_handler \
+    python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
     --model-path-list "$device_rewrited_sample_list" \
     --handler-config=$(base64 -w 0 <<EOF
[...]
         echo ">>> Generating dimension generalized subgraph variant index: ${index}"
         dimension_generalized_sample_list="${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index}/dimension_generalized_sample_list.txt"
-        generate_subgraph_list ${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} ${dimension_generalized_samples_list}
-        python3 -m graph_net.model_path_handler \
+        generate_subgraph_list ${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} ${dimension_generalized_sample_list}
+        python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
             --model-path-list "${dimension_generalized_sample_list}" \
             --handler-config $(base64 -w 0 <<EOF
[...]
+function dtype_generalizer() {
+    echo ">>> [12] Data type generalizer for samples under ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}."
+    echo ">>>"
+    python3 -m graph_net.apply_sample_pass \
+    --model-path-list $deduplicated_fusible_subgraphs_list \
+    --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/dtype_generalizer.py" \
+    --sample-pass-class-name ApplyDataTypeGeneralizationPasses \
+    --sample-pass-config $(base64 -w 0 <<EOF
[...]
-    echo ">>> [12] Generate unittests for subgraph samples under ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}. "
+    echo ">>> [13] Generate unittests for subgraph samples under ${DTYPE_GENERALIZED_OUTPUT_DIR}. "
     echo ">>>"
     python3 -m graph_net.model_path_handler \
-    --model-path-list ${deduplicated_fusible_subgraphs_list} \
+    --model-path-list ${dtype_generalized_subgraphs_list} \
     --handler-config=$(base64 -w 0 <<EOF
[...]
     [...] 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_remove_duplicate_dimension_generalized_subgraphs_${suffix}.txt
     generate_generalized_subgraph_list ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} ${deduplicated_fusible_subgraphs_list}
 
+    # dtype generalization
+    dtype_generalizer 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_dtype_generalizer_${suffix}.txt
+    generate_generalized_subgraph_list ${DTYPE_GENERALIZED_OUTPUT_DIR} ${dtype_generalized_subgraphs_list}
+
     # generate kernelbench format unittest
     generate_unittests 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_unittests_${suffix}.txt
 }
 
+summary() {
+    num_original_samples=`cat $model_list | grep "^samples/" | wc -l`
+    echo "Number of original GraphNet samples: $num_original_samples"
+
+    num_device_rewrited_samples=`find ${DEVICE_REWRITED_OUTPUT_DIR} -name "model.py" | wc -l`
+    device_rewrited_succeeded_percent=$(( num_device_rewrited_samples * 100 / num_original_samples ))
+    echo "- [Step 1] device rewrite: succeeded=${num_device_rewrited_samples}, percent=${device_rewrited_succeeded_percent}%"
+
+    num_succeeded_dimension_generalized_samples=`find ${DIMENSION_GENERALIZED_OUTPUT_DIR} -name "model.py" | wc -l`
+    dimension_generalized_samples_succeeded_percent=$(( num_succeeded_dimension_generalized_samples * 100 / (num_original_samples * 9) ))
+    echo "- [Step 2] dimension generalization: succeeded=${num_succeeded_dimension_generalized_samples}, percent=${dimension_generalized_samples_succeeded_percent}%"
+    for index in {0..8}; do
+        num_succeeded_dimension_generalized_samples=`find ${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} -name "model.py" | wc -l`
+        dimension_generalized_samples_succeeded_percent=$(( num_succeeded_dimension_generalized_samples * 100 / num_original_samples ))
+        echo "    ${index}, succeeded=${num_succeeded_dimension_generalized_samples}, percent=${dimension_generalized_samples_succeeded_percent}%"
+    done
+    echo ""
+
+    num_succeeded_op_names=`find ${OP_NAMES_OUTPUT_DIR} -name op_names.txt | wc -l`
+    op_names_succeeded_percent=$(( num_succeeded_op_names * 100 / num_original_samples ))
+    echo "- [Step 3] generate op names: succeeded=${num_succeeded_op_names}, percent=${op_names_succeeded_percent}%"
+
+    num_typical_subgraph_ranges=`find ${SUBGRAPH_RANGES_JSON_ROOT} -name typical_subgraph_ranges.json | wc -l`
+    typical_subgraph_ranges_succeeded_percent=$(( num_typical_subgraph_ranges * 100 / num_original_samples ))
+    echo "- [Step 4] generate typical subgraph ranges: succeeded=${num_typical_subgraph_ranges}, percent=${typical_subgraph_ranges_succeeded_percent}%"
+
+    num_succeeded_range_decomposed_subgraphs=`find ${RANGE_DECOMPOSE_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step 5] range decompose: succeeded=${num_succeeded_range_decomposed_subgraphs}"
+
+    num_renamed_subgraphs=`find ${GRAPH_VAR_RENAME_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step 6] rename: succeeded=${num_renamed_subgraphs}"
+
+    num_deduplicated_subgraphs=`find ${DEDUPLICATED_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step 7] remove duplicated: succeeded=${num_deduplicated_subgraphs}"
+
+    num_succeeded_cumsum_kernels_subgraphs=`find ${CUMSUM_NUM_KERNELS_DIR} -name "cumsum_num_kernels.json" | wc -l`
+    cumsum_kernels_succeeded_percent=$(( num_succeeded_cumsum_kernels_subgraphs * 100 / num_deduplicated_subgraphs ))
+    echo "- [Step 8] cumsum kernels: succeeded=${num_succeeded_cumsum_kernels_subgraphs}, percent=${cumsum_kernels_succeeded_percent}%"
+
+    num_fusible_subgraph_ranges=`find ${FUSIBLE_SUBGRAPH_RANGES_DIR} -name "fusible_subgraph_ranges.json" | wc -l`
+    num_grouped_fusible_subgraph_ranges=`find ${GROUPED_FUSIBLE_SUBGRAPH_RANGES_DIR} -name "grouped_fusible_subgraph_ranges.json" | wc -l`
+    echo "    fusible subgraph ranges: succeeded=${num_fusible_subgraph_ranges}"
+    echo "    grouped fusible subgraph ranges: succeeded=${num_grouped_fusible_subgraph_ranges}"
+    echo ""
+
+    num_succeeded_dimension_generalized_subgraphs=`find ${SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step 9] subgraph dimension generalization: succeeded=${num_succeeded_dimension_generalized_subgraphs}"
+    for index in {0..8}; do
+        num_succeeded_dimension_generalized_subgraphs=`find ${SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, succeeded=${num_succeeded_dimension_generalized_subgraphs}"
+    done
+    echo ""
+
+    num_renamed_fusible_subgraphs=`find ${RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} -name "model.py" | wc -l`
+    echo "- [Step 10] rename: succeeded=${num_renamed_fusible_subgraphs}"
+    for index in {0..8}; do
+        num_renamed_fusible_subgraphs_index=`find ${RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, succeeded=${num_renamed_fusible_subgraphs_index}"
+    done
+    echo ""
+
+    num_deduplicated_fusible_subgraphs=`find ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} -name "model.py" | wc -l`
+    echo "- [Step 11] remove duplicated: succeeded=${num_deduplicated_fusible_subgraphs}"
+    for index in {0..8}; do
+        num_deduplicated_fusible_subgraphs_index=`find ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, succeeded=${num_deduplicated_fusible_subgraphs_index}"
+    done
+    echo ""
+
+    num_succeeded_unittests=`find ${UNITTESTS_OUTPUT_DIR} -name "*_test.py" | wc -l`
+    unittest_succeeded_percent=$(( num_succeeded_unittests * 100 / num_deduplicated_fusible_subgraphs ))
+    echo "- [Step 13] generate unittest: succeeded=${num_succeeded_unittests}, percent=${unittest_succeeded_percent}%"
+    for index in {0..8}; do
+        num_succeeded_unittests=`find ${UNITTESTS_OUTPUT_DIR}/${index} -name "*_test.py" | wc -l`
+        echo "    ${index}, succeeded=${num_succeeded_unittests}"
+    done
+}
+
 main
+
+set +x
+summary 2>&1 | tee ${DECOMPOSE_WORKSPACE}/summary.txt
diff --git a/graph_net/torch/sample_pass/dtype_generalizer.py b/graph_net/torch/sample_pass/dtype_generalizer.py
index 48c06803a..13a246244 100644
--- a/graph_net/torch/sample_pass/dtype_generalizer.py
+++ b/graph_net/torch/sample_pass/dtype_generalizer.py
@@ -17,6 +17,7 @@ from pathlib import Path
 from typing import Any, Dict, List
 
+import torch
 import torch.fx as fx
 
 from graph_net.graph_net_json_file_util import (
@@ -236,9 +237,9 @@ class ApplyDataTypeGeneralizationPasses(SamplePass, ResumableSamplePassMixin):
         "output_dir": "/path/to/output",
         "model_path_prefix": "",
         "model_runnable_predicator_filepath": "...",
-        "resume": <bool>,
-        "limits_handled_models": <int>,
-        "try_run": <bool>,
+        "resume": true,
+        "limits_handled_models": null,
+        "try_run": true,
     }
     """
@@ -268,6 +269,7 @@ def declare_config(
         output_dir: str,
         model_path_prefix: str,
         model_runnable_predicator_filepath: str,
+        device: str = "auto",
         resume: bool = False,
         limits_handled_models: int = None,
         try_run: bool = True,
@@ -281,6 +283,13 @@ def _make_model_runnable_predicator(self, config: Dict[str, Any]):
         predicator_config = self.model_runnable_predicator_config
         return cls(predicator_config)
 
+    def _choose_device(self, device) -> str:
+        if device is None:
+            return None
+        if device in ["cpu", "cuda"]:
+            return device
+        return "cuda" if torch.cuda.is_available() else "cpu"
+
     def sample_handled(self, rel_model_path: str) -> bool:
         model_path = Path(self.config["model_path_prefix"]) / rel_model_path
         dtype_pass_names = self._read_dtype_pass_names(model_path)
@@ -320,7 +329,9 @@ def resume(self, rel_model_path: str) -> List[str]:
             return []
 
         # Parse the computation graph
-        traced_model = parse_immutable_model_path_into_sole_graph_module(abs_model_path)
+        traced_model = parse_immutable_model_path_into_sole_graph_module(
+            abs_model_path, device=self._choose_device(self.config["device"])
+        )
 
         # Copy the originl sample
         files_copied = [
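
Note on the resumable_sample_pass_mixin.py change: switching to config.get("limits_handled_models", None) lets the limit check tolerate configs that omit the key entirely, instead of raising KeyError; a missing key now behaves like an explicit null (no limit). A minimal sketch of the difference, using a hypothetical config dict:

    config = {}  # "limits_handled_models" not set

    # Before the patch, the subscript lookup raises KeyError:
    #     config["limits_handled_models"]
    # After the patch, a missing key reads as None, so the limit
    # bookkeeping is skipped and the pass never exits early:
    if config.get("limits_handled_models", None) is None:
        print("no limit configured")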
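Note on summary(): each step reports an integer percentage via shell arithmetic, e.g. $(( succeeded * 100 / total )), which aborts with a division-by-zero error whenever a step produced no samples. A guarded sketch of the same computation (the percent helper is hypothetical, not part of the patch):

    def percent(succeeded: int, total: int) -> int:
        # Mirrors $(( succeeded * 100 / total )) with integer division,
        # but returns 0 when total is 0 instead of erroring out.
        return succeeded * 100 // total if total > 0 else 0

    print(percent(42, 100))  # 42
    print(percent(3, 0))     # 0, rather than a division-by-zero failure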
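Note on dtype_generalizer.py: the new "device" config entry (default "auto") is resolved by _choose_device before the graph is parsed: None passes through unchanged, explicit "cpu"/"cuda" values are honored, and anything else falls back to runtime detection. A standalone sketch of that resolution logic (the cfg dict here is hypothetical):

    import torch

    def choose_device(device):
        # None keeps the parser's own default placement.
        if device is None:
            return None
        # Explicit devices are used as-is.
        if device in ["cpu", "cuda"]:
            return device
        # "auto" (or any other value) auto-detects.
        return "cuda" if torch.cuda.is_available() else "cpu"

    cfg = {"device": "auto"}
    print(choose_device(cfg["device"]))  # "cuda" on a GPU machine, else "cpu"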