diff --git a/data/mltable/test/MLTable b/data/mltable/test/MLTable new file mode 100644 index 0000000..5996177 --- /dev/null +++ b/data/mltable/test/MLTable @@ -0,0 +1,6 @@ +$schema: http://azureml/sdk-2-0/MLTable.json +type: mltable +paths: + - file: ./nyc_taxi_dataset_test.parquet +transformations: + - read_parquet \ No newline at end of file diff --git a/data/mltable/test/nyc_taxi_dataset_test.parquet b/data/mltable/test/nyc_taxi_dataset_test.parquet new file mode 100644 index 0000000..c8271f7 Binary files /dev/null and b/data/mltable/test/nyc_taxi_dataset_test.parquet differ diff --git a/data/mltable/train/MLTable b/data/mltable/train/MLTable new file mode 100644 index 0000000..4f1e278 --- /dev/null +++ b/data/mltable/train/MLTable @@ -0,0 +1,6 @@ +$schema: http://azureml/sdk-2-0/MLTable.json +type: mltable +paths: + - file: ./nyc_taxi_dataset_train.parquet +transformations: + - read_parquet \ No newline at end of file diff --git a/data/mltable/train/nyc_taxi_dataset_train.parquet b/data/mltable/train/nyc_taxi_dataset_train.parquet new file mode 100644 index 0000000..409a574 Binary files /dev/null and b/data/mltable/train/nyc_taxi_dataset_train.parquet differ diff --git a/notebooks/create-responsible-ai-dashboard.ipynb b/notebooks/create-responsible-ai-dashboard.ipynb new file mode 100644 index 0000000..694bfa7 --- /dev/null +++ b/notebooks/create-responsible-ai-dashboard.ipynb @@ -0,0 +1,510 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "gather": { + "logged": 1672477533184 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# 実行する AML workspace の情報を設定\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"\n", + "\n", + "rai_diabetes_regression_example_version_string = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "gather": { + "logged": 1672477533396 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import json\n", + "from azure.ai.ml import dsl, Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "gather": { + "logged": 1672477533664 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MLClient(credential=,\n", + " subscription_id=27a05524-7a59-453b-88d7-df3ebaa2bcc1,\n", + " resource_group_name=azuremlytate081922,\n", + " workspace_name=azuremlwsytate081922)\n" + ] + } + ], + "source": [ + "credential = DefaultAzureCredential()\n", + "ml_client = MLClient(\n", + " credential=credential,\n", + " subscription_id=subscription_id,\n", + " resource_group_name=resource_group,\n", + " workspace_name=workspace,\n", + ")\n", + "print(ml_client)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "gather": { + "logged": 1672477534177 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MLClient(credential=,\n", + " subscription_id=6c6683e9-e5fe-4038-8519-ce6ebec2ba15,\n", + " resource_group_name=registry-builtin-prod-eastus-01,\n", + " workspace_name=None)\n" + ] + } + ], + "source": [ + "# RAI 組み込みコンポーネントの azureml レジストリへのハンドルを取得\n", + "registry_name = \"azureml\"\n", + "ml_client_registry = MLClient(\n", + " credential=credential,\n", + " subscription_id=subscription_id,\n", + " resource_group_name=resource_group,\n", + " registry_name=registry_name,\n", + ")\n", + "print(ml_client_registry)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "gather": { + "logged": 1672477534382 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import Data\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "train_data_path = \"../data/mltable/train/\"\n", + "test_data_path = \"../data/mltable/test/\"\n", + "input_train_data = \"nyc_taxi_train_mltable_dataset\"\n", + "input_test_data = \"nyc_taxi_test_mltable_dataset\"\n", + "\n", + "# train, test のデータアセットがなければ作成\n", + "try:\n", + " train_data = ml_client.data.get(\n", + " name=input_train_data, version=rai_diabetes_regression_example_version_string\n", + " )\n", + " test_data = ml_client.data.get(\n", + " name=input_test_data, version=rai_diabetes_regression_example_version_string\n", + " )\n", + "except Exception as e:\n", + " train_data = Data(\n", + " path=train_data_path,\n", + " type=AssetTypes.MLTABLE,\n", + " description=\"RAI diabetes regression example training data\",\n", + " name=input_train_data,\n", + " version=rai_diabetes_regression_example_version_string,\n", + " )\n", + " ml_client.data.create_or_update(train_data)\n", + "\n", + " test_data = Data(\n", + " path=test_data_path,\n", + " type=AssetTypes.MLTABLE,\n", + " description=\"RAI diabetes regression example test data\",\n", + " name=input_test_data,\n", + " version=rai_diabetes_regression_example_version_string,\n", + " )\n", + " ml_client.data.create_or_update(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1672469603345 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "gather": { + "logged": 1672477534500 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "model_name = \"nyc_taxi_mlflow_rai\"\n", + "compute_name = \"cpu-cluster\"\n", + "target_column_name = \"totalAmount\"\n", + "model_name_suffix = 1\n", + "expected_model_id = f\"{model_name}:{model_name_suffix}\"\n", + "azureml_model_id = f\"azureml:{expected_model_id}\"\n", + "programmers_train_mltable = Input(\n", + " type=\"mltable\",\n", + " path=f\"azureml:nyc_taxi_train_mltable_dataset:1\",\n", + " mode=\"download\",\n", + ")\n", + "programmers_test_mltable = Input(\n", + " type=\"mltable\",\n", + " path=f\"azureml:nyc_taxi_test_mltable_dataset:1\",\n", + " mode=\"download\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "gather": { + "logged": 1672477537043 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The current version of RAI built-in components is: 0.4.0\n" + ] + } + ], + "source": [ + "label = \"latest\"\n", + "\n", + "# 最新バージョンのコンポーネントを取得\n", + "rai_constructor_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_insight_constructor\", label=label\n", + ")\n", + "\n", + "version = rai_constructor_component.version\n", + "print(\"The current version of RAI built-in components is: \" + version)\n", + "\n", + "rai_explanation_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_explanation\", version=version\n", + ")\n", + "\n", + "rai_causal_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_causal\", version=version\n", + ")\n", + "\n", + "rai_counterfactual_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_counterfactual\", version=version\n", + ")\n", + "\n", + "rai_erroranalysis_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_erroranalysis\", version=version\n", + ")\n", + "\n", + "rai_gather_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_insight_gather\", version=version\n", + ")\n", + "\n", + "rai_scorecard_component = ml_client_registry.components.get(\n", + " name=\"microsoft_azureml_rai_tabular_score_card\", version=version\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "gather": { + "logged": 1672477537218 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# RAI パイプラインを構築\n", + "\n", + "categorical_columns = json.dumps(\n", + " [\"vendorID\", \"passengerCount\"]\n", + ")\n", + "treatment_features = json.dumps([\"lpepPickupDatetime\", \"tripDistance\", \"pickupLongitude\", \"pickupLatitude\", \"dropoffLongitude\", \"dropoffLatitude\"])\n", + "desired_range = json.dumps([0, 290])\n", + "filter_columns = json.dumps([])\n", + "\n", + "\n", + "@dsl.pipeline(\n", + " compute=compute_name,\n", + " experiment_name=f\"rai_pipeline_nyc_taxi\",\n", + ")\n", + "def rai_pipeline_nyc_taxi(\n", + " target_column_name,\n", + " train_data,\n", + " test_data\n", + "):\n", + " # RAIInsights を開始\n", + " create_rai_job = rai_constructor_component(\n", + " title=\"RAI Dashboard Example\",\n", + " task_type=\"regression\",\n", + " model_info=expected_model_id,\n", + " model_input=Input(type=AssetTypes.MLFLOW_MODEL, path=azureml_model_id),\n", + " train_dataset=train_data,\n", + " test_dataset=test_data,\n", + " target_column_name=target_column_name,\n", + " categorical_column_names=categorical_columns,\n", + " )\n", + " create_rai_job.set_limits(timeout=120)\n", + "\n", + " # モデルの説明を追加\n", + " explain_job = rai_explanation_component(\n", + " rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,\n", + " comment=\"My comment\",\n", + " )\n", + " explain_job.set_limits(timeout=120)\n", + "\n", + " # 因果分析を追加\n", + " causal_job = rai_causal_component(\n", + " rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,\n", + " treatment_features=treatment_features,\n", + " )\n", + " causal_job.set_limits(timeout=180)\n", + "\n", + " # 反事実分析を追加\n", + " counterfactual_job = rai_counterfactual_component(\n", + " rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,\n", + " total_cfs=10,\n", + " desired_range=desired_range,\n", + " )\n", + " counterfactual_job.set_limits(timeout=600)\n", + "\n", + " # エラー分析を追加\n", + " erroranalysis_job = rai_erroranalysis_component(\n", + " rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,\n", + " filter_features=filter_columns,\n", + " )\n", + " erroranalysis_job.set_limits(timeout=120)\n", + "\n", + " # すべてを組み合わせ\n", + " rai_gather_job = rai_gather_component(\n", + " constructor=create_rai_job.outputs.rai_insights_dashboard,\n", + " insight_1=explain_job.outputs.explanation,\n", + " insight_2=causal_job.outputs.causal,\n", + " insight_3=counterfactual_job.outputs.counterfactual,\n", + " insight_4=erroranalysis_job.outputs.error_analysis,\n", + " )\n", + " rai_gather_job.set_limits(timeout=120)\n", + "\n", + " rai_gather_job.outputs.dashboard.mode = \"upload\"\n", + " rai_gather_job.outputs.ux_json.mode = \"upload\"\n", + "\n", + " return {\n", + " \"dashboard\": rai_gather_job.outputs.dashboard,\n", + " \"ux_json\": rai_gather_job.outputs.ux_json\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "gather": { + "logged": 1672477537347 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "insights_pipeline_job = rai_pipeline_nyc_taxi(\n", + " target_column_name=target_column_name,\n", + " train_data=programmers_train_mltable,\n", + " test_data=programmers_test_mltable\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "gather": { + "logged": 1672477540535 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "created_job = ml_client.jobs.create_or_update(insights_pipeline_job)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/model/train.py b/src/model/train.py index 74cd85c..6d5f58f 100644 --- a/src/model/train.py +++ b/src/model/train.py @@ -1,4 +1,4 @@ -import argparse +import argparse import os import matplotlib.pyplot as plt @@ -84,6 +84,11 @@ def save_model(model, output_dir): os.makedirs(os.path.join(output_dir, "models"), exist_ok=True) mlflow.sklearn.save_model(model, os.path.join(output_dir, "models")) + # RAI ダッシュボード用のモデル登録 + mlflow.sklearn.log_model(sk_model=model, + registered_model_name="nyc_taxi_mlflow_rai", + artifact_path="models") + def main(args): # 自動ロギングの有効化