From 082a6067b8475e7f199d07b15172a2b5feb623a4 Mon Sep 17 00:00:00 2001 From: Deepti Mokkapati Date: Tue, 9 Feb 2021 15:12:19 -0800 Subject: [PATCH 1/4] update many model script to create child runs --- Custom_Script/scripts/train.py | 136 ++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 61 deletions(-) diff --git a/Custom_Script/scripts/train.py b/Custom_Script/scripts/train.py index bc641dc4..652ed855 100644 --- a/Custom_Script/scripts/train.py +++ b/Custom_Script/scripts/train.py @@ -59,67 +59,81 @@ def run(input_data): train = data[:-args.test_size] test = data[-args.test_size:] - # 3.0 Create and fit the forecasting pipeline - # The pipeline will drop unhelpful features, make a calendar feature, and make lag features - lagger = SimpleLagger(args.target_column, lag_orders=[1, 2, 3, 4]) - transform_steps = [('column_dropper', ColumnDropper(args.drop_columns)), - ('calendar_featurizer', SimpleCalendarFeaturizer()), ('lagger', lagger)] - forecaster = SimpleForecaster(transform_steps, LinearRegression(), args.target_column, args.timestamp_column) - forecaster.fit(train) - print('Featurized data example:') - print(forecaster.transform(train).head()) - - # 4.0 Get predictions on test set - forecasts = forecaster.forecast(test) - compare_data = test.assign(forecasts=forecasts).dropna() - - # 5.0 Calculate accuracy metrics for the fit - mse = mean_squared_error(compare_data[args.target_column], compare_data['forecasts']) - rmse = np.sqrt(mse) - mae = mean_absolute_error(compare_data[args.target_column], compare_data['forecasts']) - actuals = compare_data[args.target_column].values - preds = compare_data['forecasts'].values - mape = np.mean(np.abs((actuals - preds) / actuals) * 100) - - # 6.0 Log metrics - current_run.log(model_name + '_mse', mse) - current_run.log(model_name + '_rmse', rmse) - current_run.log(model_name + '_mae', mae) - current_run.log(model_name + '_mape', mape) - - # 7.0 Train model with full dataset - forecaster.fit(data) - - # 8.0 Save the forecasting pipeline - joblib.dump(forecaster, filename=os.path.join('./outputs/', model_name)) - - # 9.0 Register the model to the workspace - # Uses the values in the timeseries id columns from the first row of data to form tags for the model - current_run.upload_file(model_name, os.path.join('./outputs/', model_name)) - ts_id_dict = {id_col: str(data[id_col].iloc[0]) for id_col in args.timeseries_id_columns} - tags_dict = {**ts_id_dict, 'ModelType': args.model_type} - current_run.register_model(model_path=model_name, model_name=model_name, - model_framework=args.model_type, tags=tags_dict) - - # 10.0 Add data to output - end_datetime = datetime.datetime.now() - result.update(ts_id_dict) - result['model_type'] = args.model_type - result['file_name'] = file_name - result['model_name'] = model_name - result['start_date'] = str(start_datetime) - result['end_date'] = str(end_datetime) - result['duration'] = str(end_datetime-start_datetime) - result['mse'] = mse - result['rmse'] = rmse - result['mae'] = mae - result['mape'] = mape - result['index'] = idx - result['num_models'] = len(input_data) - result['status'] = current_run.get_status() - - print('ending (' + csv_file_path + ') ' + str(end_datetime)) - result_list.append(result) + try: + child_run = current_run.child_run(name=model_name) + + # 3.0 Create and fit the forecasting pipeline + # The pipeline will drop unhelpful features, make a calendar feature, and make lag features + lagger = SimpleLagger(args.target_column, lag_orders=[1, 2, 3, 4]) + transform_steps = [('column_dropper', ColumnDropper(args.drop_columns)), + ('calendar_featurizer', SimpleCalendarFeaturizer()), ('lagger', lagger)] + forecaster = SimpleForecaster(transform_steps, LinearRegression(), args.target_column, + args.timestamp_column) + forecaster.fit(train) + print('Featurized data example:') + print(forecaster.transform(train).head()) + + # 4.0 Get predictions on test set + forecasts = forecaster.forecast(test) + compare_data = test.assign(forecasts=forecasts).dropna() + + # 5.0 Calculate accuracy metrics for the fit + mse = mean_squared_error(compare_data[args.target_column], compare_data['forecasts']) + rmse = np.sqrt(mse) + mae = mean_absolute_error(compare_data[args.target_column], compare_data['forecasts']) + actuals = compare_data[args.target_column].values + preds = compare_data['forecasts'].values + mape = np.mean(np.abs((actuals - preds) / actuals) * 100) + + # 6.0 Log metrics + child_run.log(model_name + '_mse', mse) + child_run.log(model_name + '_rmse', rmse) + child_run.log(model_name + '_mae', mae) + child_run.log(model_name + '_mape', mape) + + # 7.0 Train model with full dataset + forecaster.fit(data) + + # import time + # time.sleep(180) + + # 8.0 Save the forecasting pipeline + joblib.dump(forecaster, filename=os.path.join('./outputs/', model_name)) + + # 9.0 Register the model to the workspace + # Uses the values in the timeseries id columns from the first row of data to form tags for the model + child_run.upload_file(model_name, os.path.join('./outputs/', model_name)) + ts_id_dict = {id_col: str(data[id_col].iloc[0]) for id_col in args.timeseries_id_columns} + tags_dict = {**ts_id_dict, 'ModelType': args.model_type} + tags_dict.update({'InputData': os.path.basename(csv_file_path)}) + tags_dict.update({'StepRunId': current_run.id}) + tags_dict.update({'RunId': current_run.parent.id}) + child_run.register_model(model_path=model_name, model_name=model_name, + model_framework=args.model_type, tags=tags_dict) + + child_run.complete() + # 10.0 Add data to output + end_datetime = datetime.datetime.now() + result.update(ts_id_dict) + result['model_type'] = args.model_type + result['file_name'] = file_name + result['model_name'] = model_name + result['start_date'] = str(start_datetime) + result['end_date'] = str(end_datetime) + result['duration'] = str(end_datetime-start_datetime) + result['mse'] = mse + result['rmse'] = rmse + result['mae'] = mae + result['mape'] = mape + result['index'] = idx + result['num_models'] = len(input_data) + result['status'] = child_run.get_status() + + print('ending (' + csv_file_path + ') ' + str(end_datetime)) + result_list.append(result) + except Exception: + if child_run and child_run.get_status() != 'Completed': + child_run.fail() # Data returned by this function will be available in parallel_run_step.txt return pd.DataFrame(result_list) From c48f5dd1b7751422378e8e0b3b444e79103e2bfa Mon Sep 17 00:00:00 2001 From: Deepti Mokkapati Date: Tue, 9 Feb 2021 15:53:51 -0800 Subject: [PATCH 2/4] changes to customscript --- .../02_CustomScript_Training_Pipeline.ipynb | 259 ++++++++++++++++-- Custom_Script/scripts/train.py | 25 +- 2 files changed, 252 insertions(+), 32 deletions(-) diff --git a/Custom_Script/02_CustomScript_Training_Pipeline.ipynb b/Custom_Script/02_CustomScript_Training_Pipeline.ipynb index c7a6e044..1f70b4ed 100644 --- a/Custom_Script/02_CustomScript_Training_Pipeline.ipynb +++ b/Custom_Script/02_CustomScript_Training_Pipeline.ipynb @@ -51,7 +51,25 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install azureml-pipeline-steps" + "pip show azureml-sdk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install azureml-pipeline-steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install --upgrade azureml-contrib-automl-pipeline-steps" ] }, { @@ -63,16 +81,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Workspace Name: deeptim-eastus2euap-ws\n", + "Azure Region: eastus2euap\n", + "Subscription Id: 381b38e9-9840-4719-a5a0-61d9585e1e91\n", + "Resource Group: deeptim_westus_3rg\n" + ] + } + ], "source": [ "from azureml.core import Workspace\n", "\n", "# set up workspace\n", - "ws = Workspace.from_config()\n", + "#ws = Workspace.from_config()\n", + "\n", + "#ws = Workspace.create( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", + "# resource_group= \"deeptim_westus_3rg\",\n", + "# name= \"deeptim-eastus2euap-ws\",\n", + "# create_resource_group=True,\n", + "# location='eastus2euap',\n", + " # )\n", + "\n", + "ws = Workspace( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", + " resource_group= \"deeptim_westus_3rg\",\n", + " workspace_name= \"deeptim-eastus2euap-ws\",\n", + " )\n", + "\n", + "#ws = Workspace( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", + "# resource_group= \"deeptim-eastus2euap-rg\",\n", + "# workspace_name= \"deeptim-eastus2euap-ws\",\n", + "# )\n", + "\n", + "#ws = Workspace( subscription_id= \"ba7979f7-d040-49c9-af1a-7414402bf622\",\n", + "# resource_group= \"deeptim-westcentralus-rg\",\n", + "# workspace_name= \"deeptim-westcentralus-ws\",\n", + "# )\n", "\n", "# set up datastores\n", "dstore = ws.get_default_datastore()\n", @@ -93,15 +144,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment name: oj-customscript-test\n" + ] + } + ], "source": [ "from azureml.core import Experiment\n", "\n", - "experiment = Experiment(ws, 'oj_training_pipeline')\n", + "experiment = Experiment(ws, 'oj-customscript-test')\n", "\n", "print('Experiment name: ' + experiment.name)" ] @@ -121,16 +180,86 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "dataset_name = 'oj_data_small_train'" + "dataset_name = 'oj_data'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "blob_datastore_name = \"automl_many_models\"\n", + "container_name = \"automl-sample-notebook-data\"\n", + "account_name = \"automlsamplenotebookdata\"\n", + "\n", + "from azureml.core import Datastore\n", + "\n", + "datastore = Datastore.register_azure_blob_container(\n", + " workspace=ws, \n", + " datastore_name=blob_datastore_name, \n", + " container_name=container_name,\n", + " account_name=account_name,\n", + " create_if_not_exists=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{\n", + " \"source\": [\n", + " \"('automl_many_models', 'oj_data_small/')\"\n", + " ],\n", + " \"definition\": [\n", + " \"GetDatastoreFiles\"\n", + " ],\n", + " \"registration\": {\n", + " \"id\": \"f84d587d-a7a8-4a37-a2ac-bd1cb71acb1c\",\n", + " \"name\": \"oj_data_small\",\n", + " \"version\": 1,\n", + " \"workspace\": \"Workspace.create(name='deeptim-eastus2euap-ws', subscription_id='381b38e9-9840-4719-a5a0-61d9585e1e91', resource_group='deeptim_westus_3rg')\"\n", + " }\n", + "}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_train_path = 'oj_data_small/'\n", + "ds_inference_path = 'oj_inference_small/'\n", + "\n", + "from azureml.core.dataset import Dataset\n", + "\n", + "\n", + "# Create file datasets\n", + "ds_train = Dataset.File.from_files(path=datastore.path(ds_train_path), validate=False)\n", + "ds_inference = Dataset.File.from_files(path=datastore.path(ds_inference_path), validate=False)\n", + "\n", + "# Register the file datasets\n", + "#dataset_name = 'oj_data_small' # if 0 < dataset_maxfiles < 11973 else 'oj_data'\n", + "dataset_name = 'oj_data_small'\n", + "train_dataset_name = dataset_name #+ '_train'\n", + "#inference_dataset_name = dataset_name# + '_inference'\n", + "ds_train.register(ws, train_dataset_name, create_new_version=True)\n", + "#ds_inference.register(ws, inference_dataset_name, create_new_version=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": { "scrolled": true }, @@ -138,7 +267,7 @@ "source": [ "from azureml.core.dataset import Dataset\n", "\n", - "dataset = Dataset.get_by_name(ws, name=dataset_name)\n", + "dataset = Dataset.get_by_name(ws, name=train_dataset_name)\n", "dataset_input = dataset.as_named_input(dataset_name)" ] }, @@ -155,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "scrolled": true }, @@ -185,24 +314,66 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "cpu_cluster_name = \"cpucluster\"" + "cpu_cluster_name = \"many-models\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing compute target.\n", + "Checking cluster status...\n", + "Succeeded\n", + "AmlCompute wait for completion finished\n", + "\n", + "Minimum number of nodes requested have been provisioned\n" + ] + } + ], "source": [ "from azureml.core.compute import AmlCompute\n", - "\n", - "compute = AmlCompute(ws, cpu_cluster_name)" + "from azureml.core.compute import ComputeTarget\n", + "\n", + "# Choose a name for your cluster.\n", + "# TODO\n", + "amlcompute_cluster_name = cpu_cluster_name\n", + "\n", + "found = False\n", + "# Check if this compute target already exists in the workspace.\n", + "cts = ws.compute_targets\n", + "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n", + " found = True\n", + " print('Found existing compute target.')\n", + " compute = cts[amlcompute_cluster_name]\n", + " \n", + "if not found:\n", + " print('Creating a new compute target...')\n", + " provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D64S_V3',\n", + " min_nodes=0,\n", + " max_nodes=63,\n", + " admin_username='azureuser', \n", + " admin_user_password='Pa$$word1',\n", + " )\n", + " # Create the cluster.\n", + " compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", + " \n", + "print('Checking cluster status...')\n", + "# Can poll for a minimum number of nodes and for a specific timeout.\n", + "# If no min_node_count is provided, it will use the scale settings for the cluster.\n", + "compute.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", + " \n", + "# For a more detailed view of current AmlCompute status, use get_status()." ] }, { @@ -228,14 +399,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "from azureml.pipeline.steps import ParallelRunConfig\n", "\n", "processes_per_node = 8\n", - "node_count = 1\n", + "node_count =1\n", "timeout = 180\n", "\n", "parallel_run_config = ParallelRunConfig(\n", @@ -243,7 +414,7 @@ " entry_script='train.py',\n", " mini_batch_size=\"1\",\n", " run_invocation_timeout=timeout,\n", - " error_threshold=10,\n", + " error_threshold=-1,\n", " output_action=\"append_row\",\n", " environment=train_env,\n", " process_count_per_node=processes_per_node,\n", @@ -251,6 +422,15 @@ " node_count=node_count)" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip show pytorch" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -264,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "scrolled": true }, @@ -290,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": { "scrolled": true }, @@ -323,9 +503,19 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created step many-models-training [c4ffbc96][5bf31ad1-e834-4d9e-86f6-5f4e8af18725], (This step will run and generate new outputs)\n", + "Submitted PipelineRun be77c836-82e1-4343-a540-28063e624c35\n", + "Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/oj-customscript-test/runs/be77c836-82e1-4343-a540-28063e624c35?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/deeptim_westus_3rg/workspaces/deeptim-eastus2euap-ws\n" + ] + } + ], "source": [ "from azureml.pipeline.core import Pipeline\n", "\n", @@ -337,7 +527,16 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PipelineRunId: be77c836-82e1-4343-a540-28063e624c35\n", + "Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/oj-customscript-test/runs/be77c836-82e1-4343-a540-28063e624c35?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/deeptim_westus_3rg/workspaces/deeptim-eastus2euap-ws\n" + ] + } + ], "source": [ "#Wait for the run to complete\n", "run.wait_for_completion(show_output=False, raise_on_error=True)" @@ -575,9 +774,9 @@ } ], "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:.conda-pypi] *", "language": "python", - "name": "python3" + "name": "conda-env-.conda-pypi-py" }, "language_info": { "codemirror_mode": { @@ -589,7 +788,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/Custom_Script/scripts/train.py b/Custom_Script/scripts/train.py index 652ed855..9deae49e 100644 --- a/Custom_Script/scripts/train.py +++ b/Custom_Script/scripts/train.py @@ -94,8 +94,9 @@ def run(input_data): # 7.0 Train model with full dataset forecaster.fit(data) - # import time - # time.sleep(180) + # Simulating the 3 minutes run to test concurrency + import time + time.sleep(180) # 8.0 Save the forecasting pipeline joblib.dump(forecaster, filename=os.path.join('./outputs/', model_name)) @@ -128,12 +129,32 @@ def run(input_data): result['index'] = idx result['num_models'] = len(input_data) result['status'] = child_run.get_status() + result['run_id'] = str(child_run.id) print('ending (' + csv_file_path + ') ' + str(end_datetime)) result_list.append(result) except Exception: if child_run and child_run.get_status() != 'Completed': child_run.fail() + result['model_type'] = args.model_type + end_datetime = datetime.datetime.now() + result['file_name'] = file_name + result['model_name'] = model_name + result['start_date'] = str(start_datetime) + result['end_date'] = str(end_datetime) + result['duration'] = str(end_datetime-start_datetime) + result['mse'] = str(None) + result['rmse'] = str(None) + result['mae'] = str(None) + result['mape'] = str(None) + result['index'] = idx + result['num_models'] = len(input_data) + if child_run: + result['status'] = child_run.get_status() + result['run_id'] = str(child_run.id) + else: + result['status'] = 'Failed' + result['run_id'] = str(None) # Data returned by this function will be available in parallel_run_step.txt return pd.DataFrame(result_list) From f27360a87fc6ed3c98eb6c814ad9364e6c6b5f69 Mon Sep 17 00:00:00 2001 From: Deepti Mokkapati Date: Tue, 9 Feb 2021 15:57:00 -0800 Subject: [PATCH 3/4] changes wrt scale --- .../02_CustomScript_Training_Pipeline.ipynb | 257 ++---------------- 1 file changed, 29 insertions(+), 228 deletions(-) diff --git a/Custom_Script/02_CustomScript_Training_Pipeline.ipynb b/Custom_Script/02_CustomScript_Training_Pipeline.ipynb index 1f70b4ed..d0534aed 100644 --- a/Custom_Script/02_CustomScript_Training_Pipeline.ipynb +++ b/Custom_Script/02_CustomScript_Training_Pipeline.ipynb @@ -51,25 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "pip show azureml-sdk" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install azureml-pipeline-steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install --upgrade azureml-contrib-automl-pipeline-steps" + "# !pip install azureml-pipeline-steps" ] }, { @@ -81,49 +63,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Workspace Name: deeptim-eastus2euap-ws\n", - "Azure Region: eastus2euap\n", - "Subscription Id: 381b38e9-9840-4719-a5a0-61d9585e1e91\n", - "Resource Group: deeptim_westus_3rg\n" - ] - } - ], + "outputs": [], "source": [ "from azureml.core import Workspace\n", "\n", "# set up workspace\n", - "#ws = Workspace.from_config()\n", - "\n", - "#ws = Workspace.create( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", - "# resource_group= \"deeptim_westus_3rg\",\n", - "# name= \"deeptim-eastus2euap-ws\",\n", - "# create_resource_group=True,\n", - "# location='eastus2euap',\n", - " # )\n", - "\n", - "ws = Workspace( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", - " resource_group= \"deeptim_westus_3rg\",\n", - " workspace_name= \"deeptim-eastus2euap-ws\",\n", - " )\n", - "\n", - "#ws = Workspace( subscription_id= \"381b38e9-9840-4719-a5a0-61d9585e1e91\",\n", - "# resource_group= \"deeptim-eastus2euap-rg\",\n", - "# workspace_name= \"deeptim-eastus2euap-ws\",\n", - "# )\n", - "\n", - "#ws = Workspace( subscription_id= \"ba7979f7-d040-49c9-af1a-7414402bf622\",\n", - "# resource_group= \"deeptim-westcentralus-rg\",\n", - "# workspace_name= \"deeptim-westcentralus-ws\",\n", - "# )\n", + "ws = Workspace.from_config()\n", "\n", "# set up datastores\n", "dstore = ws.get_default_datastore()\n", @@ -144,23 +93,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment name: oj-customscript-test\n" - ] - } - ], + "outputs": [], "source": [ "from azureml.core import Experiment\n", "\n", - "experiment = Experiment(ws, 'oj-customscript-test')\n", + "experiment = Experiment(ws, 'oj_training_pipeline')\n", "\n", "print('Experiment name: ' + experiment.name)" ] @@ -180,86 +121,16 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "dataset_name = 'oj_data'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "blob_datastore_name = \"automl_many_models\"\n", - "container_name = \"automl-sample-notebook-data\"\n", - "account_name = \"automlsamplenotebookdata\"\n", - "\n", - "from azureml.core import Datastore\n", - "\n", - "datastore = Datastore.register_azure_blob_container(\n", - " workspace=ws, \n", - " datastore_name=blob_datastore_name, \n", - " container_name=container_name,\n", - " account_name=account_name,\n", - " create_if_not_exists=True\n", - ")" + "dataset_name = 'oj_data_small_train'" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{\n", - " \"source\": [\n", - " \"('automl_many_models', 'oj_data_small/')\"\n", - " ],\n", - " \"definition\": [\n", - " \"GetDatastoreFiles\"\n", - " ],\n", - " \"registration\": {\n", - " \"id\": \"f84d587d-a7a8-4a37-a2ac-bd1cb71acb1c\",\n", - " \"name\": \"oj_data_small\",\n", - " \"version\": 1,\n", - " \"workspace\": \"Workspace.create(name='deeptim-eastus2euap-ws', subscription_id='381b38e9-9840-4719-a5a0-61d9585e1e91', resource_group='deeptim_westus_3rg')\"\n", - " }\n", - "}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds_train_path = 'oj_data_small/'\n", - "ds_inference_path = 'oj_inference_small/'\n", - "\n", - "from azureml.core.dataset import Dataset\n", - "\n", - "\n", - "# Create file datasets\n", - "ds_train = Dataset.File.from_files(path=datastore.path(ds_train_path), validate=False)\n", - "ds_inference = Dataset.File.from_files(path=datastore.path(ds_inference_path), validate=False)\n", - "\n", - "# Register the file datasets\n", - "#dataset_name = 'oj_data_small' # if 0 < dataset_maxfiles < 11973 else 'oj_data'\n", - "dataset_name = 'oj_data_small'\n", - "train_dataset_name = dataset_name #+ '_train'\n", - "#inference_dataset_name = dataset_name# + '_inference'\n", - "ds_train.register(ws, train_dataset_name, create_new_version=True)\n", - "#ds_inference.register(ws, inference_dataset_name, create_new_version=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "scrolled": true }, @@ -267,7 +138,7 @@ "source": [ "from azureml.core.dataset import Dataset\n", "\n", - "dataset = Dataset.get_by_name(ws, name=train_dataset_name)\n", + "dataset = Dataset.get_by_name(ws, name=dataset_name)\n", "dataset_input = dataset.as_named_input(dataset_name)" ] }, @@ -284,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "scrolled": true }, @@ -314,66 +185,24 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "cpu_cluster_name = \"many-models\"" + "cpu_cluster_name = \"cpucluster\"" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found existing compute target.\n", - "Checking cluster status...\n", - "Succeeded\n", - "AmlCompute wait for completion finished\n", - "\n", - "Minimum number of nodes requested have been provisioned\n" - ] - } - ], + "outputs": [], "source": [ "from azureml.core.compute import AmlCompute\n", - "from azureml.core.compute import ComputeTarget\n", - "\n", - "# Choose a name for your cluster.\n", - "# TODO\n", - "amlcompute_cluster_name = cpu_cluster_name\n", - "\n", - "found = False\n", - "# Check if this compute target already exists in the workspace.\n", - "cts = ws.compute_targets\n", - "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n", - " found = True\n", - " print('Found existing compute target.')\n", - " compute = cts[amlcompute_cluster_name]\n", - " \n", - "if not found:\n", - " print('Creating a new compute target...')\n", - " provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D64S_V3',\n", - " min_nodes=0,\n", - " max_nodes=63,\n", - " admin_username='azureuser', \n", - " admin_user_password='Pa$$word1',\n", - " )\n", - " # Create the cluster.\n", - " compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", - " \n", - "print('Checking cluster status...')\n", - "# Can poll for a minimum number of nodes and for a specific timeout.\n", - "# If no min_node_count is provided, it will use the scale settings for the cluster.\n", - "compute.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", - " \n", - "# For a more detailed view of current AmlCompute status, use get_status()." + "\n", + "compute = AmlCompute(ws, cpu_cluster_name)" ] }, { @@ -399,14 +228,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from azureml.pipeline.steps import ParallelRunConfig\n", "\n", "processes_per_node = 8\n", - "node_count =1\n", + "node_count = 1\n", "timeout = 180\n", "\n", "parallel_run_config = ParallelRunConfig(\n", @@ -422,15 +251,6 @@ " node_count=node_count)" ] }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "#!pip show pytorch" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -444,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "scrolled": true }, @@ -470,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "scrolled": true }, @@ -503,19 +323,9 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created step many-models-training [c4ffbc96][5bf31ad1-e834-4d9e-86f6-5f4e8af18725], (This step will run and generate new outputs)\n", - "Submitted PipelineRun be77c836-82e1-4343-a540-28063e624c35\n", - "Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/oj-customscript-test/runs/be77c836-82e1-4343-a540-28063e624c35?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/deeptim_westus_3rg/workspaces/deeptim-eastus2euap-ws\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from azureml.pipeline.core import Pipeline\n", "\n", @@ -527,16 +337,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PipelineRunId: be77c836-82e1-4343-a540-28063e624c35\n", - "Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/oj-customscript-test/runs/be77c836-82e1-4343-a540-28063e624c35?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/deeptim_westus_3rg/workspaces/deeptim-eastus2euap-ws\n" - ] - } - ], + "outputs": [], "source": [ "#Wait for the run to complete\n", "run.wait_for_completion(show_output=False, raise_on_error=True)" @@ -774,9 +575,9 @@ } ], "kernelspec": { - "display_name": "Python [conda env:.conda-pypi] *", + "display_name": "Python 3", "language": "python", - "name": "conda-env-.conda-pypi-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -788,7 +589,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.7.3" } }, "nbformat": 4, From 5dc8f676f525849aab664dfb6a13f63a69c728cc Mon Sep 17 00:00:00 2001 From: Deepti Mokkapati Date: Thu, 11 Feb 2021 12:15:07 -0800 Subject: [PATCH 4/4] child_run --- Custom_Script/scripts/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Custom_Script/scripts/train.py b/Custom_Script/scripts/train.py index 9deae49e..cbab8f14 100644 --- a/Custom_Script/scripts/train.py +++ b/Custom_Script/scripts/train.py @@ -59,6 +59,7 @@ def run(input_data): train = data[:-args.test_size] test = data[-args.test_size:] + child_run = None try: child_run = current_run.child_run(name=model_name)