Build AI solutions with Azure Machine Learning
Go to https://ml.azure.com/
Create a Compute instance (Standard D12 v2)
Notebooks -> Terminal
cd Users
git clone https://github.com/MicrosoftLearning/mslearn-dp100
Refresh the file pane to see the folder
01 Getting started
The config file is in the root dir
from azureml.core import Workspace
ws = Workspace.from_config()
See compute targets
from azureml.core import ComputeTarget

# Print the name and type of every compute target in the workspace.
# `ws` is the Workspace loaded with Workspace.from_config() above.
print("Compute Resources:")
# Read the collection once instead of once per iteration.
compute_targets = ws.compute_targets
for compute_name in compute_targets:
    compute = compute_targets[compute_name]
    print("\t", compute.name, ':', compute.type)
Write a script
# Training script: fits a logistic regression model on data.csv, logs its
# accuracy to the experiment run and saves the model under outputs/.
import os  # needed for os.makedirs below

from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Prepare the dataset (the frame must be named `data`, which is what the
# feature/label selection below reads from)
data = pd.read_csv('data.csv')
X = data[['Feature1', 'Feature2', 'Feature3']].values
y = data['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model (C is the inverse of the regularization rate)
reg = 0.1
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
# Use the builtin float: the np.float alias was removed in NumPy 1.24
run.log('Accuracy', float(acc))

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()
Run the script
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")
# Ensure the required packages are installed
packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],
pip_packages=['azureml-defaults'])
sklearn_env.python.conda_dependencies = packages
# Create a script config
script_config = ScriptRunConfig(source_directory='training_folder',
script='training.py',
environment=sklearn_env)
# Submit the experiment
experiment = Experiment(workspace=ws, name='training-experiment')
run = experiment.submit(config=script_config)
run.wait_for_completion()
For hyperparameters
# Training script with a hyperparameter: same as the basic script, but the
# regularization rate is read from the --reg-rate command-line argument.
import argparse
import os  # needed for os.makedirs below

from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter (defaults to 0.01 when not passed)
parser = argparse.ArgumentParser()
parser.add_argument('--reg-rate', type=float, dest='reg_rate', default=0.01)
args = parser.parse_args()
reg = args.reg_rate

# Prepare the dataset (the frame must be named `data`, which is what the
# feature/label selection below reads from)
data = pd.read_csv('data.csv')
X = data[['Feature1', 'Feature2', 'Feature3']].values
y = data['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model (C is the inverse of the regularization rate)
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
# Use the builtin float: the np.float alias was removed in NumPy 1.24
run.log('Accuracy', float(acc))

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()
Passing arguments into a script
# Create a script config
script_config = ScriptRunConfig(source_directory='training_folder',
script='training.py',
arguments = ['--reg-rate', 0.1],
environment=sklearn_env)
Register models
You can download the model to local
# "run" is a reference to a completed experiment run
# List the files generated by the experiment
for file_name in run.get_file_names():
    print(file_name)

# Download a named file
run.download_file(name='outputs/model.pkl', output_file_path='model.pkl')
Register
from azureml.core import Model
model = Model.register(workspace=ws,
model_name='classification_model',
model_path='model.pkl', # local path
description='A classification model',
tags={'data-format': 'CSV'},
model_framework=Model.Framework.SCIKITLEARN,
model_framework_version='0.20.3')
or
run.register_model( model_name='classification_model',
model_path='outputs/model.pkl', # run outputs path
description='A classification model',
tags={'data-format': 'CSV'},
model_framework=Model.Framework.SCIKITLEARN,
model_framework_version='0.20.3')
See registered models
from azureml.core import Model

# Print every model registered in the workspace `ws`
for model in Model.list(ws):
    # Get model name and auto-generated version
    print(model.name, 'version:', model.version)
Go through https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/05%20-%20Train%20Models.ipynb
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
import os, shutil
# Create a folder for the experiment files
training_folder = 'diabetes-training'
os.makedirs(training_folder, exist_ok=True)
# Copy the data file into the experiment folder
shutil.copy('data/diabetes.csv', os.path.join(training_folder, "diabetes.csv"))
Write training script
%%writefile $training_folder/diabetes_training.py
# Training script: fits a logistic regression model on diabetes.csv, logs the
# regularization rate, accuracy and AUC to the run, and saves the model.
# Import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get the experiment run context
run = Run.get_context()

# Load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('diabetes.csv')

# Separate features and labels
feature_columns = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Set regularization hyperparameter
reg = 0.01

# Train a logistic regression model (C is the inverse of the regularization rate)
print('Training a logistic regression model with regularization rate of', reg)
# Builtin float is used for logging: the np.float alias was removed in NumPy 1.24
run.log('Regularization Rate', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
Run the script
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails
# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")
# Ensure the required packages are installed (we need scikit-learn and Azure ML defaults)
packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults'])
sklearn_env.python.conda_dependencies = packages
# Create a script config
script_config = ScriptRunConfig(source_directory=training_folder,
script='diabetes_training.py',
environment=sklearn_env)
# submit the experiment run
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
# Show the running experiment run in the notebook widget
RunDetails(run).show()
# Block until the experiment run has completed
run.wait_for_completion()
Retrieve metrics and output
# Get logged metrics and files
metrics = run.get_metrics()
for key in metrics.keys():
    print(key, metrics.get(key))
print('\n')
for file in run.get_file_names():
    print(file)
Register the model
from azureml.core import Model

# Register the model, recording the run's AUC and accuracy as properties.
# The metrics are fetched once rather than once per property.
run_metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Script'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
Create folder for param script and training data
import os, shutil
# Create a folder for the experiment files
training_folder = 'diabetes-training-params'
os.makedirs(training_folder, exist_ok=True)
# Copy the data file into the experiment folder
shutil.copy('data/diabetes.csv', os.path.join(training_folder, "diabetes.csv"))
Write file
%%writefile $training_folder/diabetes_training.py
# Parameterized training script: like the basic diabetes script, but the
# regularization rate comes from the --reg_rate command-line argument.
# Import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter (defaults to 0.01 when not passed)
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# Load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('diabetes.csv')

# Separate features and labels
feature_columns = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model (C is the inverse of the regularization rate)
print('Training a logistic regression model with regularization rate of', reg)
# Builtin float is used for logging: the np.float alias was removed in NumPy 1.24
run.log('Regularization Rate', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
Run the script with arguments
# Create a script config
script_config = ScriptRunConfig(source_directory=training_folder,
script='diabetes_training.py',
arguments = ['--reg_rate', 0.1],
environment=sklearn_env)
# submit the experiment
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()
Get output
# Get logged metrics
metrics = run.get_metrics()
for key in metrics.keys():
    print(key, metrics.get(key))
print('\n')
# List the files produced by the run
for file in run.get_file_names():
    print(file)
Register new version
from azureml.core import Model

# Register the model under the same name, which creates a new version.
# The metrics are fetched once rather than once per property.
run_metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Parameterized script'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
You can see the Models in the Models section under Assets
Work with Data in Azure Machine Learning
Mount a blob
from azureml.core import Workspace, Datastore
ws = Workspace.from_config()
# Register a new datastore
blob_ds = Datastore.register_azure_blob_container(workspace=ws,
datastore_name='blob_data',
container_name='data_container',
account_name='az_store_acct',
account_key='123456abcde789…')
View data stores
# Print the name of every datastore registered in the workspace
for ds_name in ws.datastores:
    print(ds_name)
Get a reference using get
blob_store = Datastore.get(ws, datastore_name='blob_data')
The workspace has a default datastore (initially the built-in workspaceblobstore datastore)
default_store = ws.get_default_datastore()
Change which datastore is the default
ws.set_default_datastore('blob_data')
Create and register tabular datasets
from azureml.core import Dataset
blob_ds = ws.get_default_datastore()
csv_paths = [(blob_ds, 'data/files/current_data.csv'),
(blob_ds, 'data/files/archive/*.csv')]
tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
tab_ds = tab_ds.register(workspace=ws, name='csv_table')
file datasets
from azureml.core import Dataset
blob_ds = ws.get_default_datastore()
file_ds = Dataset.File.from_files(path=(blob_ds, 'data/files/images/*.jpg'))
file_ds = file_ds.register(workspace=ws, name='img_files')
Get dataset
import azureml.core
from azureml.core import Workspace, Dataset
# Load the workspace from the saved config file
ws = Workspace.from_config()
# Get a dataset from the workspace datasets collection
ds1 = ws.datasets['csv_table']
# Get a dataset by name from the datasets class
ds2 = Dataset.get_by_name(ws, 'img_files')
Version dataset by adding create_new_version
img_paths = [(blob_ds, 'data/files/images/*.jpg'),
(blob_ds, 'data/files/images/*.png')]
file_ds = Dataset.File.from_files(path=img_paths)
file_ds = file_ds.register(workspace=ws, name='img_files', create_new_version=True)
Retrieve a specific version
img_ds = Dataset.get_by_name(workspace=ws, name='img_files', version=2)
Convert to dataframe
df = tab_ds.to_pandas_dataframe()
# code to work with dataframe goes here, for example:
print(df.head())
Pass dataset to script use --ds
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
pip_packages=['azureml-defaults',
'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
script='script.py',
arguments=['--ds', tab_ds],
environment=env)
# Script side (script.py): receive the tabular dataset ID via --ds and load it.
import argparse

from azureml.core import Run, Dataset

# The parser has to be created before arguments can be added to it
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='dataset_id')
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
data = dataset.to_pandas_dataframe()
use a named input
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
pip_packages=['azureml-defaults',
'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
script='script.py',
arguments=['--ds', tab_ds.as_named_input('my_dataset')],
environment=env)
# Script side (script.py): read the tabular dataset through its named input.
import argparse

from azureml.core import Run

# The parser has to be created before arguments can be added to it
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='ds_id')
args = parser.parse_args()

run = Run.get_context()
# Look the dataset up by the name given to as_named_input() in the run config
dataset = run.input_datasets['my_dataset']
data = dataset.to_pandas_dataframe()
Pass a file dataset. Use as_download, or as_mount for really large datasets
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
pip_packages=['azureml-defaults',
'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
script='script.py',
arguments=['--ds', file_ds.as_download()],
environment=env)
# Script side (script.py): --ds carries the local path of the file dataset.
import argparse
import glob

from azureml.core import Run

# The parser has to be created before arguments can be added to it
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='ds_ref')
args = parser.parse_args()

run = Run.get_context()
# The parsed value lives on `args`; a bare `ds_ref` name does not exist
imgs = glob.glob(args.ds_ref + "/*.jpg")
Use named input
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
pip_packages=['azureml-defaults',
'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
script='script.py',
arguments=['--ds', file_ds.as_named_input('my_ds').as_download()],
environment=env)
# Script side (script.py): read the file dataset location via its named input.
import argparse
import glob

from azureml.core import Run

# The parser has to be created before arguments can be added to it
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='ds_ref')
args = parser.parse_args()

run = Run.get_context()
# Look the input up by the name given to as_named_input() in the run config;
# it is concatenated below as a folder path
dataset = run.input_datasets['my_ds']
imgs = glob.glob(dataset + "/*.jpg")
Go through https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/06%20-%20Work%20with%20Data.ipynb
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
View datastores
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)
Upload data to datastore
default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
target_path='diabetes-data/', # Put it in a folder path in the datastore
overwrite=True, # Replace existing files of the same name
show_progress=True)
Create a tabular dataset
from azureml.core import Dataset
# Get the default datastore
default_ds = ws.get_default_datastore()
#Create a tabular dataset from the path on the datastore (this may take a short while)
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))
# Display the first 20 rows as a Pandas dataframe
tab_data_set.take(20).to_pandas_dataframe()
File dataset
# Create a file dataset from the path on the datastore (this may take a short while)
file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes-data/*.csv'))

# Get the files in the dataset
for file_path in file_data_set.to_path():
    print(file_path)
Register the tabular dataset and the file dataset
# Register both datasets. Each registration is best-effort: a failure is
# printed and the cell carries on, so the final message does not by itself
# prove that both registrations succeeded.

# Register the tabular dataset
try:
    tab_data_set = tab_data_set.register(workspace=ws,
                                         name='diabetes dataset',
                                         description='diabetes data',
                                         tags = {'format':'CSV'},
                                         create_new_version=True)
except Exception as ex:
    print(ex)

# Register the file dataset
try:
    file_data_set = file_data_set.register(workspace=ws,
                                           name='diabetes file dataset',
                                           description='diabetes files',
                                           tags = {'format':'CSV'},
                                           create_new_version=True)
except Exception as ex:
    print(ex)

print('Datasets registered')
View the registered datasets
# Print the name and latest version of every dataset registered in the workspace
print("Datasets:")
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)
Grab a specific version
dataset_v1 = Dataset.get_by_name(ws, 'diabetes dataset', version = 1)
Train a model from a tabular dataset
import os
# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_tab_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')
%%writefile $experiment_folder/diabetes_training.py
# Training script that loads its data from the tabular dataset passed to the
# run as the named input 'training_data'.
# Import libraries
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get the script arguments (regularization rate and training dataset ID)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# Get the training dataset via its named input (args.training_dataset_id is
# parsed but not needed on this path)
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels
feature_columns = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model (C is the inverse of the regularization rate)
print('Training a logistic regression model with regularization rate of', reg)
# Builtin float is used for logging: the np.float alias was removed in NumPy 1.24
run.log('Regularization Rate', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails
# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")
# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)
packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],
pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])
sklearn_env.python.conda_dependencies = packages
# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")
# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
script='diabetes_training.py',
arguments = ['--regularization', 0.1, # Regularizaton rate parameter
'--input-data', diabetes_ds.as_named_input('training_data')], # Reference to dataset
environment=sklearn_env)
# submit the experiment
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()
Register the trained model
from azureml.core import Model

# Register the model trained from the tabular dataset.
# The metrics are fetched once rather than once per property.
run_metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Tabular dataset'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
Train from a file dataset
import os
# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')
%%writefile $experiment_folder/diabetes_training.py
# Training script that reads every CSV file from the file dataset passed to
# the run as the named input 'training_files'.
# Import libraries
import os
import argparse
from azureml.core import Dataset, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Get script arguments (regularization rate and file dataset mount point)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--input-data', type=str, dest='dataset_folder', help='data mount point')
args = parser.parse_args()

# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# Load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['training_files'] # Get the training data path from the input
# (You could also just use args.dataset_folder if you don't want to rely on a hard-coded friendly name)

# Read the files and combine them into a single dataframe
all_files = glob.glob(data_path + "/*.csv")
diabetes = pd.concat((pd.read_csv(f) for f in all_files), sort=False)

# Separate features and labels
feature_columns = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model (C is the inverse of the regularization rate)
print('Training a logistic regression model with regularization rate of', reg)
# Builtin float is used for logging: the np.float alias was removed in NumPy 1.24
run.log('Regularization Rate', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
from azureml.core import Experiment
from azureml.widgets import RunDetails
# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes file dataset")
# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
script='diabetes_training.py',
arguments = ['--regularization', 0.1, # Regularizaton rate parameter
'--input-data', diabetes_ds.as_named_input('training_files').as_download()], # Reference to dataset location
environment=sklearn_env) # Use the environment created previously
# submit the experiment
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()
Register the model
from azureml.core import Model

# Register the model trained from the file dataset.
# The metrics are fetched once rather than once per property.
run_metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'File dataset'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
https://microsoftlearning.github.io/mslearn-dp100/instructions/01-create-a-workspace.html
Work with Compute in Azure Machine Learning
Creating environment files
Create an environment from a specification file
e.g. a conda specification file (conda.yml):
name: py_env
dependencies:
- numpy
- pandas
- scikit-learn
- pip:
- azureml-defaults
then do
from azureml.core import Environment
env = Environment.from_conda_specification(name='training_environment',
file_path='./conda.yml')
can create from existing conda
from azureml.core import Environment
env = Environment.from_existing_conda_environment(name='training_environment',
conda_environment_name='py_env')
create from specifying packages
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
env = Environment('training_environment')
deps = CondaDependencies.create(conda_packages=['scikit-learn','pandas','numpy'],
pip_packages=['azureml-defaults'])
env.python.conda_dependencies = deps
Create in containers
# Run the environment inside a Docker container
env.docker.enabled = True
deps = CondaDependencies.create(conda_packages=['scikit-learn','pandas','pip'],
                                pip_packages=['azureml-defaults'])  # closing parenthesis was missing
env.python.conda_dependencies = deps
You can add your own base image
env.docker.base_image='my-base-image'
env.docker.base_image_registry='myregistry.azurecr.io/myimage'
or
env.docker.base_image = None
env.docker.base_dockerfile = './Dockerfile'
you can override the version of package or env
env.python.user_managed_dependencies=True
env.python.interpreter_path = '/opt/miniconda/bin/python'
Register your env
env.register(workspace=ws)
See registered envs
from azureml.core import Environment

# Print the name of every environment registered in the workspace
env_names = Environment.list(workspace=ws)
for env_name in env_names:
    print('Name:',env_name)
Get env and set training script
from azureml.core import Environment
from azureml.train.estimator import Estimator

# Retrieve the registered environment and use it for a training estimator
training_env = Environment.get(workspace=ws, name='training_environment')
estimator = Estimator(source_directory='experiment_folder',  # comma was missing here
                      entry_script='training_script.py',
                      compute_target='local',
                      environment_definition=training_env)
Create compute targets
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute

# Load the workspace from the saved config file
ws = Workspace.from_config()

# Specify a name for the compute (unique within the workspace)
compute_name = 'aml-cluster'

# Define compute configuration (the closing parenthesis was missing)
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2',
                                                       min_nodes=0, max_nodes=4,
                                                       vm_priority='dedicated')  # or 'lowpriority'

# Create the compute and wait until provisioning has finished
aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)
aml_cluster.wait_for_completion(show_output=True)
An unmanaged compute target is one that is defined and managed outside of the Azure Machine Learning workspace; for example, an Azure virtual machine or an Azure Databricks cluster.
Use the ComputeTarget.attach() method to attach the existing compute based on its target-specific configuration settings.
e.g. connect to a Databricks cluster
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, DatabricksCompute
# Load the workspace from the saved config file
ws = Workspace.from_config()
# Specify a name for the compute (unique within the workspace)
compute_name = 'db_cluster'
# Define configuration for existing Azure Databricks cluster
db_workspace_name = 'db_workspace'
db_resource_group = 'db_resource_group'
db_access_token = '1234-abc-5678-defg-90...'
db_config = DatabricksCompute.attach_configuration(resource_group=db_resource_group,
workspace_name=db_workspace_name,
access_token=db_access_token)
# Create the compute
databricks_compute = ComputeTarget.attach(ws, compute_name, db_config)
databricks_compute.wait_for_completion(True)
If the compute target doesn't exist, create it.
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

compute_name = "aml-cluster"

# Check if the compute target exists
try:
    aml_cluster = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If not, create it
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2',
                                                           max_nodes=4)
    aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)

# Runs for both the existing and the newly created cluster
aml_cluster.wait_for_completion(show_output=True)
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-set-up-training-targets
Use compute targets
To use a particular compute target, you can specify it in the appropriate parameter for an experiment run configuration or estimator
from azureml.core import Environment, ScriptRunConfig

# Run the script on a compute target identified by its name
compute_name = 'aml-cluster'

training_env = Environment.get(workspace=ws, name='training_environment')

script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                environment=training_env,  # use the environment retrieved above
                                compute_target=compute_name)
Instead of specifying the name of the compute target, you can specify a ComputeTarget object, like this:
from azureml.core import Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget

# Run the script on a compute target passed as a ComputeTarget object
compute_name = "aml-cluster"

training_cluster = ComputeTarget(workspace=ws, name=compute_name)

training_env = Environment.get(workspace=ws, name='training_environment')

script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                environment=training_env,  # use the environment retrieved above
                                compute_target=training_cluster)
Work with Compute Contexts
https://microsoftlearning.github.io/mslearn-dp100/instructions/01-create-a-workspace.html
Orchestrate Machine Learning Pipelines
Two scripts
from azureml.pipeline.steps import PythonScriptStep
# Step to run a Python script
step1 = PythonScriptStep(name = 'prepare data',
source_directory = 'scripts',
script_name = 'data_prep.py',
compute_target = 'aml-cluster')
# Step to train a model
step2 = PythonScriptStep(name = 'train model',
source_directory = 'scripts',
script_name = 'train_model.py',
compute_target = 'aml-cluster')
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment
# Construct the pipeline
train_pipeline = Pipeline(workspace = ws, steps = [step1,step2])
# Create an experiment and run the pipeline
experiment = Experiment(workspace = ws, name = 'training-pipeline')
pipeline_run = experiment.submit(train_pipeline)
PipelineData passes data between steps
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
# Get a dataset for the initial data
raw_ds = Dataset.get_by_name(ws, 'raw_dataset')
# Define a PipelineData object to pass data between steps
data_store = ws.get_default_datastore()
prepped_data = PipelineData('prepped', datastore=data_store)
# Step to run a Python script
step1 = PythonScriptStep(name = 'prepare data',
source_directory = 'scripts',
script_name = 'data_prep.py',
compute_target = 'aml-cluster',
# Script arguments include PipelineData
arguments = ['--raw-ds', raw_ds.as_named_input('raw_data'),
'--out_folder', prepped_data],
# Specify PipelineData as output
outputs=[prepped_data])
# Step to train a model (consumes the PipelineData produced by step1)
step2 = PythonScriptStep(name='train model',
                         source_directory='scripts',
                         # The training step runs the training script, not data_prep.py
                         script_name='train_model.py',
                         compute_target='aml-cluster',
                         # Pass as script argument
                         arguments=['--in_folder', prepped_data],
                         # Specify PipelineData as input
                         inputs=[prepped_data])
# code in data_prep.py
from azureml.core import Run
import argparse
import os
# Get the experiment run context
run = Run.get_context()
# Get arguments
parser = argparse.ArgumentParser()
parser.add_argument('--raw-ds', type=str, dest='raw_dataset_id')
parser.add_argument('--out_folder', type=str, dest='folder')
args = parser.parse_args()
output_folder = args.folder
# Get input dataset as dataframe
raw_df = run.input_datasets['raw_data'].to_pandas_dataframe()
# code to prep data (in this case, just select specific columns)
prepped_df = raw_df[['col1', 'col2', 'col3']]
# Save prepped data to the PipelineData location
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, 'prepped_data.csv')
prepped_df.to_csv(output_path)
Control caching (reuse of a step's previous output) with the allow_reuse parameter
step1 = PythonScriptStep(name='prepare data',
                         source_directory='scripts',
                         script_name='data_prep.py',
                         compute_target='aml-cluster',
                         runconfig=run_config,
                         inputs=[raw_ds.as_named_input('raw_data')],
                         outputs=[prepped_data],
                         # No closing parenthesis here: the call continues with allow_reuse
                         arguments=['--folder', prepped_data],
                         # Disable step reuse
                         allow_reuse=False)
Force all to run
pipeline_run = experiment.submit(train_pipeline, regenerate_outputs=True)
Publish pipeline
published_pipeline = pipeline.publish(name='training_pipeline',
description='Model training pipeline',
version='1.0')
or call publish method on a successful run
# Get the most recent run of the pipeline
pipeline_experiment = ws.experiments.get('training-pipeline')
run = list(pipeline_experiment.get_runs())[0]
# Publish the pipeline from the run
published_pipeline = run.publish_pipeline(name='training_pipeline',
description='Model training pipeline',
version='1.0')
See URI
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)
Start the published pipeline through its REST endpoint and display the run ID
import requests
response = requests.post(rest_endpoint,
headers=auth_header,
json={"ExperimentName": "run_training_pipeline"})
run_id = response.json()["Id"]
print(run_id)
Define parameters in a pipeline
from azureml.pipeline.core.graph import PipelineParameter
reg_param = PipelineParameter(name='reg_rate', default_value=0.01)
...
step2 = PythonScriptStep(name='train model',
                         source_directory='scripts',
                         # The training step runs the training script, not data_prep.py
                         script_name='train_model.py',
                         compute_target='aml-cluster',
                         # Pass parameter as script argument
                         arguments=['--in_folder', prepped_data,
                                    '--reg', reg_param],
                         inputs=[prepped_data])
After publishing you can run a pipeline with a parameter
response = requests.post(rest_endpoint,
headers=auth_header,
json={"ExperimentName": "run_training_pipeline",
"ParameterAssignments": {"reg_rate": 0.1}})
Run the pipeline daily
from azureml.pipeline.core import ScheduleRecurrence, Schedule
daily = ScheduleRecurrence(frequency='Day', interval=1)
pipeline_schedule = Schedule.create(ws, name='Daily Training',
description='trains model every day',
pipeline_id=published_pipeline.id,
experiment_name='Training_Pipeline',
recurrence=daily)
Trigger on data changes
from azureml.core import Datastore
from azureml.pipeline.core import Schedule
# Datastore to watch for changes
training_datastore = Datastore(workspace=ws, name='blob_data')

# Passing datastore/path_on_datastore instead of a recurrence makes the schedule
# react to data changes under that path.
pipeline_schedule = Schedule.create(ws, name='Reactive Training',
                                    description='trains model on data change',
                                    # The id is an attribute of the published pipeline object
                                    pipeline_id=published_pipeline.id,
                                    experiment_name='Training_Pipeline',
                                    datastore=training_datastore,
                                    path_on_datastore='data/training')
Exercise: Create a pipeline
https://microsoftlearning.github.io/mslearn-dp100/instructions/08-create-a-pipeline.html
https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/08%20-%20Create%20a%20Pipeline.ipynb
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
Prepare data
from azureml.core import Dataset
default_ds = ws.get_default_datastore()
# Upload and register the data only when the dataset is not already in the workspace.
if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'],  # Upload the diabetes csv files in /data
                            target_path='diabetes-data/',  # Put it in a folder path in the datastore
                            overwrite=True,  # Replace existing files of the same name
                            show_progress=True)

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='diabetes dataset',
                                             description='diabetes data',
                                             tags={'format': 'CSV'},
                                             create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        # Best effort: report the registration problem and carry on.
        print(ex)
else:
    print('Dataset already registered.')
Create folder
import os
# Create a folder for the pipeline step files
experiment_folder = 'diabetes_pipeline'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder)
Create the first script, which will read data from the diabetes dataset and apply some simple pre-processing to remove any rows with missing data and normalize the numeric features so they're on a similar scale.
The script includes an argument named --prepped-data, which references the folder where the resulting data should be saved.
%%writefile $experiment_folder/prep_diabetes.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler
# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data
# Get the experiment run context
run = Run.get_context()
# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()
# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)
# remove nulls
diabetes = diabetes.dropna()
# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])
# Log processed rows
row_count = (len(diabetes))
run.log('processed_rows', row_count)
# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)
# End the run
run.complete()
Create the script for the second step, which will train a model. The script includes an argument named --training-folder, which references the folder where the prepared data was saved by the previous step.
%%writefile $experiment_folder/train_diabetes.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder
# Get the experiment run context
run = Run.get_context()
# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
diabetes = pd.read_csv(file_path)
# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values
# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
# Train a decision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(X_train, y_train)
# calculate accuracy (fraction of test predictions that match the labels)
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# np.float was removed from NumPy; the builtin float is the replacement
run.log('Accuracy', float(acc))

# calculate AUC from the scores in column 1 of predict_proba
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))
# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()
# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file)
# Register the model, recording its metrics as properties
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path=model_file,
               model_name='diabetes_model',
               tags={'Training context': 'Pipeline'},
               # np.float was removed from NumPy; the builtin float is the replacement
               properties={'AUC': float(auc), 'Accuracy': float(acc)})

run.complete()
Use the same compute for both steps
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "compute-ray"
try:
# Check for existing compute target
pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
print('Found existing cluster, use it.')
except ComputeTargetException:
# If it doesn't already exist, create it
try:
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
pipeline_cluster.wait_for_completion(show_output=True)
except Exception as ex:
print(ex)
Create env
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
# Create a Python environment for the experiment
diabetes_env = Environment("diabetes-pipeline-env")
diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
diabetes_env.docker.enabled = True # Use a docker container
# Create a set of package dependencies
diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])
# Add the dependencies to the environment
diabetes_env.python.conda_dependencies = diabetes_packages
# Register the environment
diabetes_env.register(workspace=ws)
registered_env = Environment.get(ws, 'diabetes-pipeline-env')
# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()
# Use the compute you created above.
pipeline_run_config.target = pipeline_cluster
# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env
print ("Run configuration created.")
Create and run pipeline
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep
# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")
# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())
# Step 1, Run the data prep script
train_step = PythonScriptStep(name = "Prepare Data",
source_directory = experiment_folder,
script_name = "prep_diabetes.py",
arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
'--prepped-data', prepped_data_folder],
outputs=[prepped_data_folder],
compute_target = pipeline_cluster,
runconfig = pipeline_run_config,
allow_reuse = True)
# Step 2, run the training script
register_step = PythonScriptStep(name = "Train and Register Model",
source_directory = experiment_folder,
script_name = "train_diabetes.py",
arguments = ['--training-folder', prepped_data_folder],
inputs=[prepped_data_folder],
compute_target = pipeline_cluster,
runconfig = pipeline_run_config,
allow_reuse = True)
print("Pipeline steps defined")
Build the pipeline and run as an experiment
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
# Construct the pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")
# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)
Monitor the run in the Experiments tab
When the pipeline has finished, you can examine the metrics recorded by its child runs.
for run in pipeline_run.get_children():
print(run.name, ':')
metrics = run.get_metrics()
for metric_name in metrics:
print('\t',metric_name, ":", metrics[metric_name])
A new model should be registered with a Training context tag indicating it was trained in a pipeline. You can verify this as follows:
from azureml.core import Model
for model in Model.list(ws):
print(model.name, 'version:', model.version)
for tag_name in model.tags:
tag = model.tags[tag_name]
print ('\t',tag_name, ':', tag)
for prop_name in model.properties:
prop = model.properties[prop_name]
print ('\t',prop_name, ':', prop)
print('\n')
Publish the pipeline
# Publish the pipeline from the run
published_pipeline = pipeline_run.publish_pipeline(
name="diabetes-training-pipeline", description="Trains diabetes model", version="1.0")
published_pipeline
See the endpoint
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)
Call the pipeline endpoint
use the authorization header from your current connection to your Azure workspace
from azureml.core.authentication import InteractiveLoginAuthentication
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready.")
Call the rest interface
import requests
experiment_name = 'mslearn-diabetes-pipeline'
rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint,
headers=auth_header,
json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
run_id
Wait for the run to complete
from azureml.pipeline.core.run import PipelineRun
# Wrap the run that was started through the REST endpoint, using its run ID
published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
# Wait on the REST-triggered run (the original waited on the earlier pipeline_run instead)
published_pipeline_run.wait_for_completion(show_output=True)
Schedule the pipeline to run weekly and retrain with new data
from azureml.pipeline.core import ScheduleRecurrence, Schedule
# Submit the Pipeline every Monday at 00:00 UTC
recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="00:00")
weekly_schedule = Schedule.create(ws, name="weekly-diabetes-training",
description="Based on time",
pipeline_id=published_pipeline.id,
experiment_name='mslearn-diabetes-pipeline',
recurrence=recurrence)
print('Pipeline scheduled.')
Retrieve schedules as
schedules = Schedule.list(ws)
schedules
Check the latest run
pipeline_experiment = ws.experiments.get('mslearn-diabetes-pipeline')
latest_run = list(pipeline_experiment.get_runs())[0]
latest_run.get_details()
Deploy real-time machine learning services with Azure Machine Learning
Register a trained model
from azureml.core import Model
classification_model = Model.register(workspace=ws,
model_name='classification_model',
model_path='model.pkl', # local path
description='A classification model')
You can also register from a run
run.register_model( model_name='classification_model',
model_path='outputs/model.pkl', # run outputs path
description='A classification model')
Create an entry script (or scoring script)
import json
import joblib
import numpy as np
from azureml.core.model import Model
# Called when the service is loaded
def init():
    """Load the registered 'classification_model' into the module-level `model`."""
    global model
    # Get the path to the registered model file and load it
    model_path = Model.get_model_path('classification_model')
    model = joblib.load(model_path)
# Called when a request is received
def run(raw_data):
    """Score one request.

    raw_data is a JSON document with a 'data' entry holding the input cases.
    Returns the output of the global model's predict(), converted to a plain list.
    """
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    predictions = model.predict(data)
    # Return the predictions as any JSON serializable format
    return predictions.tolist()
Create an environment
from azureml.core.conda_dependencies import CondaDependencies
# Add the dependencies for your model
myenv = CondaDependencies()
myenv.add_conda_package("scikit-learn")
# Save the environment config as a .yml file
env_file = 'service_files/env.yml'
with open(env_file,"w") as f:
f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)
Create env and entry script
from azureml.core.model import InferenceConfig
classifier_inference_config = InferenceConfig(runtime= "python",
source_directory = 'service_files',
entry_script="score.py",
conda_file="env.yml")
Create AKS cluster for deployment
from azureml.core.compute import ComputeTarget, AksCompute
cluster_name = 'aks-cluster'
compute_config = AksCompute.provisioning_configuration(location='eastus')
production_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
production_cluster.wait_for_completion(show_output=True)
Set compute
from azureml.core.webservice import AksWebservice
classifier_deploy_config = AksWebservice.deploy_configuration(cpu_cores = 1,
memory_gb = 1)
Deploy the model
from azureml.core.model import Model
model = ws.models['classification_model']
service = Model.deploy(workspace=ws,
name = 'classifier-service',
models = [model],
inference_config = classifier_inference_config,
deployment_config = classifier_deploy_config,
deployment_target = production_cluster)
service.wait_for_deployment(show_output = True)
Get predictions (response)
import json
# An array of new data cases
x_new = [[0.1,2.3,4.1,2.0],
[0.2,1.8,3.9,2.1]]
# Convert the array to a serializable list in a JSON document
json_data = json.dumps({"data": x_new})
# Call the web service, passing the input data
response = service.run(input_data = json_data)
# Get the predictions
predictions = json.loads(response)
# Print each input case next to its predicted class.
for i, case in enumerate(x_new):
    print(case, predictions[i])
If using a REST api (not SDK)
endpoint = service.scoring_uri
print(endpoint)
import requests
import json
# An array of new data cases
x_new = [[0.1,2.3,4.1,2.0],
[0.2,1.8,3.9,2.1]]
# Convert the array to a serializable list in a JSON document
json_data = json.dumps({"data": x_new})
# Set the content type in the request headers
request_headers = { 'Content-Type':'application/json' }
# Call the service
response = requests.post(url = endpoint,
data = json_data,
headers = request_headers)
# Get the predictions from the JSON response
predictions = json.loads(response.json())
# Print each input case next to its predicted class.
for i, case in enumerate(x_new):
    print(case, predictions[i])
Can authenticate with keys
primary_key, secondary_key = service.get_keys()
You can pass your key or token in the Authorization header as follows
import requests
import json
# An array of new data cases
x_new = [[0.1,2.3,4.1,2.0],
[0.2,1.8,3.9,2.1]]
# Convert the array to a serializable list in a JSON document
json_data = json.dumps({"data": x_new})
# Set the content type in the request headers
request_headers = { "Content-Type":"application/json",
"Authorization":"Bearer " + key_or_token }
# Call the service
response = requests.post(url = endpoint,
data = json_data,
headers = request_headers)
# Get the predictions from the JSON response
predictions = json.loads(response.json())
# Print each input case next to its predicted class.
for i, case in enumerate(x_new):
    print(case, predictions[i])
Check the server state
from azureml.core.webservice import AksWebservice
# Get the deployed service
service = AksWebservice(name='classifier-service', workspace=ws)
# Check its state
print(service.state)
See service logs
print(service.get_logs())
Deploy to local container
from azureml.core.webservice import LocalWebservice
deployment_config = LocalWebservice.deploy_configuration(port=8890)
service = Model.deploy(ws, 'test-svc', [model], inference_config, deployment_config)
Test the deployed service using the SDK
print(service.run(input_data = json_data))
Troubleshoot runtime issues by making changes to the scoring file that is referenced in the inference configuration, and reloading the service without redeploying it (something you can only do with a local service)
service.reload()
print(service.run(input_data = json_data))
Exercise
!pip install --upgrade azureml-sdk azureml-widgets
Connect to workspace
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
Train and register a model
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name="mslearn-train-diabetes")
run = experiment.start_logging()
print("Starting experiment:", experiment.name)
# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('data/diabetes.csv')
# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values
# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)
# calculate accuracy (fraction of test predictions that match the labels)
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# np.float was removed from NumPy; the builtin float is the replacement
run.log('Accuracy', float(acc))

# calculate AUC from the scores in column 1 of predict_proba
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))
# Save the trained model
model_file = 'diabetes_model.pkl'
joblib.dump(value=model, filename=model_file)
run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)
# Complete the run
run.complete()
# Register the model
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
tags={'Training context':'Inline Training'},
properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})
print('Model trained and registered.')
Deploy the model as a web service
from azureml.core import Model
for model in Model.list(ws):
print(model.name, 'version:', model.version)
for tag_name in model.tags:
tag = model.tags[tag_name]
print ('\t',tag_name, ':', tag)
for prop_name in model.properties:
prop = model.properties[prop_name]
print ('\t',prop_name, ':', prop)
print('\n')
Get the model to deploy
model = ws.models['diabetes_model']
print(model.name, 'version', model.version)
Set folder to host web service
import os
folder_name = 'diabetes_service'
# Create a folder for the web service files
experiment_folder = './' + folder_name
os.makedirs(experiment_folder, exist_ok=True)
print(folder_name, 'folder created.')
# Set path for scoring script
script_file = os.path.join(experiment_folder,"score_diabetes.py")
Create an entry script
import json
import joblib
import numpy as np
from azureml.core.model import Model
# Called when the service is loaded
def init():
    """Load the registered 'diabetes_model' into the module-level `model`."""
    global model
    # Get the path to the deployed model file and load it
    model_path = Model.get_model_path('diabetes_model')
    model = joblib.load(model_path)
# Called when a request is received
def run(raw_data):
    """Score one request and return the class names as a JSON string.

    raw_data is a JSON document with a 'data' entry holding the input cases.
    Each prediction (0 or 1) from the global model is mapped to a class name.
    """
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    predictions = model.predict(data)
    # Get the corresponding classname for each prediction (0 or 1)
    classnames = ['not-diabetic', 'diabetic']
    predicted_classes = [classnames[prediction] for prediction in predictions]
    # Return the predictions as JSON
    return json.dumps(predicted_classes)
Create env
from azureml.core.conda_dependencies import CondaDependencies
# Add the dependencies for our model (AzureML defaults is already included)
myenv = CondaDependencies()
myenv.add_conda_package('scikit-learn')
# Save the environment config as a .yml file
env_file = os.path.join(experiment_folder,"diabetes_env.yml")
with open(env_file,"w") as f:
f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)
# Print the .yml file
with open(env_file,"r") as f:
print(f.read())
Deploy the container as a service named diabetes-service. The steps are:
Define an inference configuration, which includes the scoring and environment files required to load and use the model.
Define a deployment configuration that defines the execution environment in which the service will be hosted. In this case, an Azure Container Instance.
Deploy the model as a web service.
Verify the status of the deployed service.
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
# Configure the scoring environment
inference_config = InferenceConfig(runtime= "python",
entry_script=script_file,
conda_file=env_file)
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)
service_name = "diabetes-service"
service = Model.deploy(ws, service_name, [model], inference_config, deployment_config)
service.wait_for_deployment(True)
print(service.state)
See the logs
print(service.get_logs())
# If you need to make a change and redeploy, you may need to delete unhealthy service using the following code:
#service.delete()
See Endpoints
for webservice_name in ws.webservices:
print(webservice_name)
Consume the service
import json
x_new = [[2,180,74,24,21,23.9091702,1.488172308,22]]
print ('Patient: {}'.format(x_new[0]))
# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})
# Call the web service, passing the input data (the web service will also accept the data in binary format)
predictions = service.run(input_data = input_json)
# Get the predicted class - it'll be the first (and only) one.
predicted_classes = json.loads(predictions)
print(predicted_classes[0])
Send multiple entries
import json
# This time our input is an array of two feature arrays
x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],
[0,148,58,11,179,39.19207553,0.160829008,45]]
# Convert the array or arrays to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})
# Call the web service, passing the input data
predictions = service.run(input_data = input_json)
# Get the predicted classes.
predicted_classes = json.loads(predictions)
for i in range(len(x_new)):
print ("Patient {}".format(x_new[i]), predicted_classes[i] )
See endpoint
endpoint = service.scoring_uri
print(endpoint)
Make HTTP request
import requests
import json
x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],
[0,148,58,11,179,39.19207553,0.160829008,45]]
# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})
# Set the content type
headers = { 'Content-Type':'application/json' }
predictions = requests.post(endpoint, input_json, headers = headers)
predicted_classes = json.loads(predictions.json())
for i in range(len(x_new)):
print ("Patient {}".format(x_new[i]), predicted_classes[i] )
Delete the service
service.delete()
print ('Service deleted.')
More info in docs https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where?tabs=azcli
Deploy batch inference pipelines with Azure Machine Learning
Register a model
from azureml.core import Model
classification_model = Model.register(workspace=your_workspace,
model_name='classification_model',
model_path='model.pkl', # local path
description='A classification model')
or
run.register_model( model_name='classification_model',
model_path='outputs/model.pkl', # run outputs path
description='A classification model')
Create a scoring script. init called when the pipeline is initiated. run(mini-batch) for each of batch of data to be processed
import os
import numpy as np
from azureml.core import Model
import joblib
def init():
    """Load the registered 'classification_model' into the module-level `model`."""
    # Runs when the pipeline step is initialized
    global model
    # load the model
    model_path = Model.get_model_path('classification_model')
    model = joblib.load(model_path)
def run(mini_batch):
    """Score every file in a mini-batch.

    mini_batch is an iterable of file paths, each holding comma-delimited values.
    Returns one "<file name>: <prediction>" string per file.
    """
    # This runs for each batch
    resultList = []
    # process each file in the batch
    for f in mini_batch:
        # Read comma-delimited data into an array
        data = np.genfromtxt(f, delimiter=',')
        # Reshape into a 2-dimensional array for model input
        prediction = model.predict(data.reshape(1, -1))
        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))
    return resultList
Create a pipeline with a parallel run-step
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import Pipeline
# Get the batch dataset for input
batch_data_set = ws.datasets['batch-data']
# Set the output location
default_ds = ws.get_default_datastore()
output_dir = PipelineData(name='inferences',
datastore=default_ds,
output_path_on_compute='results')
# Define the parallel run step step configuration
parallel_run_config = ParallelRunConfig(
source_directory='batch_scripts',
entry_script="batch_scoring_script.py",
mini_batch_size="5",
error_threshold=10,
output_action="append_row",
environment=batch_env,
compute_target=aml_cluster,
node_count=4)
# Create the parallel run step
parallelrun_step = ParallelRunStep(
name='batch-score',
parallel_run_config=parallel_run_config,
inputs=[batch_data_set.as_named_input('batch_data')],
output=output_dir,
arguments=[],
allow_reuse=True
)
# Create the pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
Run the pipeline and retrieve the output
import os

import pandas as pd
from azureml.core import Experiment

# Run the pipeline as an experiment
pipeline_run = Experiment(ws, 'batch_prediction_pipeline').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

# Get the outputs from the first (and only) step
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='results')

# Find the parallel_run_step.txt file (the last match wins, as before)
result_file = None
for root, dirs, files in os.walk('results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)
if result_file is None:
    # Fail with a clear message instead of a NameError at read_csv below
    raise FileNotFoundError("parallel_run_step.txt not found under 'results'")

# Load and display the results
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]
print(df)
Publishing
as a REST service
published_pipeline = pipeline_run.publish_pipeline(name='Batch_Prediction_Pipeline',
description='Batch pipeline',
version='1.0')
rest_endpoint = published_pipeline.endpoint
Use the service endpoint
import requests
response = requests.post(rest_endpoint,
headers=auth_header,
json={"ExperimentName": "Batch_Prediction"})
run_id = response.json()["Id"]
Schedule the pipeline
from azureml.pipeline.core import ScheduleRecurrence, Schedule
weekly = ScheduleRecurrence(frequency='Week', interval=1)
pipeline_schedule = Schedule.create(ws, name='Weekly Predictions',
description='batch inferencing',
pipeline_id=published_pipeline.id,
experiment_name='Batch_Prediction',
recurrence=weekly)
Exercise
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
Train and register a model
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name='mslearn-train-diabetes')
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('data/diabetes.csv')

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# np.float was removed from NumPy (it was only an alias of the builtin float)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model locally, then upload it into the run's outputs
model_file = 'diabetes_model.pkl'
joblib.dump(value=model, filename=model_file)
run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)

# Complete the run
run.complete()

# Register the model, recording the logged metrics as properties
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Inline Training'},
                   properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})
print('Model trained and registered.')
Generate and upload batch data
from azureml.core import Datastore, Dataset
import pandas as pd
import os
# Set default data store
ws.set_default_datastore('workspaceblobstore')
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

# Load the diabetes data
diabetes = pd.read_csv('data/diabetes2.csv')

# Get a 100-item sample of the feature columns (not the diabetic label)
sample = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].sample(n=100).values

# Create a folder
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

# Save each sample as a separate file (1.csv ... 100.csv)
print("Saving files...")
for i in range(100):
    fname = str(i+1) + '.csv'
    sample[i].tofile(os.path.join(batch_folder, fname), sep=",")
print("files saved!")

# Upload the files to the default datastore
print("Uploading files to datastore...")
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir="batch-data", target_path="batch-data", overwrite=True, show_progress=True)

# Register a dataset for the input data
batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws,
                                             name='batch-data',
                                             description='batch data',
                                             create_new_version=True)
except Exception as ex:
    # Best effort: report the failure and keep the unregistered dataset
    print(ex)

print("Done!")
Create compute
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "your-compute-cluster"

try:
    # Check for existing compute target
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Creation failures are only reported; inference_cluster may be unset
        print(ex)
Create a pipeline for batch inferencing
import os
# Create a folder for the experiment files
experiment_folder = 'batch_pipeline'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder)
write script
%%writefile $experiment_folder/batch_diabetes.py
import os
import numpy as np
from azureml.core import Model
import joblib
def init():
    """Load the registered diabetes model once, when the pipeline step is initialized."""
    # `model` is module-global so that run() can reuse it for every mini-batch
    global model
    # Resolve the path of the registered 'diabetes_model' and load it
    model_path = Model.get_model_path('diabetes_model')
    model = joblib.load(model_path)
def run(mini_batch):
    """Score every file in a mini-batch.

    Args:
        mini_batch: iterable of file paths to score. Each file is read as
            comma-delimited numbers and treated as a single row of features.

    Returns:
        A list with one "<file name>: <prediction>" string per input file.
    """
    resultList = []
    # process each file in the batch
    for f in mini_batch:
        # Read the comma-delimited data into an array
        data = np.genfromtxt(f, delimiter=',')
        # Reshape into a 2-dimensional array for prediction (model expects multiple items)
        prediction = model.predict(data.reshape(1, -1))
        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))
    return resultList
Define run context with requirements
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.runconfig import CondaDependencies
# Add dependencies required by the model
# For scikit-learn models, you need scikit-learn
# For parallel pipeline steps, you need azureml-core and azureml-dataprep[fuse]
cd = CondaDependencies.create(conda_packages=['scikit-learn','pip'],
pip_packages=['azureml-defaults','azureml-core','azureml-dataprep[fuse]'])
batch_env = Environment(name='batch_environment')
batch_env.python.conda_dependencies = cd
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')
use a pipeline to run the batch prediction script, generate predictions from the input data, and save the results as a text file in the output folder
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.pipeline.core import PipelineData
# Output: an 'inferences' location on the workspace's default datastore
default_ds = ws.get_default_datastore()
output_dir = PipelineData(
    name='inferences',
    datastore=default_ds,
    output_path_on_compute='diabetes/results',
)

# Configuration for the parallel run: which script to run, and on what compute
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="batch_diabetes.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2,
)

# The step that feeds the batch dataset to the script and collects its output
parallelrun_step = ParallelRunStep(
    name='batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('diabetes_batch')],
    output=output_dir,
    arguments=[],
    allow_reuse=True,
)

print('Steps defined')
Put the step into a pipeline and run it
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(ws, 'mslearn-diabetes-batch').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)
Retrieve predictions as a pandas DataFrame
import pandas as pd
import shutil
# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file (last match wins)
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)
if result_file is None:
    # Fail with a clear message instead of a NameError (or a stale path) below
    raise FileNotFoundError("parallel_run_step.txt not found under 'diabetes-results'")

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)
Publish the pipeline and its REST interface
published_pipeline = pipeline_run.publish_pipeline(
name='diabetes-batch-pipeline', description='Batch scoring of diabetes data', version='1.0')
published_pipeline
You can see the endpoint in the Azure portal, or retrieve it in code:
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)
make a REST call over HTTP
from azureml.core.authentication import InteractiveLoginAuthentication
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header ready.')
import requests
rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint,
headers=auth_header,
json={"ExperimentName": "mslearn-diabetes-batch"})
run_id = response.json()["Id"]
run_id
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails
published_pipeline_run = PipelineRun(ws.experiments['mslearn-diabetes-batch'], run_id)
# Block until the run completes
published_pipeline_run.wait_for_completion(show_output=True)
See the results
import pandas as pd
import shutil
# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the first step of the run started through the REST endpoint.
# Use published_pipeline_run here: pipeline_run is the earlier, directly
# submitted run and would just show its old results again.
prediction_run = next(published_pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file (last match wins)
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)
if result_file is None:
    # Fail with a clear message instead of a NameError (or a stale path) below
    raise FileNotFoundError("parallel_run_step.txt not found under 'diabetes-results'")

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)
Tune hyperparameters with Azure Machine Learning
hyperdrive run
Discrete hyperparameters: you can use a Python list (choice([10,20,30])), a range (choice(range(1,10))), or an arbitrary set of comma-separated values (choice(30,50,100)). You can also select discrete values from one of the discrete distributions: qnormal, quniform, qlognormal, qloguniform.
Continuous hyperparameters can be drawn from the continuous distributions: normal, uniform, lognormal, loguniform.
define a search space
from azureml.train.hyperdrive import choice, normal
param_space = {
'--batch_size': choice(16, 32, 64),
'--learning_rate': normal(10, 3)
}
Grid sampling if all discrete
from azureml.train.hyperdrive import GridParameterSampling, choice
param_space = {
'--batch_size': choice(16, 32, 64),
'--learning_rate': choice(0.01, 0.1, 1.0)
}
param_sampling = GridParameterSampling(param_space)
Random sample can be discrete or continuous
from azureml.train.hyperdrive import RandomParameterSampling, choice, normal
param_space = {
'--batch_size': choice(16, 32, 64),
'--learning_rate': normal(10, 3)
}
param_sampling = RandomParameterSampling(param_space)
Bayesian sampling
from azureml.train.hyperdrive import BayesianParameterSampling, choice, uniform
# Search space for Bayesian sampling.
param_space = {
    '--batch_size': choice(16, 32, 64),
    # uniform(min_value, max_value): the lower bound comes first
    '--learning_rate': uniform(0.05, 0.1),
}
param_sampling = BayesianParameterSampling(param_space)
You can only use Bayesian sampling with choice, uniform, and quniform parameter expressions, and you can't combine it with an early-termination policy.
early termination policy that abandons runs that are unlikely to produce a better result than previously completed runs. The policy is evaluated at an evaluation_interval you specify, based on each time the target performance metric is logged. You can also set a delay_evaluation parameter to avoid evaluating the policy until a minimum number of iterations have been completed.
Bandit policy
from azureml.train.hyperdrive import BanditPolicy
early_termination_policy = BanditPolicy(slack_amount = 0.2,
evaluation_interval=1,
delay_evaluation=5)
Applies the policy for every iteration after the first five, and abandons runs where the reported target metric is 0.2 or more worse than the best performing run after the same number of intervals.
A median stopping policy abandons runs where the target performance metric is worse than the median of the running averages for all runs.
from azureml.train.hyperdrive import MedianStoppingPolicy
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1,
delay_evaluation=5)
A truncation selection policy cancels the lowest performing X% of runs at each evaluation interval based on the truncation_percentage value you specify for X.
from azureml.train.hyperdrive import TruncationSelectionPolicy
early_termination_policy = TruncationSelectionPolicy(truncation_percentage=10,
evaluation_interval=1,
delay_evaluation=5)
Create a training script for hyperparameter tuning
Include an argument for each hyperparameter you want to vary.
Log the target performance metric. This enables the hyperdrive run to evaluate the performance of the child runs it initiates, and identify the one that produces the best performing model.
the following example script trains a logistic regression model using a --regularization argument to set the regularization rate hyperparameter, and logs the accuracy metric with the name Accuracy:
import argparse
import os  # needed for os.makedirs below

import joblib
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get regularization hyperparameter (hyperdrive passes it as --regularization)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01)
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the training dataset
data = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels, and split for training/validation
X = data[['feature1','feature2','feature3','feature4']].values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model with the reg hyperparameter
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate and log accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
# np.float was removed from NumPy (it was only an alias of the builtin float)
run.log('Accuracy', float(acc))

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()
Configuring and running
from azureml.core import Experiment
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal
# Assumes ws, script_config and param_sampling are already defined
hyperdrive = HyperDriveConfig(run_config=script_config,
hyperparameter_sampling=param_sampling,
policy=None,
primary_metric_name='Accuracy',
primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
max_total_runs=6,
max_concurrent_runs=4)
experiment = Experiment(workspace = ws, name = 'hyperdrive_training')
hyperdrive_run = experiment.submit(config=hyperdrive)
Monitoring and reviewing hyperdrive runs
The experiment will initiate a child run for each hyperparameter combination to be tried, and you can retrieve the logged metrics for these runs using the following code:
# hyperdrive_run is the run returned by experiment.submit(config=hyperdrive)
for child_run in hyperdrive_run.get_children():
    print(child_run.id, child_run.get_metrics())
List all runs in descending order of performance like this
# hyperdrive_run is the run returned by experiment.submit(config=hyperdrive)
for child_run in hyperdrive_run.get_children_sorted_by_primary_metric():
    print(child_run)
retrieve the best performing run, you can use the following code:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
Exercise
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
Prep data
from azureml.core import Dataset
default_ds = ws.get_default_datastore()

# Only upload and register when the dataset is not already in the workspace
if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
                            target_path='diabetes-data/', # Put it in a folder path in the datastore
                            overwrite=True, # Replace existing files of the same name
                            show_progress=True)

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='diabetes dataset',
                                             description='diabetes data',
                                             tags = {'format':'CSV'},
                                             create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        # Best effort: report the failure and carry on
        print(ex)
else:
    print('Dataset already registered.')
Prepare a training script
import os
experiment_folder = 'diabetes_training-hyperdrive'
os.makedirs(experiment_folder, exist_ok=True)
print('Folder ready.')
create the Python script to train the model. In this example, you'll use a Gradient Boosting algorithm to train a classification model. The script must include:
An argument for each hyperparameter you want to optimize (in this case, the learning rate and number of estimators for the Gradient Boosting algorithm)
Code to log the performance metric you want to optimize for (in this case, you'll log both AUC and accuracy, so you can choose to optimize the model for either of these)
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')

# Add arguments to args collection
args = parser.parse_args()

# Log Hyperparameter values
# (np.float / np.int were removed from NumPy; they were aliases of the builtins)
run.log('learning_rate', float(args.learning_rate))
run.log('n_estimators', int(args.n_estimators))

# load the diabetes dataset
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a Gradient Boosting classification model with the specified hyperparameters
print('Training a classification model')
model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                   n_estimators=args.n_estimators).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()
Create compute
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "your-compute-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Creation failures are only reported; training_cluster may be unset
        print(ex)
Run a hyperparameter tuning experiment
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails
# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)
packages = CondaDependencies.create(
    conda_packages=['scikit-learn','pip'],
    pip_packages=['azureml-defaults','azureml-dataprep[pandas]'],
)
sklearn_env.python.conda_dependencies = packages

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create a script config
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    # Add non-hyperparameter arguments - in this case, the training dataset
    arguments=['--input-data', diabetes_ds.as_named_input('training_data')],
    environment=sklearn_env,
    compute_target=training_cluster,
)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # Hyperdrive will try 6 combinations, adding these as script arguments
        '--learning_rate': choice(0.01, 0.1, 1.0),
        '--n_estimators': choice(10, 100),
    }
)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,  # No early stopping policy
    primary_metric_name='AUC',  # Find the highest AUC metric
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=6,  # Restrict the experiment to 6 iterations
    max_concurrent_runs=2,  # Run up to 2 iterations in parallel
)

# Run the experiment
experiment = Experiment(workspace=ws, name='mslearn-diabetes-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()
Determine the best performing run
# Print all child runs, sorted by the primary metric
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

# Get the best run, and its metrics and arguments
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
script_arguments = best_run.get_details()['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)
register the model
from azureml.core import Model
# Register the model saved by the best run, recording its metrics as properties
best_run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                        tags={'Training context':'Hyperdrive'},
                        properties={'AUC': best_run_metrics['AUC'], 'Accuracy': best_run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters
Automate machine learning model selection with Azure Machine learning
https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml
Automated ML can apply scaling and normalization to numeric data automatically. It can also apply:
Missing value imputation to eliminate nulls in the training dataset.
Categorical encoding to convert categorical features to numeric indicators.
Dropping high-cardinality features, such as record IDs.
Feature engineering (for example, deriving individual date parts from DateTime features)
Others...
Configure
# RunConfiguration is used below, so import it here
from azureml.core.runconfig import RunConfiguration
from azureml.train.automl import AutoMLConfig

automl_run_config = RunConfiguration(framework='python')
automl_config = AutoMLConfig(name='Automated ML Experiment',
                             task='classification',
                             primary_metric = 'AUC_weighted',
                             compute_target=aml_compute,
                             training_data = train_dataset,
                             validation_data = test_dataset,
                             label_column_name='Label',
                             featurization='auto',
                             iterations=12,
                             max_concurrent_iterations=4)
Specify the primary metric
from azureml.train.automl.utilities import get_primary_metrics
get_primary_metrics('classification')
See metrics here https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml
submitting
from azureml.core.experiment import Experiment
automl_experiment = Experiment(ws, 'automl_experiment')
automl_run = automl_experiment.submit(automl_config)
Retrieving best run and its model
# get_output() returns the best child run together with its fitted model
best_run, fitted_model = automl_run.get_output()
best_run_metrics = best_run.get_metrics()
# Print every metric logged by the best run
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
Explore preprocessing steps
# Print each step of the fitted model
for step in fitted_model.named_steps:
    print(step)
Exercise
!pip install --upgrade azureml-sdk azureml-widgets
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
Prepare data
from azureml.core import Dataset
default_ds = ws.get_default_datastore()

# Only upload and register when the dataset is not already in the workspace
if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
                            target_path='diabetes-data/', # Put it in a folder path in the datastore
                            overwrite=True, # Replace existing files of the same name
                            show_progress=True)

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='diabetes dataset',
                                             description='diabetes data',
                                             tags = {'format':'CSV'},
                                             create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        # Best effort: report the failure and carry on
        print(ex)
else:
    print('Dataset already registered.')

# Split the dataset into training and validation subsets
diabetes_ds = ws.datasets.get("diabetes dataset")
train_ds, test_ds = diabetes_ds.random_split(percentage=0.7, seed=123)
print("Data ready!")
Prepare compute
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cluster_name = "your-compute-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Creation failures are only reported; training_cluster may be unset
        print(ex)
Configure AutoML
import azureml.train.automl.utilities as automl_utils

# Print each primary metric returned for classification tasks
for metric in automl_utils.get_primary_metrics('classification'):
    print(metric)
Configure AutoML run
from azureml.train.automl import AutoMLConfig
automl_config = AutoMLConfig(name='Automated ML Experiment',
task='classification',
compute_target=training_cluster,
training_data = train_ds,
validation_data = test_ds,
label_column_name='Diabetic',
iterations=4,
primary_metric = 'AUC_weighted',
max_concurrent_iterations=2,
featurization='auto'
)
print("Ready for Auto ML run.")
Run AutoML experiment
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails
print('Submitting Auto ML experiment...')
automl_experiment = Experiment(ws, 'mslearn-diabetes-automl-sdk')
automl_run = automl_experiment.submit(automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)
Determine the best model
# get_output() returns the best child run together with its fitted model
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
# Print every metric logged by the best run
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
See steps
# Print each step of the fitted model
for step in fitted_model.named_steps:
    print(step)
Register best model
from azureml.core import Model
# Register the model saved by the best run, recording its metrics as properties
best_run.register_model(model_path='outputs/model.pkl', model_name='diabetes_model_automl',
                        tags={'Training context':'Auto ML'},
                        properties={'AUC': best_run_metrics['AUC_weighted'], 'Accuracy': best_run_metrics['accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train
Explore differential privacy
!pip install opendp-smartnoise
Load data
import pandas as pd
data_path = 'data/diabetes.csv'
diabetes = pd.read_csv(data_path)
diabetes.describe()
Upper and lower bounds: Clamping is used to set upper and lower bounds on values for a variable. This is required to ensure that the noise generated by SmartNoise is consistent with the expected distribution of the original data.
Sample size: To generate consistent differentially private data for some aggregations, SmartNoise needs to know the size of the data sample to be generated.
Epsilon: Put simplistically, epsilon is a non-negative value that provides an inverse measure of the amount of noise added to the data. A low epsilon results in a dataset with a greater level of privacy, while a high epsilon results in a dataset that is closer to the original data. Generally, you should use epsilon values between 0 and 1. Epsilon is correlated with another value named delta, that indicates the probability that a report generated by an analysis is not fully private.
import opendp.smartnoise.core as sn

cols = list(diabetes.columns)
age_range = [0.0, 120.0]  # lower/upper clamping bounds for Age
samples = len(diabetes)

with sn.Analysis() as analysis:
    # load data
    data = sn.Dataset(path=data_path, column_names=cols)

    # Convert Age to float
    age_dt = sn.to_float(data['Age'])

    # get mean of age
    age_mean = sn.dp_mean(data = age_dt,
                          privacy_usage = {'epsilon': .50},
                          data_lower = age_range[0],
                          data_upper = age_range[1],
                          data_rows = samples
                         )

analysis.release()

# print differentially private estimate of mean age
print("Private mean age:",age_mean.value)

# print actual mean age
print("Actual mean age:",diabetes.Age.mean())
Explore data distributions with histograms
# Plot the true (non-private) distribution of Age and keep the per-bin
# counts in n_age.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Bin edges every 10 years: 0, 10, ..., 120
ages = list(range(0, 130, 10))
age = diabetes.Age
# Plot a histogram with 10-year bins
n_age, bins, patches = plt.hist(age, bins=ages, color='blue', alpha=0.7, rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('True Age Distribution')
plt.show()
# Print the per-bin counts as integers
print(n_age.astype(int))
import matplotlib.pyplot as plt

# Indentation restored: the Dataset and dp_histogram calls form the body of
# the `with` block; release() is called once the block has been left.
with sn.Analysis() as analysis:
    data = sn.Dataset(path=data_path, column_names=cols)
    age_histogram = sn.dp_histogram(
        sn.to_int(data['Age'], lower=0, upper=120),
        edges=ages,
        upper=10000,
        null_value=-1,
        privacy_usage={'epsilon': 0.5}
    )

analysis.release()

# Plot the true counts (n_age, from the earlier histogram) and the private
# counts side by side; the private bars are shifted right by one bar width.
plt.ylim([0, 7000])
width = 4
agecat_left = [x + width for x in ages]
agecat_right = [x + 2*width for x in ages]
plt.bar(list(range(0, 120, 10)), n_age, width=width, color='blue', alpha=0.7, label='True')
plt.bar(agecat_left, age_histogram.value, width=width, color='orange', alpha=0.7, label='Private')
plt.legend()
plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
print(age_histogram.value)
Covariance
# Differentially private covariance between Age and DiastolicBloodPressure.
# Indentation restored: the Dataset and dp_covariance calls form the body of
# the `with` block; release() is called once the block has been left.
with sn.Analysis() as analysis:
    sn_data = sn.Dataset(path=data_path, column_names=cols)
    age_bp_cov_scalar = sn.dp_covariance(
        left=sn.to_float(sn_data['Age']),
        right=sn.to_float(sn_data['DiastolicBloodPressure']),
        privacy_usage={'epsilon': 1.0},
        left_lower=0.,
        left_upper=120.,
        left_rows=10000,
        right_lower=0.,
        right_upper=150.,
        right_rows=10000)

analysis.release()

print('Differentially private covariance: {0}'.format(age_bp_cov_scalar.value[0][0]))
# Non-private covariance computed by pandas, for comparison
print('Actual covariance', diabetes.Age.cov(diabetes.DiastolicBloodPressure))
Use SQL queries
from opendp.smartnoise.metadata import CollectionMetadata
meta = CollectionMetadata.from_file('metadata/diabetes.yml')
print (meta)
# Run a SQL aggregate query over the diabetes DataFrame through a
# PrivateReader and print the result.
from opendp.smartnoise.sql import PandasReader, PrivateReader
# PandasReader wraps the DataFrame; PrivateReader wraps that reader.
# No epsilon is passed here — presumably the library default is used (confirm).
reader = PandasReader(meta, diabetes)
private_reader = PrivateReader(meta, reader)
print('Readers ready.')
# Average age per Diabetic group
query = 'SELECT Diabetic, AVG(Age) AS AvgAge FROM diabetes.diabetes GROUP BY Diabetic'
result_dp = private_reader.execute_typed(query)
print(result_dp)
Try a reader with a high epsilon (low privacy) value, and another with a low epsilon (high privacy) value.
# Run the same query through two PrivateReaders that differ only in the
# epsilon passed as the third argument, and print both results.
low_privacy_reader = PrivateReader(meta, reader, 5.0) # large epsilon, less privacy
result = low_privacy_reader.execute_typed(query)
print(result)
# Blank line between the two results
print()
high_privacy_reader = PrivateReader(meta, reader, 0.1) # smaller epsilon, more privacy
result = high_privacy_reader.execute_typed(query)
print(result)
https://docs.microsoft.com/en-us/azure/machine-learning/concept-differential-privacy
Explain machine learning models with Azure Machine Learning
Install azureml-interpret package.
explainer types
# Three ways to construct an explainer for the same model (loan_model).
# loan_model and X_test are assumed to be defined by earlier code.

# MimicExplainer
# Takes an explainable surrogate model class (here a decision tree) in
# addition to the model and initialization examples.
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.glassbox import DecisionTreeExplainableModel
mim_explainer = MimicExplainer(model=loan_model,
initialization_examples=X_test,
explainable_model = DecisionTreeExplainableModel,
features=['loan_amount','income','age','marital_status'],
classes=['reject', 'approve'])
# TabularExplainer
from interpret.ext.blackbox import TabularExplainer
tab_explainer = TabularExplainer(model=loan_model,
initialization_examples=X_test,
features=['loan_amount','income','age','marital_status'],
classes=['reject', 'approve'])
# PFIExplainer
# Constructed without initialization examples; the data is supplied later
# when explain_global() is called.
from interpret.ext.blackbox import PFIExplainer
pfi_explainer = PFIExplainer(model = loan_model,
features=['loan_amount','income','age','marital_status'],
classes=['reject', 'approve'])
Get global importance. You can call explain_global() then get_feature_importance_dict().
# Global feature importance: explain_global() returns an explanation object,
# and get_feature_importance_dict() returns its feature -> importance mapping.

# MimicExplainer
global_mim_explanation = mim_explainer.explain_global(X_train)
global_mim_feature_importance = global_mim_explanation.get_feature_importance_dict()
# TabularExplainer
global_tab_explanation = tab_explainer.explain_global(X_train)
global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()
# PFIExplainer
# Unlike the other two, PFIExplainer is also given the labels (y_train).
global_pfi_explanation = pfi_explainer.explain_global(X_train, y_train)
global_pfi_feature_importance = global_pfi_explanation.get_feature_importance_dict()
To get local feature importance, call explain_local(), then use the get_ranked_local_names() and get_ranked_local_values() methods.
# Local feature importance for the first five rows of X_test: ranked feature
# names and their matching ranked importance values.

# MimicExplainer
local_mim_explanation = mim_explainer.explain_local(X_test[0:5])
local_mim_features = local_mim_explanation.get_ranked_local_names()
local_mim_importance = local_mim_explanation.get_ranked_local_values()
# TabularExplainer
local_tab_explanation = tab_explainer.explain_local(X_test[0:5])
local_tab_features = local_tab_explanation.get_ranked_local_names()
local_tab_importance = local_tab_explanation.get_ranked_local_values()
Creating an explanation in the experiment script
ensure that the azureml-interpret and azureml-contrib-interpret packages are installed in the run environment
# Skeleton of a training script that uploads a global explanation to its run.
# model, X_train, X_test, features and labels come from the training code
# that is elided below.

# Import Azure ML run library
from azureml.core.run import Run
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from interpret.ext.blackbox import TabularExplainer
# other imports as required
# Get the experiment run context
run = Run.get_context()
# code to train model goes here
# Get explanation
explainer = TabularExplainer(model, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)
# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')
# Complete the run
run.complete()
View the explanation in the run's Explanations tab, or download it with an ExplanationClient object
# Download a previously uploaded explanation, looked up by workspace,
# experiment name and run id, and read its feature importances.
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
client = ExplanationClient.from_run_id(workspace=ws,
experiment_name=experiment.experiment_name,
run_id=run.id)
explanation = client.download_model_explanation()
feature_importances = explanation.get_feature_importance_dict()
Exercise
https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/14%20-%20Interpret%20Models.ipynb
!pip install --upgrade azureml-sdk azureml-widgets azureml-explain-model
!pip install --upgrade azureml-interpret
Create a model
# Train a decision tree on the diabetes data and print its accuracy and AUC.
# features, labels, model and the train/test splits are reused by later cells.
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# load the diabetes dataset
print("Loading Data...")
data = pd.read_csv('data/diabetes.csv')
# Separate features and labels
features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
# Display names for the classes, indexed by predicted label
labels = ['not-diabetic', 'diabetic']
X, y = data[features].values, data['Diabetic'].values
# Split data into training set and test set
# (70/30 split; random_state fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)
# calculate accuracy
# (fraction of test predictions that equal the true label)
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# calculate AUC
# (uses column 1 of the predicted probabilities as the score)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Model trained.')
Get an explainer
# Build a TabularExplainer for the trained model, initialized with the
# training features.
from interpret.ext.blackbox import TabularExplainer
# "features" and "classes" fields are optional
tab_explainer = TabularExplainer(model,
X_train,
features=features,
classes=labels)
print(tab_explainer, "ready!")
Get global feature explanation
# Compute the global explanation and print each feature with its importance.
# you can use the training data or the test data here
global_tab_explanation = tab_explainer.explain_global(X_train)
# Get the top features by importance
global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()
# Indentation restored: the print belongs to the loop body.
for feature, importance in global_tab_feature_importance.items():
    print(feature, ":", importance)
Get local feature importances
# Explain the first two test observations and print, for every class and
# observation, each ranked feature with its importance plus their total.
X_explain = X_test[0:2]
# Get predictions
predictions = model.predict(X_explain)
# Get local explanations
local_tab_explanation = tab_explainer.explain_local(X_explain)
# Get feature names and importance for each possible label
local_tab_features = local_tab_explanation.get_ranked_local_names()
local_tab_importance = local_tab_explanation.get_ranked_local_values()
# Both results are indexed [class][observation][feature rank], as the
# indexing below shows. Loop nesting restored from the flattened original.
for class_idx, class_features in enumerate(local_tab_features):
    print('Support for', labels[class_idx])
    for obs_idx, feature_list in enumerate(class_features):
        print("\tObservation", obs_idx + 1)
        total_support = 0
        for feat_idx, feature_name in enumerate(feature_list):
            importance = local_tab_importance[class_idx][obs_idx][feat_idx]
            print("\t\t", feature_name, ':', importance)
            total_support += importance
        # One summary line per observation, after all of its features
        print("\t\t ----------\n\t\t Total:", total_support, "Prediction:", labels[predictions[obs_idx]])
Add explainability into model run
# Connect to the workspace, then create the experiment folder and copy the
# training data into it.
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
import os, shutil
from azureml.core import Experiment
# Create a folder for the experiment files
experiment_folder = 'diabetes_train_and_explain'
# exist_ok=True makes this safe to re-run when the folder already exists
os.makedirs(experiment_folder, exist_ok=True)
# Copy the data file into the experiment folder
shutil.copy('data/diabetes.csv', os.path.join(experiment_folder, "diabetes.csv"))
Write file and add explanation in it
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Import Azure ML run library
from azureml.core.run import Run
# Import libraries for model explanation
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer
# Get the experiment run context
run = Run.get_context()
# load the diabetes dataset
print("Loading Data...")
data = pd.read_csv('diabetes.csv')
features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
labels = ['not-diabetic', 'diabetic']
# Separate features and labels
X, y = data[features].values, data['Diabetic'].values
# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)
# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', np.float(acc))
# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
run.log('AUC', np.float(auc))
os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes.pkl')
# Get explanation
explainer = TabularExplainer(model, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)
# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')
# Complete the run
run.complete()
Run the experiment
# Configure an environment and script run for diabetes_training.py, submit it
# as an experiment, and block until it finishes.
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails
# Create a Python environment for the experiment
explain_env = Environment("explain-env")
# Create a set of package dependencies (including the azureml-interpret package)
packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas','pip'],
pip_packages=['azureml-defaults','azureml-interpret'])
explain_env.python.conda_dependencies = packages
# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
script='diabetes_training.py',
environment=explain_env)
# submit the experiment
experiment_name = 'mslearn-diabetes-explain'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
# Show the run widget, then wait for the run to complete
RunDetails(run).show()
run.wait_for_completion()
Retrieve feature importance
# Download the explanation uploaded by the run and print a two-column
# feature / importance listing.
from azureml.interpret import ExplanationClient

# Get the feature explanations
client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation()
feature_importances = engineered_explanations.get_feature_importance_dict()

# Overall feature importance
print('Feature\tImportance')
# Indentation restored: the print belongs to the loop body.
for key, value in feature_importances.items():
    print(key, '\t', value)
Go to the Experiments, Explanations tab
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability
Detect and mitigate unfairness in models with Azure Machine Learning
One way to evaluate the fairness of a model is to compare its predictions for each group within a sensitive feature.
For example, group the data by the sensitive feature (Age) and measure a predictive performance metric (recall) for each group; then compare the metric scores to determine the disparity between them.
Potential causes of disparity:
Data imbalance
Indirect correlation.
Societal biases.
Mitigating bias
Balance training and validation data. You can apply over-sampling or under-sampling techniques to balance data and use stratified splitting algorithms to maintain representative proportions for training and validation.
Perform extensive feature selection and engineering analysis. Make sure you fully explore the interconnected correlations in your data to try to differentiate features that are directly predictive from features that encapsulate more complex, nuanced relationships. You can use the model interpretability support in Azure Machine Learning to understand how individual features influence predictions.
Evaluate models for disparity based on significant features. You can't easily address the bias in a model if you can't quantify it.
Trade-off overall predictive performance for the lower disparity in predictive performance between sensitive feature groups. A model that is 99.5% accurate with comparable performance across all groups is often more desirable than a model that is 99.9% accurate but discriminates against a particular subset of cases.
Fairlearn
Fairlearn is a Python package that you can use to analyze models and evaluate disparity between predictions and prediction performance for one or more sensitive features.
Fairlearn integrates with Azure Machine Learning by enabling you to run an experiment in which the dashboard metrics are uploaded to your Azure Machine Learning workspace
The mitigation support in Fairlearn is based on the use of algorithms to create alternative models that apply parity constraints to produce comparable metrics across sensitive feature groups
The choice of parity constraint depends on the technique being used and the specific fairness criteria you want to apply. Constraints in Fairlearn include:
Demographic parity: Use this constraint with any of the mitigation algorithms to minimize disparity in the selection rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that an equal number of positive predictions are made in each group.
True positive rate parity: Use this constraint with any of the mitigation algorithms to minimize disparity in true positive rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of true positive predictions.
False-positive rate parity: Use this constraint with any of the mitigation algorithms to minimize disparity in false_positive_rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of false-positive predictions.
Equalized odds: Use this constraint with any of the mitigation algorithms to minimize disparity in combined true positive rate and false_positive_rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of true positive and false-positive predictions.
Error rate parity: Use this constraint with any of the reduction-based mitigation algorithms (Exponentiated Gradient and Grid Search) to ensure that the error for each sensitive feature group does not deviate from the overall error rate by more than a specified amount.
Bounded group loss: Use this constraint with any of the reduction-based mitigation algorithms to restrict the loss for each sensitive feature group in a regression model.
trade-off between raw predictive performance and fairness
fairness is measured by a reduction in the disparity of feature selection (for example, ensuring that an equal proportion of members from each gender group is approved for a bank loan) or by a reduction in the disparity of performance metric (for example, ensuring that a model is equally accurate at identifying repayers and defaulters in each age group).
Exercise
https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/15%20-%20Detect%20Unfairness.ipynb
!pip install --upgrade azureml-sdk azureml-widgets azureml-contrib-fairness
!pip install --upgrade fairlearn==0.5.0
Train a model and derive a sensitive feature by splitting Age into two groups ('50 or younger' and 'Over 50').
# Train a decision tree on the diabetes data, and build a sensitive-feature
# frame S that is split alongside X and y.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# load the diabetes dataset
print("Loading Data...")
data = pd.read_csv('data/diabetes.csv')
# Separate features and labels
features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = data[features].values, data['Diabetic'].values
# Get sensitive features
S = data[['Age']].astype(int)
# Change value to represent age groups
# (strictly greater than 50 -> 'Over 50', otherwise '50 or younger')
S['Age'] = np.where(S.Age > 50, 'Over 50', '50 or younger')
# Split data into training set and test set
# (80/20 split, stratified on the label; S is split with the same rows)
X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(X, y, S, test_size=0.20, random_state=0, stratify=y)
# Train a classification model
print("Training model...")
diabetes_model = DecisionTreeClassifier().fit(X_train, y_train)
print("Model trained.")
use the Fairlearn package to compare its behavior for different sensitive feature values
Use the fairlearn selection_rate function to return the selection rate (percentage of positive predictions) for the overall population.
Use scikit-learn metric functions to calculate overall accuracy, recall, and precision metrics.
Use a MetricFrame to calculate selection rate, accuracy, recall, and precision for each age group in the Age sensitive feature. Note that a mix of fairlearn and scikit-learn metric functions are used to calculate the performance values.
# Print overall selection rate, accuracy, recall and precision for the test
# set, then the same four metrics broken down by age group.
from fairlearn.metrics import selection_rate, MetricFrame
from sklearn.metrics import accuracy_score, recall_score, precision_score
# Get predictions for the withheld test data
y_hat = diabetes_model.predict(X_test)
# Get overall metrics
print("Overall Metrics:")
# Get selection rate from fairlearn
overall_selection_rate = selection_rate(y_test, y_hat)
print("\tSelection Rate:", overall_selection_rate)
# Get standard metrics from scikit-learn
overall_accuracy = accuracy_score(y_test, y_hat)
print("\tAccuracy:", overall_accuracy)
overall_recall = recall_score(y_test, y_hat)
print("\tRecall:", overall_recall)
overall_precision = precision_score(y_test, y_hat)
print("\tPrecision:", overall_precision)
# Get metrics by sensitive group from fairlearn
print('\nMetrics by Group:')
# Metric name -> metric function; MetricFrame evaluates each one per group
metrics = {'selection_rate': selection_rate,
'accuracy': accuracy_score,
'recall': recall_score,
'precision': precision_score}
group_metrics = MetricFrame(metrics,
y_test, y_hat,
sensitive_features=S_test['Age'])
print(group_metrics.by_group)
larger proportion of the older patients are predicted to be diabetic
When the widget is displayed, use the Get started link to start configuring your visualization.
Select the sensitive features you want to compare (in this case, there's only one: Age).
Select the model performance metric you want to compare (in this case, it's a binary classification model so the options are Accuracy, Balanced accuracy, Precision, and Recall). Start with Recall.
View the dashboard visualization, which shows:
Disparity in performance - how the selected performance metric compares for the subpopulations, including underprediction (false negatives) and overprediction (false positives).
Disparity in predictions - A comparison of the number of positive cases per subpopulation.
Edit the configuration to compare the predictions based on different performance metrics.
from fairlearn.widget import FairlearnDashboard
# View this model in Fairlearn's fairness dashboard, and see the disparities which appear:
# y_pred maps a display name for the model to its predictions on the test set.
FairlearnDashboard(sensitive_features=S_test,
sensitive_feature_names=['Age'],
y_true=y_test,
y_pred={"diabetes_model": diabetes_model.predict(X_test)})
exclude the Age feature when training the model
# Retrain without the Age column as an input feature, keeping the same
# sensitive-feature frame S so the two age groups can still be compared.
# Separate features and labels
# (copy first so the original features list is left unchanged)
ageless = features.copy()
ageless.remove('Age')
X2, y2 = data[ageless].values, data['Diabetic'].values
# Split data into training set and test set
X_train2, X_test2, y_train2, y_test2, S_train2, S_test2 = train_test_split(X2, y2, S, test_size=0.20, random_state=0, stratify=y2)
# Train a classification model
print("Training model...")
ageless_model = DecisionTreeClassifier().fit(X_train2, y_train2)
print("Model trained.")
# View this model in Fairlearn's fairness dashboard, and see the disparities which appear:
# (FairlearnDashboard is not imported in this cell; it relies on an earlier import)
FairlearnDashboard(sensitive_features=S_test2,
sensitive_feature_names=['Age'],
y_true=y_test2,
y_pred={"ageless_diabetes_model": ageless_model.predict(X_test2)})
Register the model and upload it to the workspace
# Save diabetes_model to a local pickle file and register that file in the
# workspace; model_id is reused later as the key for its predictions.
from azureml.core import Workspace, Experiment, Model
import joblib
import os
# Load the Azure ML workspace from the saved config file
ws = Workspace.from_config()
print('Ready to work with', ws.name)
# Save the trained model
model_file = 'diabetes_model.pkl'
joblib.dump(value=diabetes_model, filename=model_file)
# Register the model
print('Registering model...')
registered_model = Model.register(model_path=model_file,
model_name='diabetes_classifier',
workspace=ws)
model_id= registered_model.id
print('Model registered.', model_id)
Upload fairlearn metrics
# Build the fairness metrics for the registered model and upload them as a
# dashboard to an interactive run.
from fairlearn.metrics._group_metric_set import _create_group_metric_set
from azureml.contrib.fairness import upload_dashboard_dictionary, download_dashboard_by_upload_id

# Create a dictionary of model(s) you want to assess for fairness
sf = {'Age': S_test.Age}
# Predictions are keyed by the registered model's id
ys_pred = {model_id: diabetes_model.predict(X_test)}
dash_dict = _create_group_metric_set(y_true=y_test,
                                     predictions=ys_pred,
                                     sensitive_features=sf,
                                     prediction_type='binary_classification')

exp = Experiment(ws, 'mslearn-diabetes-fairness')
print(exp)
run = exp.start_logging()

# Upload the dashboard to Azure Machine Learning
# Indentation restored: the upload and download are the `try` body, and the
# run is completed in `finally` even if the upload fails.
try:
    dashboard_title = "Fairness insights of Diabetes Classifier"
    upload_id = upload_dashboard_dictionary(run,
                                            dash_dict,
                                            dashboard_name=dashboard_title)
    print("\nUploaded to id: {0}\n".format(upload_id))
    # To test the dashboard, you can download it
    downloaded_dict = download_dashboard_by_upload_id(run, upload_id)
    print(downloaded_dict)
finally:
    run.complete()
Open the run (for example from the RunDetails widget below) and click its Fairness tab
from azureml.widgets import RunDetails
RunDetails(run).show()
Mitigate unfairness
Use the GridSearch feature, which trains multiple models in an attempt to minimize the disparity of predictive performance for the sensitive features in the dataset (in this case, the age groups). You'll optimize the models by applying the EqualizedOdds parity constraint, which tries to ensure that models exhibit similar true and false positive rates for each sensitive feature grouping.
# Sweep over mitigated models with GridSearch, save every model (plus the
# original unmitigated one) to model_dir, and collect each model's
# test-set predictions keyed by model name.
from fairlearn.reductions import GridSearch, EqualizedOdds
import joblib
import os

print('Finding mitigated models...')
# Train multiple models
sweep = GridSearch(DecisionTreeClassifier(),
                   constraints=EqualizedOdds(),
                   grid_size=20)
sweep.fit(X_train, y_train, sensitive_features=S_train.Age)
models = sweep.predictors_

# Save the models and get predictions from them (plus the original unmitigated one for comparison)
model_dir = 'mitigated_models'
os.makedirs(model_dir, exist_ok=True)
model_name = 'diabetes_unmitigated'
print(model_name)
joblib.dump(value=diabetes_model, filename=os.path.join(model_dir, '{0}.pkl'.format(model_name)))
predictions = {model_name: diabetes_model.predict(X_test)}

# Indentation restored: all four statements belong to the loop body.
# enumerate(start=1) replaces the manual counter, so names still begin at
# diabetes_mitigated_1. NOTE: the loop rebinds the global name `model`.
for i, model in enumerate(models, start=1):
    model_name = 'diabetes_mitigated_{0}'.format(i)
    print(model_name)
    joblib.dump(value=model, filename=os.path.join(model_dir, '{0}.pkl'.format(model_name)))
    predictions[model_name] = model.predict(X_test)
Use the wizard to visualize Age by Recall
# Compare all models at once: predictions maps each model name to its
# predictions on the test set.
FairlearnDashboard(sensitive_features=S_test,
sensitive_feature_names=['Age'],
y_true=y_test,
y_pred=predictions)
Upload dashboard
# Register every saved model, then upload one fairness dashboard that
# compares all of them. Relies on Model, Experiment, RunDetails,
# _create_group_metric_set and upload_dashboard_dictionary imported earlier.

# Register the models
registered_model_predictions = dict()
# Indentation restored: each model file is registered inside the loop, and
# its predictions are re-keyed by the registered model's id.
for model_name, prediction_data in predictions.items():
    model_file = os.path.join(model_dir, model_name + ".pkl")
    registered_model = Model.register(model_path=model_file,
                                      model_name=model_name,
                                      workspace=ws)
    registered_model_predictions[registered_model.id] = prediction_data

# Create a group metric set for binary classification based on the Age feature for all of the models
sf = {'Age': S_test.Age}
dash_dict = _create_group_metric_set(y_true=y_test,
                                     predictions=registered_model_predictions,
                                     sensitive_features=sf,
                                     prediction_type='binary_classification')

exp = Experiment(ws, "mslearn-diabetes-fairness")
print(exp)
run = exp.start_logging()
RunDetails(run).show()

# Upload the dashboard to Azure Machine Learning
# The run is completed in `finally` even if the upload fails.
try:
    dashboard_title = "Fairness Comparison of Diabetes Models"
    upload_id = upload_dashboard_dictionary(run,
                                            dash_dict,
                                            dashboard_name=dashboard_title)
    print("\nUploaded to id: {0}\n".format(upload_id))
finally:
    run.complete()
Monitor models with Azure machine learning
Enable application insight
from azureml.core import Workspace
ws = Workspace.from_config()
ws.get_details()['applicationInsights']
get config
# AciWebservice was used without being imported anywhere in this snippet.
from azureml.core.webservice import AciWebservice

# Deployment configuration with Application Insights enabled
dep_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                memory_gb=1,
                                                enable_app_insights=True)
update
service = ws.webservices['my-svc']
service.update(enable_app_insights=True)
Capture and view telemetry
Log data
import json


def init():
    """Load the model registered as 'my_model' into the module-level `model`."""
    global model
    # Imported locally so run() can be exercised without joblib or the
    # Azure ML SDK installed.
    import joblib
    from azureml.core.model import Model

    model = joblib.load(Model.get_model_path('my_model'))


def run(raw_data):
    """Score a JSON request and log the input and predictions to stdout.

    raw_data is a JSON string whose 'data' entry holds the rows to score.
    Returns the predictions converted to a plain list.
    """
    data = json.loads(raw_data)['data']
    predictions = model.predict(data)
    # Write the request data and predictions to stdout so they can be
    # captured as telemetry.
    log_txt = 'Data:' + str(data) + ' - Predictions:' + str(predictions)
    print(log_txt)
    return predictions.tolist()
Query the log as
// Logged stdout lines for the "my-svc" service.
// The second condition now uses ==, the equality operator, instead of =.
traces
| where message == "STDOUT"
    and customDimensions.["Service Name"] == "my-svc"
| project timestamp, customDimensions.Content
Exercise
https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/16%20-%20Monitor%20a%20Model.ipynb
Monitor data drift with Azure machine learning
# Create a data drift monitor comparing a baseline dataset (train_ds) with a
# target dataset (new_data_ds) on three features, scheduled weekly.
# ws, train_ds and new_data_ds are assumed to be defined by earlier code.
from azureml.datadrift import DataDriftDetector
monitor = DataDriftDetector.create_from_datasets(workspace=ws,
name='dataset-drift-detector',
baseline_data_set=train_ds,
target_data_set=new_data_ds,
compute_target='aml-cluster',
frequency='Week',
feature_list=['age','height', 'bmi'],
latency=24)
Backfill the monitor over a chosen time frame (here, the last six weeks)
import datetime as dt

# Backfill window: from six weeks ago up to now.
window_start = dt.datetime.now() - dt.timedelta(weeks=6)
window_end = dt.datetime.now()
backfill = monitor.backfill(window_start, window_end)
Configure alerts
# Alert recipients; AlertConfiguration takes a list of email addresses.
alert_email = AlertConfiguration(['data_scientists@contoso.com'])
# The alert settings are passed through the alert_config keyword.
monitor = DataDriftDetector.create_from_datasets(ws, 'dataset-drift-detector',
                                                 baseline_data_set, target_data_set,
                                                 compute_target=cpu_cluster,
                                                 frequency='Week', latency=2,
                                                 drift_threshold=.3,
                                                 alert_config=alert_email)
Schedule a data drift monitor to run every week and send an alert if the drift magnitude is greater than 0.3:
# Alert recipients; AlertConfiguration takes a list of email addresses.
alert_email = AlertConfiguration(['data_scientists@contoso.com'])
# Weekly monitor that alerts when drift exceeds the 0.3 threshold.
# The alert settings are passed through the alert_config keyword.
monitor = DataDriftDetector.create_from_datasets(ws, 'dataset-drift-detector',
                                                 baseline_data_set, target_data_set,
                                                 compute_target=cpu_cluster,
                                                 frequency='Week', latency=2,
                                                 drift_threshold=.3,
                                                 alert_config=alert_email)
Exercise
https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/17%20-%20Monitor%20Data%20Drift.ipynb
0X AutoML
https://microsoftlearning.github.io/mslearn-dp100/instructions/02-automated-ml.html
Build a Dataset
Create a new dataset from web files, using the following settings:
Basic Info:
Web URL: https://aka.ms/diabetes-data
Name: diabetes dataset
Dataset type: Tabular
Description: Diabetes data
Settings and preview:
File format: Delimited
Delimiter: Comma
Encoding: UTF-8
Column headers: Use headers from first file
Skip rows: None
Schema:
Include all columns other than Path
Review the automatically detected types
Confirm details:
Do not profile the dataset after creation
After the dataset has been created, open it and view the Explore page to see a sample of the data
In Azure Machine Learning studio, view the Automated ML page (under Author).
Create a new Automated ML run with the following settings:
Select dataset:
Dataset: diabetes dataset
Configure run:
New experiment name: mslearn-automl-diabetes
Target column: Diabetic (this is the label the model will be trained to predict)
Select compute cluster: the compute cluster you created previously
Task type and settings:
Task type: Classification
Additional configuration settings:
Primary metric: Select AUC_Weighted (more about this metric later!)
Explain best model: Selected - this option causes automated machine learning to calculate feature importance for the best model; making it possible to determine the influence of each feature on the predicted label.
Blocked algorithms: Leave all algorithms selected
Exit criterion:
Training job time (hours): 0.25 - this causes the experiment to end after a maximum of 15 minutes.
Metric score threshold: 0.90 - this causes the experiment to end if a model achieves a weighted AUC metric of 90% or higher.
Featurization settings:
Enable featurization: Selected - this causes Azure Machine Learning to automatically preprocess the features before training.
Review the best model
On the Details tab of the automated machine learning run, note the best model summary.
Select the Algorithm name for the best model to view the child-run that produced it.
Next to the AUC_Weighted value, select View all other metrics to see values of other possible evaluation metrics for a classification model.
Select the Metrics tab and review the performance metrics you can view for the model. These include a confusion_matrix visualization showing the confusion matrix for the validated model, and an accuracy_table visualization that includes the ROC chart.
Select the Explanations tab, and view the Global Importance chart. This shows the extent to which each feature in the dataset influences the label prediction.
Deploy a predictive service
Note: In Azure Machine Learning, you can deploy a service to Azure Container Instances (ACI) or to an Azure Kubernetes Service (AKS) cluster. AKS is preferred, but ACI is OK for testing.
Select the Details tab for the run that produced the best model.
Use the Deploy button to deploy the model with the following settings:
Name: auto-predict-diabetes
Description: Predict diabetes
Compute type: ACI
Enable authentication: Selected
Wait for the deployment to start - this may take a few seconds. Then, on the Model tab, in the Model summary section, observe the Deploy status for the auto-predict-diabetes service, which should be Running. Wait for this status to change to Successful. You may need to select ↻ Refresh periodically.
In Azure Machine Learning studio, view the Endpoints page (under Assets) and select the auto-predict-diabetes real-time endpoint. Then select the Consume tab and note the following information there.
Test the deployed service
With the Consume page for the auto-predict-diabetes service open in your browser, open a new browser tab and open a second instance of Azure Machine Learning studio. Then in the new tab, view the Notebooks page.
In the Notebooks page, under My files, browse to the Users/mslearn-dp100 folder where you cloned the notebook repository, and open the Get AutoML Prediction notebook.
When the notebook has opened, ensure that the compute instance you created previously is selected in the Compute box, and that it has a status of Running.
In the notebook, replace the ENDPOINT and PRIMARY_KEY placeholders with the values for your service, which you can copy from the Consume tab on the page for your endpoint.