Build AI solutions with Azure Machine Learning

Go to https://ml.azure.com/

Create a Compute instance (Standard D12 v2)

Notebooks -> Terminal

cd Users

git clone https://github.com/MicrosoftLearning/mslearn-dp100

Refresh the file pane to see the folder

01 Getting started

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/01%20-%20Get%20Started%20with%20Notebooks.ipynb

The workspace config file (config.json) is in the root directory

from azureml.core import Workspace


ws = Workspace.from_config()

See compute targets

from azureml.core import ComputeTarget


print("Compute Resources:")

for compute_name in ws.compute_targets:
    compute = ws.compute_targets[compute_name]
    print("\t", compute.name, ':', compute.type)

Write a training script

from azureml.core import Run

import pandas as pd

import numpy as np

import joblib

import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression


# Get the experiment run context

run = Run.get_context()


# Prepare the dataset

data = pd.read_csv('data.csv')

X, y = data[['Feature1','Feature2','Feature3']].values, data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


# Train a logistic regression model

reg = 0.1

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

run.log('Accuracy', np.float(acc))


# Save the trained model

os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/model.pkl')


run.complete()

Run the script

from azureml.core import Experiment, ScriptRunConfig, Environment

from azureml.core.conda_dependencies import CondaDependencies


# Create a Python environment for the experiment

sklearn_env = Environment("sklearn-env")


# Ensure the required packages are installed

packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],

pip_packages=['azureml-defaults'])

sklearn_env.python.conda_dependencies = packages


# Create a script config

script_config = ScriptRunConfig(source_directory='training_folder',

script='training.py',

environment=sklearn_env)


# Submit the experiment

experiment = Experiment(workspace=ws, name='training-experiment')

run = experiment.submit(config=script_config)

run.wait_for_completion()

For hyperparameters, read them from script arguments

from azureml.core import Run

import argparse

import pandas as pd

import numpy as np

import joblib

import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression


# Get the experiment run context

run = Run.get_context()


# Set regularization hyperparameter

parser = argparse.ArgumentParser()

parser.add_argument('--reg-rate', type=float, dest='reg_rate', default=0.01)

args = parser.parse_args()

reg = args.reg_rate


# Prepare the dataset

data = pd.read_csv('data.csv')

X, y = data[['Feature1','Feature2','Feature3']].values, data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


# Train a logistic regression model

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

run.log('Accuracy', np.float(acc))


# Save the trained model

os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()

Passing arguments into a script

# Create a script config

script_config = ScriptRunConfig(source_directory='training_folder',

script='training.py',

arguments = ['--reg-rate', 0.1],

environment=sklearn_env)

Register models

You can download model files from a run to the local file system

# "run" is a reference to a completed experiment run


# List the files generated by the experiment

for file in run.get_file_names():
    print(file)


# Download a named file

run.download_file(name='outputs/model.pkl', output_file_path='model.pkl')
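To pull down everything the run produced (for example the entire outputs folder), run.download_files is a handy alternative — a minimal sketch, with the local path being an assumption:

# Download all files under the run's outputs folder to a local directory
run.download_files(prefix='outputs', output_directory='./local_outputs')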

Register

from azureml.core import Model


model = Model.register(workspace=ws,

model_name='classification_model',

model_path='model.pkl', # local path

description='A classification model',

tags={'data-format': 'CSV'},

model_framework=Model.Framework.SCIKITLEARN,

model_framework_version='0.20.3')

or

run.register_model( model_name='classification_model',

model_path='outputs/model.pkl', # run outputs path

description='A classification model',

tags={'data-format': 'CSV'},

model_framework=Model.Framework.SCIKITLEARN,

model_framework_version='0.20.3')

See registered models

from azureml.core import Model


for model in Model.list(ws):
    # Get model name and auto-generated version
    print(model.name, 'version:', model.version)

Go through https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/05%20-%20Train%20Models.ipynb

!pip install --upgrade azureml-sdk azureml-widgets


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))


import os, shutil


# Create a folder for the experiment files

training_folder = 'diabetes-training'

os.makedirs(training_folder, exist_ok=True)


# Copy the data file into the experiment folder

shutil.copy('data/diabetes.csv', os.path.join(training_folder, "diabetes.csv"))

Write training script

%%writefile $training_folder/diabetes_training.py

# Import libraries

from azureml.core import Run

import pandas as pd

import numpy as np

import joblib

import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Get the experiment run context

run = Run.get_context()


# load the diabetes dataset

print("Loading Data...")

diabetes = pd.read_csv('diabetes.csv')


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Set regularization hyperparameter

reg = 0.01


# Train a logistic regression model

print('Training a logistic regression model with regularization rate of', reg)

run.log('Regularization Rate', np.float(reg))

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


# Save the trained model in the outputs folder

os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/diabetes_model.pkl')


run.complete()

Run the script

from azureml.core import Experiment, ScriptRunConfig, Environment

from azureml.core.conda_dependencies import CondaDependencies

from azureml.widgets import RunDetails


# Create a Python environment for the experiment

sklearn_env = Environment("sklearn-env")


# Ensure the required packages are installed (we need scikit-learn and Azure ML defaults)

packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults'])

sklearn_env.python.conda_dependencies = packages


# Create a script config

script_config = ScriptRunConfig(source_directory=training_folder,

script='diabetes_training.py',

environment=sklearn_env)


# submit the experiment run

experiment_name = 'mslearn-train-diabetes'

experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)


# Show the running experiment run in the notebook widget

RunDetails(run).show()


# Block until the experiment run has completed

run.wait_for_completion()

Retrieve metrics and outputs

# Get logged metrics and files

metrics = run.get_metrics()

for key in metrics.keys():
    print(key, metrics.get(key))

print('\n')

for file in run.get_file_names():
    print(file)

Register the model

from azureml.core import Model


# Register the model

run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Script'},

properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


# List registered models

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t', tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t', prop_name, ':', prop)
    print('\n')

Create a folder for the parameterized script and training data

import os, shutil


# Create a folder for the experiment files

training_folder = 'diabetes-training-params'

os.makedirs(training_folder, exist_ok=True)


# Copy the data file into the experiment folder

shutil.copy('data/diabetes.csv', os.path.join(training_folder, "diabetes.csv"))

Write the script

%%writefile $training_folder/diabetes_training.py

# Import libraries

from azureml.core import Run

import pandas as pd

import numpy as np

import joblib

import os

import argparse

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Get the experiment run context

run = Run.get_context()


# Set regularization hyperparameter

parser = argparse.ArgumentParser()

parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)

args = parser.parse_args()

reg = args.reg


# load the diabetes dataset

print("Loading Data...")


diabetes = pd.read_csv('diabetes.csv')


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a logistic regression model

print('Training a logistic regression model with regularization rate of', reg)

run.log('Regularization Rate', np.float(reg))

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/diabetes_model.pkl')


run.complete()

Run the script with arguments

# Create a script config

script_config = ScriptRunConfig(source_directory=training_folder,

script='diabetes_training.py',

arguments = ['--reg_rate', 0.1],

environment=sklearn_env)


# submit the experiment

experiment_name = 'mslearn-train-diabetes'

experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)

RunDetails(run).show()

run.wait_for_completion()

Get output

# Get logged metrics

metrics = run.get_metrics()

for key in metrics.keys():
    print(key, metrics.get(key))

print('\n')

for file in run.get_file_names():
    print(file)

Register new version

from azureml.core import Model


# Register the model

run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Parameterized script'},

properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


# List registered models

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t', tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t', prop_name, ':', prop)
    print('\n')

You can see the Models in the Models section under Assets

Work with Data in Azure Machine Learning

Mount a blob

from azureml.core import Workspace, Datastore


ws = Workspace.from_config()


# Register a new datastore

blob_ds = Datastore.register_azure_blob_container(workspace=ws,

datastore_name='blob_data',

container_name='data_container',

account_name='az_store_acct',

account_key='123456abcde789…')

View data stores

for ds_name in ws.datastores:
    print(ds_name)

Get a reference using get

blob_store = Datastore.get(ws, datastore_name='blob_data')

Every workspace has a default datastore (the built-in workspaceblobstore)

default_store = ws.get_default_datastore()

Change which datastore is the default

ws.set_default_datastore('blob_data')

Create and register tabular datasets

from azureml.core import Dataset


blob_ds = ws.get_default_datastore()

csv_paths = [(blob_ds, 'data/files/current_data.csv'),

(blob_ds, 'data/files/archive/*.csv')]

tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)

tab_ds = tab_ds.register(workspace=ws, name='csv_table')

File datasets

from azureml.core import Dataset


blob_ds = ws.get_default_datastore()

file_ds = Dataset.File.from_files(path=(blob_ds, 'data/files/images/*.jpg'))

file_ds = file_ds.register(workspace=ws, name='img_files')

Get dataset

import azureml.core

from azureml.core import Workspace, Dataset


# Load the workspace from the saved config file

ws = Workspace.from_config()


# Get a dataset from the workspace datasets collection

ds1 = ws.datasets['csv_table']


# Get a dataset by name from the datasets class

ds2 = Dataset.get_by_name(ws, 'img_files')

Version a dataset by adding create_new_version=True

img_paths = [(blob_ds, 'data/files/images/*.jpg'),

(blob_ds, 'data/files/images/*.png')]

file_ds = Dataset.File.from_files(path=img_paths)

file_ds = file_ds.register(workspace=ws, name='img_files', create_new_version=True)

Retrieve a specific version

img_ds = Dataset.get_by_name(workspace=ws, name='img_files', version=2)

Convert to dataframe

df = tab_ds.to_pandas_dataframe()

# code to work with dataframe goes here, for example:

print(df.head())

Pass a dataset to a script as a script argument (here --ds)

env = Environment('my_env')

packages = CondaDependencies.create(conda_packages=['pip'],

pip_packages=['azureml-defaults',

'azureml-dataprep[pandas]'])

env.python.conda_dependencies = packages


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

arguments=['--ds', tab_ds],

environment=env)

from azureml.core import Run, Dataset

import argparse


parser = argparse.ArgumentParser()

parser.add_argument('--ds', type=str, dest='dataset_id')

args = parser.parse_args()


run = Run.get_context()

ws = run.experiment.workspace

dataset = Dataset.get_by_id(ws, id=args.dataset_id)

data = dataset.to_pandas_dataframe()

Use a named input

env = Environment('my_env')

packages = CondaDependencies.create(conda_packages=['pip'],

pip_packages=['azureml-defaults',

'azureml-dataprep[pandas]'])

env.python.conda_dependencies = packages


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

arguments=['--ds', tab_ds.as_named_input('my_dataset')],

environment=env)

from azureml.core import Run

import argparse


parser = argparse.ArgumentParser()

parser.add_argument('--ds', type=str, dest='ds_id')

args = parser.parse_args()


run = Run.get_context()

dataset = run.input_datasets['my_dataset']

data = dataset.to_pandas_dataframe()

Pass a file dataset. Use as_download when the data fits on the compute's local disk, or as_mount to stream very large datasets

env = Environment('my_env')

packages = CondaDependencies.create(conda_packages=['pip'],

pip_packages=['azureml-defaults',

'azureml-dataprep[pandas]'])

env.python.conda_dependencies = packages


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

arguments=['--ds', file_ds.as_download()],

environment=env)

from azureml.core import Run

import argparse

import glob


parser = argparse.ArgumentParser()

parser.add_argument('--ds', type=str, dest='ds_ref')

args = parser.parse_args()

run = Run.get_context()


imgs = glob.glob(args.ds_ref + "/*.jpg")
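For datasets too large to copy to the compute, mounting streams the files on demand instead — the same script config with only the argument changed (a sketch under the same assumptions):

script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                # as_mount streams files instead of downloading them
                                arguments=['--ds', file_ds.as_mount()],
                                environment=env)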

Use named input

env = Environment('my_env')

packages = CondaDependencies.create(conda_packages=['pip'],

pip_packages=['azureml-defaults',

'azureml-dataprep[pandas]'])

env.python.conda_dependencies = packages


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

arguments=['--ds', file_ds.as_named_input('my_ds').as_download()],

environment=env)

from azureml.core import Run

import argparse

import glob


parser = argparse.ArgumentParser()

parser.add_argument('--ds', type=str, dest='ds_ref')

args = parser.parse_args()

run = Run.get_context()


dataset = run.input_datasets['my_ds']

imgs = glob.glob(dataset + "/*.jpg")

Go through https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/06%20-%20Work%20with%20Data.ipynb

!pip install --upgrade azureml-sdk azureml-widgets


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

View datastores

# Get the default datastore

default_ds = ws.get_default_datastore()


# Enumerate all datastores, indicating which is the default

for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

Upload data to datastore

default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data

target_path='diabetes-data/', # Put it in a folder path in the datastore

overwrite=True, # Replace existing files of the same name

show_progress=True)

Create a tabular dataset

from azureml.core import Dataset


# Get the default datastore

default_ds = ws.get_default_datastore()


#Create a tabular dataset from the path on the datastore (this may take a short while)

tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))


# Display the first 20 rows as a Pandas dataframe

tab_data_set.take(20).to_pandas_dataframe()

File dataset

#Create a file dataset from the path on the datastore (this may take a short while)

file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes-data/*.csv'))


# Get the files in the dataset

for file_path in file_data_set.to_path():
    print(file_path)

Register the tabular dataset and the file dataset

# Register the tabular dataset

try:
    tab_data_set = tab_data_set.register(workspace=ws,
                                         name='diabetes dataset',
                                         description='diabetes data',
                                         tags = {'format':'CSV'},
                                         create_new_version=True)
except Exception as ex:
    print(ex)


# Register the file dataset

try:
    file_data_set = file_data_set.register(workspace=ws,
                                           name='diabetes file dataset',
                                           description='diabetes files',
                                           tags = {'format':'CSV'},
                                           create_new_version=True)
except Exception as ex:
    print(ex)


print('Datasets registered')

View the registered datasets

print("Datasets:")

for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

Grab a specific version

dataset_v1 = Dataset.get_by_name(ws, 'diabetes dataset', version = 1)

Train a model from a tabular dataset

import os


# Create a folder for the experiment files

experiment_folder = 'diabetes_training_from_tab_dataset'

os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder, 'folder created')


%%writefile $experiment_folder/diabetes_training.py

# Import libraries

import os

import argparse

from azureml.core import Run, Dataset

import pandas as pd

import numpy as np

import joblib

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Get the script arguments (regularization rate and training dataset ID)

parser = argparse.ArgumentParser()

parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')

parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')

args = parser.parse_args()


# Set regularization hyperparameter (passed as an argument to the script)

reg = args.reg_rate


# Get the experiment run context

run = Run.get_context()


# Get the training dataset

print("Loading Data...")

diabetes = run.input_datasets['training_data'].to_pandas_dataframe()


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a logistic regression model

print('Training a logistic regression model with regularization rate of', reg)

run.log('Regularization Rate', np.float(reg))

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


os.makedirs('outputs', exist_ok=True)

# note file saved in the outputs folder is automatically uploaded into experiment record

joblib.dump(value=model, filename='outputs/diabetes_model.pkl')


run.complete()


from azureml.core import Experiment, ScriptRunConfig, Environment

from azureml.core.conda_dependencies import CondaDependencies

from azureml.widgets import RunDetails



# Create a Python environment for the experiment

sklearn_env = Environment("sklearn-env")


# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)

packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],

pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])

sklearn_env.python.conda_dependencies = packages


# Get the training dataset

diabetes_ds = ws.datasets.get("diabetes dataset")


# Create a script config

script_config = ScriptRunConfig(source_directory=experiment_folder,

script='diabetes_training.py',

arguments = ['--regularization', 0.1, # Regularization rate parameter

'--input-data', diabetes_ds.as_named_input('training_data')], # Reference to dataset

environment=sklearn_env)


# submit the experiment

experiment_name = 'mslearn-train-diabetes'

experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)

RunDetails(run).show()

run.wait_for_completion()

Register the trained model

from azureml.core import Model


run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Tabular dataset'}, properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t', tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t', prop_name, ':', prop)
    print('\n')

Train from a file dataset

import os


# Create a folder for the experiment files

experiment_folder = 'diabetes_training_from_file_dataset'

os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder, 'folder created')

%%writefile $experiment_folder/diabetes_training.py

# Import libraries

import os

import argparse

from azureml.core import Dataset, Run

import pandas as pd

import numpy as np

import joblib

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve

import glob


# Get script arguments (regularization rate and file dataset mount point)

parser = argparse.ArgumentParser()

parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')

parser.add_argument('--input-data', type=str, dest='dataset_folder', help='data mount point')

args = parser.parse_args()


# Set regularization hyperparameter (passed as an argument to the script)

reg = args.reg_rate


# Get the experiment run context

run = Run.get_context()


# load the diabetes dataset

print("Loading Data...")

data_path = run.input_datasets['training_files'] # Get the training data path from the input

# (You could also just use args.dataset_folder if you don't want to rely on a hard-coded friendly name)


# Read the files

all_files = glob.glob(data_path + "/*.csv")

diabetes = pd.concat((pd.read_csv(f) for f in all_files), sort=False)


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a logistic regression model

print('Training a logistic regression model with regularization rate of', reg)

run.log('Regularization Rate', np.float(reg))

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


os.makedirs('outputs', exist_ok=True)

# note file saved in the outputs folder is automatically uploaded into experiment record

joblib.dump(value=model, filename='outputs/diabetes_model.pkl')


run.complete()


from azureml.core import Experiment

from azureml.widgets import RunDetails



# Get the training dataset

diabetes_ds = ws.datasets.get("diabetes file dataset")


# Create a script config

script_config = ScriptRunConfig(source_directory=experiment_folder,

script='diabetes_training.py',

arguments = ['--regularization', 0.1, # Regularization rate parameter

'--input-data', diabetes_ds.as_named_input('training_files').as_download()], # Reference to dataset location

environment=sklearn_env) # Use the environment created previously


# submit the experiment

experiment_name = 'mslearn-train-diabetes'

experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)

RunDetails(run).show()

run.wait_for_completion()

Register the model

from azureml.core import Model


run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'File dataset'}, properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t', tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t', prop_name, ':', prop)
    print('\n')

https://microsoftlearning.github.io/mslearn-dp100/instructions/01-create-a-workspace.html

Work with Compute in Azure Machine Learning

Creating environment files

Create an environment from a specification file

e.g. a conda specification file (conda.yml):

name: py_env

dependencies:

- numpy

- pandas

- scikit-learn

- pip:

  - azureml-defaults

then do

from azureml.core import Environment


env = Environment.from_conda_specification(name='training_environment',

file_path='./conda.yml')

Can create from an existing conda environment

from azureml.core import Environment


env = Environment.from_existing_conda_environment(name='training_environment',

conda_environment_name='py_env')

Create by specifying packages

from azureml.core import Environment

from azureml.core.conda_dependencies import CondaDependencies


env = Environment('training_environment')

deps = CondaDependencies.create(conda_packages=['scikit-learn','pandas','numpy'],

pip_packages=['azureml-defaults'])

env.python.conda_dependencies = deps

Run environments in Docker containers

env.docker.enabled = True

deps = CondaDependencies.create(conda_packages=['scikit-learn','pandas','pip'],

pip_packages=['azureml-defaults'])

env.python.conda_dependencies = deps

Can use your own base image

env.docker.base_image='my-base-image'

env.docker.base_image_registry='myregistry.azurecr.io/myimage'

or

env.docker.base_image = None

env.docker.base_dockerfile = './Dockerfile'

You can manage the packages yourself and override the Python interpreter

env.python.user_managed_dependencies=True

env.python.interpreter_path = '/opt/miniconda/bin/python'

Register your env

env.register(workspace=ws)

See registered envs

from azureml.core import Environment


env_names = Environment.list(workspace=ws)

for env_name in env_names:
    print('Name:', env_name)

Get the environment and use it for a training script run

from azureml.core import Environment

from azureml.train.estimator import Estimator


training_env = Environment.get(workspace=ws, name='training_environment')

estimator = Estimator(source_directory='experiment_folder',

entry_script='training_script.py',

compute_target='local',

environment_definition=training_env)
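Estimators are deprecated in later versions of the SDK; the same run can be expressed with ScriptRunConfig, which these notes use elsewhere — a sketch with the environment retrieved above:

from azureml.core import ScriptRunConfig

# Equivalent run configuration using ScriptRunConfig
script_config = ScriptRunConfig(source_directory='experiment_folder',
                                script='training_script.py',
                                compute_target='local',
                                environment=training_env)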

Create compute targets

from azureml.core import Workspace

from azureml.core.compute import ComputeTarget, AmlCompute


# Load the workspace from the saved config file

ws = Workspace.from_config()


# Specify a name for the compute (unique within the workspace)

compute_name = 'aml-cluster'


# Define compute configuration

compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2',

min_nodes=0, max_nodes=4,

vm_priority='dedicated') # or 'lowpriority'


# Create the compute

aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)

aml_cluster.wait_for_completion(show_output=True)

https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.amlcompute.amlcompute?view=azure-ml-py

An unmanaged compute target is one that is defined and managed outside of the Azure Machine Learning workspace; for example, an Azure virtual machine or an Azure Databricks cluster.

Use the ComputeTarget.attach() method to attach the existing compute based on its target-specific configuration settings.

e.g. connect to a databricks clusters

from azureml.core import Workspace

from azureml.core.compute import ComputeTarget, DatabricksCompute


# Load the workspace from the saved config file

ws = Workspace.from_config()


# Specify a name for the compute (unique within the workspace)

compute_name = 'db_cluster'


# Define configuration for existing Azure Databricks cluster

db_workspace_name = 'db_workspace'

db_resource_group = 'db_resource_group'

db_access_token = '1234-abc-5678-defg-90...'

db_config = DatabricksCompute.attach_configuration(resource_group=db_resource_group,

workspace_name=db_workspace_name,

access_token=db_access_token)


# Create the compute

databricks_compute = ComputeTarget.attach(ws, compute_name, db_config)

databricks_compute.wait_for_completion(True)

If the compute target doesn't exist, create it.

from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


compute_name = "aml-cluster"


# Check if the compute target exists

try:
    aml_cluster = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If not, create it
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2',
                                                           max_nodes=4)
    aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)

aml_cluster.wait_for_completion(show_output=True)

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-set-up-training-targets


Use compute targets

To use a particular compute target, you can specify it in the appropriate parameter for an experiment run configuration or estimator

from azureml.core import Environment, ScriptRunConfig


compute_name = 'aml-cluster'


training_env = Environment.get(workspace=ws, name='training_environment')


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

environment=training_env,

compute_target=compute_name)

Instead of specifying the name of the compute target, you can specify a ComputeTarget object, like this:

from azureml.core import Environment, ScriptRunConfig

from azureml.core.compute import ComputeTarget


compute_name = "aml-cluster"


training_cluster = ComputeTarget(workspace=ws, name=compute_name)


training_env = Environment.get(workspace=ws, name='training_environment')


script_config = ScriptRunConfig(source_directory='my_dir',

script='script.py',

environment=training_env,

compute_target=training_cluster)

Work with Compute Contexts

https://microsoftlearning.github.io/mslearn-dp100/instructions/01-create-a-workspace.html

Orchestrate Machine Learning Pipelines

https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps?view=azure-ml-py

Two scripts

from azureml.pipeline.steps import PythonScriptStep


# Step to run a Python script

step1 = PythonScriptStep(name = 'prepare data',

source_directory = 'scripts',

script_name = 'data_prep.py',

compute_target = 'aml-cluster')


# Step to train a model

step2 = PythonScriptStep(name = 'train model',

source_directory = 'scripts',

script_name = 'train_model.py',

compute_target = 'aml-cluster')


from azureml.pipeline.core import Pipeline

from azureml.core import Experiment


# Construct the pipeline

train_pipeline = Pipeline(workspace = ws, steps = [step1,step2])


# Create an experiment and run the pipeline

experiment = Experiment(workspace = ws, name = 'training-pipeline')

pipeline_run = experiment.submit(train_pipeline)

PipelineData passes data between steps

from azureml.pipeline.core import PipelineData

from azureml.pipeline.steps import PythonScriptStep

from azureml.core import Dataset


# Get a dataset for the initial data

raw_ds = Dataset.get_by_name(ws, 'raw_dataset')


# Define a PipelineData object to pass data between steps

data_store = ws.get_default_datastore()

prepped_data = PipelineData('prepped', datastore=data_store)


# Step to run a Python script

step1 = PythonScriptStep(name = 'prepare data',

source_directory = 'scripts',

script_name = 'data_prep.py',

compute_target = 'aml-cluster',

# Script arguments include PipelineData

arguments = ['--raw-ds', raw_ds.as_named_input('raw_data'),

'--out_folder', prepped_data],

# Specify PipelineData as output

outputs=[prepped_data])


# Step to train a model

step2 = PythonScriptStep(name = 'train model',

source_directory = 'scripts',

script_name = 'train_model.py',

compute_target = 'aml-cluster',

# Pass as script argument

arguments=['--in_folder', prepped_data],

# Specify PipelineData as input

inputs=[prepped_data])


# code in data_prep.py

from azureml.core import Run

import argparse

import os


# Get the experiment run context

run = Run.get_context()


# Get arguments

parser = argparse.ArgumentParser()

parser.add_argument('--raw-ds', type=str, dest='raw_dataset_id')

parser.add_argument('--out_folder', type=str, dest='folder')

args = parser.parse_args()

output_folder = args.folder


# Get input dataset as dataframe

raw_df = run.input_datasets['raw_data'].to_pandas_dataframe()


# code to prep data (in this case, just select specific columns)

prepped_df = raw_df[['col1', 'col2', 'col3']]


# Save prepped data to the PipelineData location

os.makedirs(output_folder, exist_ok=True)

output_path = os.path.join(output_folder, 'prepped_data.csv')

prepped_df.to_csv(output_path)

Control caching with the allow_reuse parameter

step1 = PythonScriptStep(name = 'prepare data',

source_directory = 'scripts',

script_name = 'data_prep.py',

compute_target = 'aml-cluster',

runconfig = run_config,

inputs=[raw_ds.as_named_input('raw_data')],

outputs=[prepped_data],

arguments = ['--folder', prepped_data],

# Disable step reuse

allow_reuse = False)

Force all to run

pipeline_run = experiment.submit(train_pipeline, regenerate_outputs=True)

Publish pipeline

published_pipeline = train_pipeline.publish(name='training_pipeline',

description='Model training pipeline',

version='1.0')

or call publish method on a successful run

# Get the most recent run of the pipeline

pipeline_experiment = ws.experiments.get('training-pipeline')

run = list(pipeline_experiment.get_runs())[0]


# Publish the pipeline from the run

published_pipeline = run.publish_pipeline(name='training_pipeline',

description='Model training pipeline',

version='1.0')

See URI

rest_endpoint = published_pipeline.endpoint

print(rest_endpoint)

Trigger the pipeline via the REST endpoint and display the run ID

import requests


response = requests.post(rest_endpoint,

headers=auth_header,

json={"ExperimentName": "run_training_pipeline"})

run_id = response.json()["Id"]

print(run_id)
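The auth_header above is assumed to come from your Azure ML authentication — a minimal sketch using interactive login (the same pattern appears in the exercise below):

from azureml.core.authentication import InteractiveLoginAuthentication

# Get an authorization header for the REST call
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()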

Define parameters in a pipeline

from azureml.pipeline.core.graph import PipelineParameter


reg_param = PipelineParameter(name='reg_rate', default_value=0.01)


...


step2 = PythonScriptStep(name = 'train model',

source_directory = 'scripts',

script_name = 'train_model.py',

compute_target = 'aml-cluster',

# Pass parameter as script argument

arguments=['--in_folder', prepped_data,

'--reg', reg_param],

inputs=[prepped_data])

After publishing, you can run the pipeline with a parameter value

response = requests.post(rest_endpoint,

headers=auth_header,

json={"ExperimentName": "run_training_pipeline",

"ParameterAssignments": {"reg_rate": 0.1}})

Run the pipeline daily

from azureml.pipeline.core import ScheduleRecurrence, Schedule


daily = ScheduleRecurrence(frequency='Day', interval=1)

pipeline_schedule = Schedule.create(ws, name='Daily Training',

description='trains model every day',

pipeline_id=published_pipeline.id,

experiment_name='Training_Pipeline',

recurrence=daily)

Trigger on data changes

from azureml.core import Datastore

from azureml.pipeline.core import Schedule


training_datastore = Datastore(workspace=ws, name='blob_data')

pipeline_schedule = Schedule.create(ws, name='Reactive Training',

description='trains model on data change',

pipeline_id=published_pipeline.id,

experiment_name='Training_Pipeline',

datastore=training_datastore,

path_on_datastore='data/training')

Exercise: Create a pipeline

https://microsoftlearning.github.io/mslearn-dp100/instructions/08-create-a-pipeline.html

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/08%20-%20Create%20a%20Pipeline.ipynb

!pip install --upgrade azureml-sdk azureml-widgets


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Prepare data

from azureml.core import Dataset


default_ds = ws.get_default_datastore()


if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
                            target_path='diabetes-data/', # Put it in a folder path in the datastore
                            overwrite=True, # Replace existing files of the same name
                            show_progress=True)

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='diabetes dataset',
                                             description='diabetes data',
                                             tags = {'format':'CSV'},
                                             create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Create folder

import os

# Create a folder for the pipeline step files

experiment_folder = 'diabetes_pipeline'

os.makedirs(experiment_folder, exist_ok=True)


print(experiment_folder)

Create the first script, which will read data from the diabetes dataset and apply some simple pre-processing to remove any rows with missing data and normalize the numeric features so they're on a similar scale.

The script includes an argument named --prepped-data, which references the folder where the resulting data should be saved.

%%writefile $experiment_folder/prep_diabetes.py

# Import libraries

import os

import argparse

import pandas as pd

from azureml.core import Run

from sklearn.preprocessing import MinMaxScaler


# Get parameters

parser = argparse.ArgumentParser()

parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')

parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')

args = parser.parse_args()

save_folder = args.prepped_data


# Get the experiment run context

run = Run.get_context()


# load the data (passed as an input dataset)

print("Loading Data...")

diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()


# Log raw row count

row_count = (len(diabetes))

run.log('raw_rows', row_count)


# remove nulls

diabetes = diabetes.dropna()


# Normalize the numeric columns

scaler = MinMaxScaler()

num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']

diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])


# Log processed rows

row_count = (len(diabetes))

run.log('processed_rows', row_count)


# Save the prepped data

print("Saving Data...")

os.makedirs(save_folder, exist_ok=True)

save_path = os.path.join(save_folder,'data.csv')

diabetes.to_csv(save_path, index=False, header=True)


# End the run

run.complete()

Create the script for the second step, which will train a model. The script includes an argument named --training-folder, which references the folder where the prepared data was saved by the previous step.

%%writefile $experiment_folder/train_diabetes.py

# Import libraries

from azureml.core import Run, Model

import argparse

import pandas as pd

import numpy as np

import joblib

import os

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve

import matplotlib.pyplot as plt


# Get parameters

parser = argparse.ArgumentParser()

parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')

args = parser.parse_args()

training_folder = args.training_folder


# Get the experiment run context

run = Run.get_context()


# load the prepared data file in the training folder

print("Loading Data...")

file_path = os.path.join(training_folder,'data.csv')

diabetes = pd.read_csv(file_path)


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a decision tree model

print('Training a decision tree model...')

model = DecisionTreeClassifier().fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


# plot ROC curve

fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])

fig = plt.figure(figsize=(6, 4))

# Plot the diagonal 50% line

plt.plot([0, 1], [0, 1], 'k--')

# Plot the FPR and TPR achieved by our model

plt.plot(fpr, tpr)

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive Rate')

plt.title('ROC Curve')

run.log_image(name = "ROC", plot = fig)

plt.show()


# Save the trained model in the outputs folder

print("Saving model...")

os.makedirs('outputs', exist_ok=True)

model_file = os.path.join('outputs', 'diabetes_model.pkl')

joblib.dump(value=model, filename=model_file)


# Register the model

print('Registering model...')

Model.register(workspace=run.experiment.workspace,

model_path = model_file,

model_name = 'diabetes_model',

tags={'Training context':'Pipeline'},

properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})



run.complete()

Use the same compute for both steps

from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


cluster_name = "compute-ray"


try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Create env

from azureml.core import Environment

from azureml.core.conda_dependencies import CondaDependencies

from azureml.core.runconfig import RunConfiguration


# Create a Python environment for the experiment

diabetes_env = Environment("diabetes-pipeline-env")

diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies

diabetes_env.docker.enabled = True # Use a docker container


# Create a set of package dependencies

diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],

pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])


# Add the dependencies to the environment

diabetes_env.python.conda_dependencies = diabetes_packages


# Register the environment

diabetes_env.register(workspace=ws)

registered_env = Environment.get(ws, 'diabetes-pipeline-env')


# Create a new runconfig object for the pipeline

pipeline_run_config = RunConfiguration()


# Use the compute you created above.

pipeline_run_config.target = pipeline_cluster


# Assign the environment to the run configuration

pipeline_run_config.environment = registered_env


print ("Run configuration created.")

Create and run pipeline

from azureml.pipeline.core import PipelineData

from azureml.pipeline.steps import PythonScriptStep


# Get the training dataset

diabetes_ds = ws.datasets.get("diabetes dataset")


# Create a PipelineData (temporary Data Reference) for the prepped data folder

prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())


# Step 1, Run the data prep script

train_step = PythonScriptStep(name = "Prepare Data",

source_directory = experiment_folder,

script_name = "prep_diabetes.py",

arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),

'--prepped-data', prepped_data_folder],

outputs=[prepped_data_folder],

compute_target = pipeline_cluster,

runconfig = pipeline_run_config,

allow_reuse = True)


# Step 2, run the training script

register_step = PythonScriptStep(name = "Train and Register Model",

source_directory = experiment_folder,

script_name = "train_diabetes.py",

arguments = ['--training-folder', prepped_data_folder],

inputs=[prepped_data_folder],

compute_target = pipeline_cluster,

runconfig = pipeline_run_config,

allow_reuse = True)


print("Pipeline steps defined")

Build the pipeline and run as an experiment

from azureml.core import Experiment

from azureml.pipeline.core import Pipeline

from azureml.widgets import RunDetails


# Construct the pipeline

pipeline_steps = [train_step, register_step]

pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

print("Pipeline is built.")


# Create an experiment and run the pipeline

experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline')

pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)

print("Pipeline submitted for execution.")

RunDetails(pipeline_run).show()

pipeline_run.wait_for_completion(show_output=True)

Monitor the run in the Experiments tab

When the pipeline has finished, you can examine the metrics recorded by its child runs.

for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t', metric_name, ":", metrics[metric_name])

A new model should now be registered with a Training context tag indicating it was trained in a pipeline. List the models to verify:

from azureml.core import Model


for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t', tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t', prop_name, ':', prop)
    print('\n')

Publish the pipeline

# Publish the pipeline from the run

published_pipeline = pipeline_run.publish_pipeline(

name="diabetes-training-pipeline", description="Trains diabetes model", version="1.0")


published_pipeline

See the endpoint

rest_endpoint = published_pipeline.endpoint

print(rest_endpoint)

Call the pipeline endpoint

Use the authorization header from your current connection to your Azure workspace:

from azureml.core.authentication import InteractiveLoginAuthentication


interactive_auth = InteractiveLoginAuthentication()

auth_header = interactive_auth.get_authentication_header()

print("Authentication header ready.")

Call the rest interface

import requests


experiment_name = 'mslearn-diabetes-pipeline'


rest_endpoint = published_pipeline.endpoint

response = requests.post(rest_endpoint,

headers=auth_header,

json={"ExperimentName": experiment_name})

run_id = response.json()["Id"]

run_id

Wait for the run to complete

from azureml.pipeline.core.run import PipelineRun


published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)

published_pipeline_run.wait_for_completion(show_output=True)

Schedule the pipeline to run weekly and retrain with new data

from azureml.pipeline.core import ScheduleRecurrence, Schedule


# Submit the Pipeline every Monday at 00:00 UTC

recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="00:00")

weekly_schedule = Schedule.create(ws, name="weekly-diabetes-training",

description="Based on time",

pipeline_id=published_pipeline.id,

experiment_name='mslearn-diabetes-pipeline',

recurrence=recurrence)

print('Pipeline scheduled.')

Retrieve schedules as

schedules = Schedule.list(ws)

schedules
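Schedules keep firing until you disable them; a sketch for stopping them all (Schedule.disable is part of the SDK — disabling every schedule is an assumption about what you want):

# Disable every schedule in the workspace
for schedule in Schedule.list(ws):
    schedule.disable(wait_for_provisioning=True)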

Check the latest run

pipeline_experiment = ws.experiments.get('mslearn-diabetes-pipeline')

latest_run = list(pipeline_experiment.get_runs())[0]


latest_run.get_details()

Deploy real-time machine learning services with Azure Machine Learning

Register a trained model

from azureml.core import Model


classification_model = Model.register(workspace=ws,

model_name='classification_model',

model_path='model.pkl', # local path

description='A classification model')

You can also register from a run

run.register_model( model_name='classification_model',

model_path='outputs/model.pkl', # run outputs path

description='A classification model')

Create an entry script (or scoring script)

import json

import joblib

import numpy as np

from azureml.core.model import Model


# Called when the service is loaded
def init():
    global model
    # Get the path to the registered model file and load it
    model_path = Model.get_model_path('classification_model')
    model = joblib.load(model_path)


# Called when a request is received
def run(raw_data):
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    predictions = model.predict(data)
    # Return the predictions as any JSON serializable format
    return predictions.tolist()

Create an environment

from azureml.core.conda_dependencies import CondaDependencies


# Add the dependencies for your model

myenv = CondaDependencies()

myenv.add_conda_package("scikit-learn")


# Save the environment config as a .yml file

env_file = 'service_files/env.yml'

with open(env_file,"w") as f:

f.write(myenv.serialize_to_string())

print("Saved dependency info in", env_file)

Combine the entry script and environment in an InferenceConfig

from azureml.core.model import InferenceConfig


classifier_inference_config = InferenceConfig(runtime= "python",

source_directory = 'service_files',

entry_script="score.py",

conda_file="env.yml")

Create AKS cluster for deployment

from azureml.core.compute import ComputeTarget, AksCompute


cluster_name = 'aks-cluster'

compute_config = AksCompute.provisioning_configuration(location='eastus')

production_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

production_cluster.wait_for_completion(show_output=True)

Define the deployment configuration

from azureml.core.webservice import AksWebservice


classifier_deploy_config = AksWebservice.deploy_configuration(cpu_cores = 1,

memory_gb = 1)

Deploy the model

from azureml.core.model import Model


model = ws.models['classification_model']

service = Model.deploy(workspace=ws,

name = 'classifier-service',

models = [model],

inference_config = classifier_inference_config,

deployment_config = classifier_deploy_config,

deployment_target = production_cluster)

service.wait_for_deployment(show_output = True)

Get predictions (response)

import json


# An array of new data cases

x_new = [[0.1,2.3,4.1,2.0],

[0.2,1.8,3.9,2.1]]


# Convert the array to a serializable list in a JSON document

json_data = json.dumps({"data": x_new})


# Call the web service, passing the input data

response = service.run(input_data = json_data)


# Get the predictions

predictions = json.loads(response)


# Print the predicted class for each case.

for i in range(len(x_new)):
    print(x_new[i], predictions[i])

If using a REST api (not SDK)

endpoint = service.scoring_uri

print(endpoint)


import requests

import json


# An array of new data cases

x_new = [[0.1,2.3,4.1,2.0],

[0.2,1.8,3.9,2.1]]


# Convert the array to a serializable list in a JSON document

json_data = json.dumps({"data": x_new})


# Set the content type in the request headers

request_headers = { 'Content-Type':'application/json' }


# Call the service

response = requests.post(url = endpoint,

data = json_data,

headers = request_headers)


# Get the predictions from the JSON response

predictions = json.loads(response.json())


# Print the predicted class for each case.

for i in range(len(x_new)):
    print(x_new[i], predictions[i])

Can authenticate with keys

primary_key, secondary_key = service.get_keys()
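For an AKS service with token authentication enabled, you can retrieve a time-limited token instead (a sketch; assumes token auth was configured on the deployment):

# Returns the token and the time it should be refreshed by
token, refresh_by = service.get_token()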

You can then pass the key or token in the request headers

import requests

import json


# An array of new data cases

x_new = [[0.1,2.3,4.1,2.0],

[0.2,1.8,3.9,2.1]]


# Convert the array to a serializable list in a JSON document

json_data = json.dumps({"data": x_new})


# Set the content type in the request headers

request_headers = { "Content-Type":"application/json",

"Authorization":"Bearer " + key_or_token }


# Call the service

response = requests.post(url = endpoint,

data = json_data,

headers = request_headers)


# Get the predictions from the JSON response

predictions = json.loads(response.json())


# Print the predicted class for each case.

for i in range(len(x_new)):
    print(x_new[i], predictions[i])

Check the server state

from azureml.core.webservice import AksWebservice


# Get the deployed service

service = AksWebservice(name='classifier-service', workspace=ws)


# Check its state

print(service.state)

See service logs

print(service.get_logs())

Deploy to local container

from azureml.core.webservice import LocalWebservice


deployment_config = LocalWebservice.deploy_configuration(port=8890)

service = Model.deploy(ws, 'test-svc', [model], inference_config, deployment_config)

Test the deployed service using the SDK

print(service.run(input_data = json_data))

Troubleshoot runtime issues by changing the scoring file referenced in the inference configuration, then reloading the service without redeploying it:

service.reload()

print(service.run(input_data = json_data))
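When you're finished testing, you can remove the service (assuming the endpoint is no longer needed):

service.delete()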

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/09%20-%20Create%20a%20Real-time%20Inferencing%20Service.ipynb

!pip install --upgrade azureml-sdk azureml-widgets

Connect to workspace


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Train and register a model

from azureml.core import Experiment

from azureml.core import Model

import pandas as pd

import numpy as np

import joblib

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Create an Azure ML experiment in your workspace

experiment = Experiment(workspace=ws, name="mslearn-train-diabetes")

run = experiment.start_logging()

print("Starting experiment:", experiment.name)


# load the diabetes dataset

print("Loading Data...")

diabetes = pd.read_csv('data/diabetes.csv')


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a decision tree model

print('Training a decision tree model')

model = DecisionTreeClassifier().fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


# Save the trained model

model_file = 'diabetes_model.pkl'

joblib.dump(value=model, filename=model_file)

run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)


# Complete the run

run.complete()


# Register the model

run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Inline Training'},

properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})

print('Model trained and registered.')

Deploy the model as a web service


from azureml.core import Model


for model in Model.list(ws):

print(model.name, 'version:', model.version)

for tag_name in model.tags:

tag = model.tags[tag_name]

print ('\t',tag_name, ':', tag)

for prop_name in model.properties:

prop = model.properties[prop_name]

print ('\t',prop_name, ':', prop)

print('\n')

Get the model to deploy

model = ws.models['diabetes_model']

print(model.name, 'version', model.version)

Set folder to host web service


import os


folder_name = 'diabetes_service'


# Create a folder for the web service files

experiment_folder = './' + folder_name

os.makedirs(experiment_folder, exist_ok=True)


print(folder_name, 'folder created.')


# Set path for scoring script

script_file = os.path.join(experiment_folder,"score_diabetes.py")

Create an entry script

%%writefile $script_file

import json

import joblib

import numpy as np

from azureml.core.model import Model


# Called when the service is loaded

def init():

global model

# Get the path to the deployed model file and load it

model_path = Model.get_model_path('diabetes_model')

model = joblib.load(model_path)


# Called when a request is received

def run(raw_data):

# Get the input data as a numpy array

data = np.array(json.loads(raw_data)['data'])

# Get a prediction from the model

predictions = model.predict(data)

# Get the corresponding classname for each prediction (0 or 1)

classnames = ['not-diabetic', 'diabetic']

predicted_classes = []

for prediction in predictions:

predicted_classes.append(classnames[prediction])

# Return the predictions as JSON

return json.dumps(predicted_classes)

Create env


from azureml.core.conda_dependencies import CondaDependencies


# Add the dependencies for our model (AzureML defaults is already included)

myenv = CondaDependencies()

myenv.add_conda_package('scikit-learn')


# Save the environment config as a .yml file

env_file = os.path.join(experiment_folder,"diabetes_env.yml")

with open(env_file,"w") as f:

f.write(myenv.serialize_to_string())

print("Saved dependency info in", env_file)


# Print the .yml file

with open(env_file,"r") as f:

print(f.read())

To deploy the container as a service named diabetes-service:

  1. Define an inference configuration, which includes the scoring and environment files required to load and use the model.

  2. Define a deployment configuration that defines the execution environment in which the service will be hosted. In this case, an Azure Container Instance.

  3. Deploy the model as a web service.

  4. Verify the status of the deployed service.

from azureml.core.webservice import AciWebservice

from azureml.core.model import InferenceConfig


# Configure the scoring environment

inference_config = InferenceConfig(runtime= "python",

entry_script=script_file,

conda_file=env_file)


deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)


service_name = "diabetes-service"


service = Model.deploy(ws, service_name, [model], inference_config, deployment_config)


service.wait_for_deployment(True)

print(service.state)

See the logs

print(service.get_logs())


# If you need to make a change and redeploy, you may need to delete unhealthy service using the following code:

#service.delete()

See Endpoints

for webservice_name in ws.webservices:

print(webservice_name)

Consume the service


import json


x_new = [[2,180,74,24,21,23.9091702,1.488172308,22]]

print ('Patient: {}'.format(x_new[0]))


# Convert the array to a serializable list in a JSON document

input_json = json.dumps({"data": x_new})


# Call the web service, passing the input data (the web service will also accept the data in binary format)

predictions = service.run(input_data = input_json)


# Get the predicted class - it'll be the first (and only) one.

predicted_classes = json.loads(predictions)

print(predicted_classes[0])

Send multiple entries


import json


# This time our input is an array of two feature arrays

x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],

[0,148,58,11,179,39.19207553,0.160829008,45]]


# Convert the array or arrays to a serializable list in a JSON document

input_json = json.dumps({"data": x_new})


# Call the web service, passing the input data

predictions = service.run(input_data = input_json)


# Get the predicted classes.

predicted_classes = json.loads(predictions)

for i in range(len(x_new)):

print ("Patient {}".format(x_new[i]), predicted_classes[i] )

See endpoint


endpoint = service.scoring_uri

print(endpoint)

Make HTTP request

import requests

import json


x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],

[0,148,58,11,179,39.19207553,0.160829008,45]]


# Convert the array to a serializable list in a JSON document

input_json = json.dumps({"data": x_new})


# Set the content type

headers = { 'Content-Type':'application/json' }


predictions = requests.post(endpoint, input_json, headers = headers)

predicted_classes = json.loads(predictions.json())


for i in range(len(x_new)):

print ("Patient {}".format(x_new[i]), predicted_classes[i] )

Delete the service

service.delete()

print ('Service deleted.')

More info in docs https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where?tabs=azcli

Deploy batch inference pipelines with Azure Machine Learning

Register a model

from azureml.core import Model


classification_model = Model.register(workspace=your_workspace,

model_name='classification_model',

model_path='model.pkl', # local path

description='A classification model')

or

run.register_model( model_name='classification_model',

model_path='outputs/model.pkl', # run outputs path

description='A classification model')

Create a scoring script. init() is called when the pipeline step is initialized; run(mini_batch) is called for each batch of data to be processed.

import os

import numpy as np

from azureml.core import Model

import joblib


def init():

# Runs when the pipeline step is initialized

global model


# load the model

model_path = Model.get_model_path('classification_model')

model = joblib.load(model_path)


def run(mini_batch):

# This runs for each batch

resultList = []


# process each file in the batch

for f in mini_batch:

# Read comma-delimited data into an array

data = np.genfromtxt(f, delimiter=',')

# Reshape into a 2-dimensional array for model input

prediction = model.predict(data.reshape(1, -1))

# Append prediction to results

resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))

return resultList

Create a pipeline with a parallel run step

from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

from azureml.pipeline.core import PipelineData

from azureml.pipeline.core import Pipeline


# Get the batch dataset for input

batch_data_set = ws.datasets['batch-data']


# Set the output location

default_ds = ws.get_default_datastore()

output_dir = PipelineData(name='inferences',

datastore=default_ds,

output_path_on_compute='results')


# Define the parallel run step configuration

parallel_run_config = ParallelRunConfig(

source_directory='batch_scripts',

entry_script="batch_scoring_script.py",

mini_batch_size="5",

error_threshold=10,

output_action="append_row",

environment=batch_env,

compute_target=aml_cluster,

node_count=4)


# Create the parallel run step

parallelrun_step = ParallelRunStep(

name='batch-score',

parallel_run_config=parallel_run_config,

inputs=[batch_data_set.as_named_input('batch_data')],

output=output_dir,

arguments=[],

allow_reuse=True

)

# Create the pipeline

pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

Run the pipeline and retrieve the step output

import os

import pandas as pd

from azureml.core import Experiment


# Run the pipeline as an experiment

pipeline_run = Experiment(ws, 'batch_prediction_pipeline').submit(pipeline)

pipeline_run.wait_for_completion(show_output=True)


# Get the outputs from the first (and only) step

prediction_run = next(pipeline_run.get_children())

prediction_output = prediction_run.get_output_data('inferences')

prediction_output.download(local_path='results')


# Find the parallel_run_step.txt file

for root, dirs, files in os.walk('results'):

for file in files:

if file.endswith('parallel_run_step.txt'):

result_file = os.path.join(root,file)


# Load and display the results

df = pd.read_csv(result_file, delimiter=":", header=None)

df.columns = ["File", "Prediction"]

print(df)

Publish the pipeline as a REST service

published_pipeline = pipeline_run.publish_pipeline(name='Batch_Prediction_Pipeline',

description='Batch pipeline',

version='1.0')

rest_endpoint = published_pipeline.endpoint

Use the service endpoint
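
The call below assumes an auth_header is already defined; for interactive use you can build one the same way the exercise later in these notes does:

from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()

auth_header = interactive_auth.get_authentication_header()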

import requests


response = requests.post(rest_endpoint,

headers=auth_header,

json={"ExperimentName": "Batch_Prediction"})

run_id = response.json()["Id"]

Schedule the pipeline

from azureml.pipeline.core import ScheduleRecurrence, Schedule


weekly = ScheduleRecurrence(frequency='Week', interval=1)

pipeline_schedule = Schedule.create(ws, name='Weekly Predictions',

description='batch inferencing',

pipeline_id=published_pipeline.id,

experiment_name='Batch_Prediction',

recurrence=weekly)

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/10%20-%20Create%20a%20Batch%20Inferencing%20Service.ipynb

!pip install --upgrade azureml-sdk azureml-widgets

import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Train and register a model

from azureml.core import Experiment

from azureml.core import Model

import pandas as pd

import numpy as np

import joblib

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Create an Azure ML experiment in your workspace

experiment = Experiment(workspace=ws, name='mslearn-train-diabetes')

run = experiment.start_logging()

print("Starting experiment:", experiment.name)


# load the diabetes dataset

print("Loading Data...")

diabetes = pd.read_csv('data/diabetes.csv')


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a decision tree model

print('Training a decision tree model')

model = DecisionTreeClassifier().fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


# Save the trained model

model_file = 'diabetes_model.pkl'

joblib.dump(value=model, filename=model_file)

run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)


# Complete the run

run.complete()


# Register the model

run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Inline Training'},

properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


print('Model trained and registered.')

Generate and upload batch data

from azureml.core import Datastore, Dataset

import pandas as pd

import os


# Set default data store

ws.set_default_datastore('workspaceblobstore')

default_ds = ws.get_default_datastore()


# Enumerate all datastores, indicating which is the default

for ds_name in ws.datastores:

print(ds_name, "- Default =", ds_name == default_ds.name)


# Load the diabetes data

diabetes = pd.read_csv('data/diabetes2.csv')

# Get a 100-item sample of the feature columns (not the diabetic label)

sample = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].sample(n=100).values


# Create a folder

batch_folder = './batch-data'

os.makedirs(batch_folder, exist_ok=True)

print("Folder created!")


# Save each sample as a separate file

print("Saving files...")

for i in range(100):

fname = str(i+1) + '.csv'

sample[i].tofile(os.path.join(batch_folder, fname), sep=",")

print("files saved!")


# Upload the files to the default datastore

print("Uploading files to datastore...")

default_ds = ws.get_default_datastore()

default_ds.upload(src_dir="batch-data", target_path="batch-data", overwrite=True, show_progress=True)


# Register a dataset for the input data

batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)

try:

batch_data_set = batch_data_set.register(workspace=ws,

name='batch-data',

description='batch data',

create_new_version=True)

except Exception as ex:

print(ex)


print("Done!")

Create compute

from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


cluster_name = "your-compute-cluster"


try:

# Check for existing compute target

inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)

print('Found existing cluster, use it.')

except ComputeTargetException:

# If it doesn't already exist, create it

try:

compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)

inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

inference_cluster.wait_for_completion(show_output=True)

except Exception as ex:

print(ex)

Create a pipeline for batch inferencing

import os

# Create a folder for the experiment files

experiment_folder = 'batch_pipeline'

os.makedirs(experiment_folder, exist_ok=True)


print(experiment_folder)

Write the script

%%writefile $experiment_folder/batch_diabetes.py

import os

import numpy as np

from azureml.core import Model

import joblib



def init():

# Runs when the pipeline step is initialized

global model


# load the model

model_path = Model.get_model_path('diabetes_model')

model = joblib.load(model_path)



def run(mini_batch):

# This runs for each batch

resultList = []


# process each file in the batch

for f in mini_batch:

# Read the comma-delimited data into an array

data = np.genfromtxt(f, delimiter=',')

# Reshape into a 2-dimensional array for prediction (model expects multiple items)

prediction = model.predict(data.reshape(1, -1))

# Append prediction to results

resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))

return resultList

Define run context with requirements


from azureml.core import Environment

from azureml.core.runconfig import DEFAULT_CPU_IMAGE

from azureml.core.conda_dependencies import CondaDependencies


# Add dependencies required by the model

# For scikit-learn models, you need scikit-learn

# For parallel pipeline steps, you need azureml-core and azureml-dataprep[fuse]

cd = CondaDependencies.create(conda_packages=['scikit-learn','pip'],

pip_packages=['azureml-defaults','azureml-core','azureml-dataprep[fuse]'])


batch_env = Environment(name='batch_environment')

batch_env.python.conda_dependencies = cd

batch_env.docker.enabled = True

batch_env.docker.base_image = DEFAULT_CPU_IMAGE

print('Configuration ready.')

Use a pipeline to run the batch prediction script, generate predictions from the input data, and save the results as a text file in the output folder:

from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

from azureml.pipeline.core import PipelineData


default_ds = ws.get_default_datastore()


output_dir = PipelineData(name='inferences',

datastore=default_ds,

output_path_on_compute='diabetes/results')


parallel_run_config = ParallelRunConfig(

source_directory=experiment_folder,

entry_script="batch_diabetes.py",

mini_batch_size="5",

error_threshold=10,

output_action="append_row",

environment=batch_env,

compute_target=inference_cluster,

node_count=2)


parallelrun_step = ParallelRunStep(

name='batch-score-diabetes',

parallel_run_config=parallel_run_config,

inputs=[batch_data_set.as_named_input('diabetes_batch')],

output=output_dir,

arguments=[],

allow_reuse=True

)

print('Steps defined')

Put the step into a pipeline and run it

from azureml.core import Experiment

from azureml.pipeline.core import Pipeline


pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

pipeline_run = Experiment(ws, 'mslearn-diabetes-batch').submit(pipeline)

pipeline_run.wait_for_completion(show_output=True)

Retrieve the predictions:


import pandas as pd

import shutil


# Remove the local results folder if left over from a previous run

shutil.rmtree('diabetes-results', ignore_errors=True)


# Get the run for the first step and download its output

prediction_run = next(pipeline_run.get_children())

prediction_output = prediction_run.get_output_data('inferences')

prediction_output.download(local_path='diabetes-results')


# Traverse the folder hierarchy and find the results file

for root, dirs, files in os.walk('diabetes-results'):

for file in files:

if file.endswith('parallel_run_step.txt'):

result_file = os.path.join(root,file)


# cleanup output format

df = pd.read_csv(result_file, delimiter=":", header=None)

df.columns = ["File", "Prediction"]


# Display the first 20 results

df.head(20)

Publish the pipeline and its REST interface

published_pipeline = pipeline_run.publish_pipeline(

name='diabetes-batch-pipeline', description='Batch scoring of diabetes data', version='1.0')

published_pipeline

You can see the endpoint in the Azure portal, or retrieve it in code:

rest_endpoint = published_pipeline.endpoint

print(rest_endpoint)

Make a REST call over HTTP

from azureml.core.authentication import InteractiveLoginAuthentication


interactive_auth = InteractiveLoginAuthentication()

auth_header = interactive_auth.get_authentication_header()

print('Authentication header ready.')


import requests


rest_endpoint = published_pipeline.endpoint

response = requests.post(rest_endpoint,

headers=auth_header,

json={"ExperimentName": "mslearn-diabetes-batch"})

run_id = response.json()["Id"]

run_id


from azureml.pipeline.core.run import PipelineRun

from azureml.widgets import RunDetails


published_pipeline_run = PipelineRun(ws.experiments['mslearn-diabetes-batch'], run_id)


# Block until the run completes

published_pipeline_run.wait_for_completion(show_output=True)

See the results


import pandas as pd

import shutil


# Remove the local results folder if left over from a previous run

shutil.rmtree('diabetes-results', ignore_errors=True)


# Get the run for the first step and download its output

prediction_run = next(published_pipeline_run.get_children())

prediction_output = prediction_run.get_output_data('inferences')

prediction_output.download(local_path='diabetes-results')


# Traverse the folder hierarchy and find the results file

for root, dirs, files in os.walk('diabetes-results'):

for file in files:

if file.endswith('parallel_run_step.txt'):

result_file = os.path.join(root,file)


# cleanup output format

df = pd.read_csv(result_file, delimiter=":", header=None)

df.columns = ["File", "Prediction"]


# Display the first 20 results

df.head(20)

https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-pipeline-batch-scoring-classification

Tune hyperparameters with Azure Machine Learning

Hyperdrive run

For discrete hyperparameters, you can use a Python list (choice([10,20,30])), a range (choice(range(1,10))), or an arbitrary set of comma-separated values (choice(30,50,100)). You can also sample discrete values from a distribution using qnormal, quniform, qlognormal, or qloguniform.

Continuous hyperparameters are sampled from a distribution: normal, uniform, lognormal, or loguniform.
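
A small sketch of both kinds of parameter expression (the values here are illustrative, not recommendations):

from azureml.train.hyperdrive import qnormal, uniform

batch_size = qnormal(32, 6, 2) # discrete: normal(mu=32, sigma=6), rounded to a multiple of q=2

learning_rate = uniform(0.0005, 0.005) # continuous: any value between the bounds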

Define a search space

from azureml.train.hyperdrive import choice, normal


param_space = {

'--batch_size': choice(16, 32, 64),

'--learning_rate': normal(10, 3)

}

Grid sampling (usable only when all hyperparameters are discrete)

from azureml.train.hyperdrive import GridParameterSampling, choice


param_space = {

'--batch_size': choice(16, 32, 64),

'--learning_rate': choice(0.01, 0.1, 1.0)

}


param_sampling = GridParameterSampling(param_space)

Random sampling (hyperparameters can be discrete or continuous)

from azureml.train.hyperdrive import RandomParameterSampling, choice, normal


param_space = {

'--batch_size': choice(16, 32, 64),

'--learning_rate': normal(10, 3)

}


param_sampling = RandomParameterSampling(param_space)

Bayesian sampling

from azureml.train.hyperdrive import BayesianParameterSampling, choice, uniform


param_space = {

'--batch_size': choice(16, 32, 64),

'--learning_rate': uniform(0.05, 0.1)

}


param_sampling = BayesianParameterSampling(param_space)

You can only use Bayesian sampling with choice, uniform, and quniform parameter expressions, and you can't combine it with an early-termination policy.

You can set an early termination policy that abandons runs that are unlikely to produce a better result than previously completed runs. The policy is evaluated at an evaluation_interval you specify, each time the target performance metric is logged. You can also set a delay_evaluation parameter to avoid evaluating the policy until a minimum number of iterations have been completed.

Bandit policy

from azureml.train.hyperdrive import BanditPolicy


early_termination_policy = BanditPolicy(slack_amount = 0.2,

evaluation_interval=1,

delay_evaluation=5)

This applies the policy for every iteration after the first five, and abandons runs where the reported target metric is 0.2 or more worse than the best performing run after the same number of intervals.

A median stopping policy abandons runs where the target performance metric is worse than the median of the running averages for all runs.

from azureml.train.hyperdrive import MedianStoppingPolicy


early_termination_policy = MedianStoppingPolicy(evaluation_interval=1,

delay_evaluation=5)

A truncation selection policy cancels the lowest performing X% of runs at each evaluation interval based on the truncation_percentage value you specify for X.

from azureml.train.hyperdrive import TruncationSelectionPolicy


early_termination_policy = TruncationSelectionPolicy(truncation_percentage=10,

evaluation_interval=1,

delay_evaluation=5)

Create a training script for hyperparameter tuning

  • Include an argument for each hyperparameter you want to vary.

  • Log the target performance metric. This enables the hyperdrive run to evaluate the performance of the child runs it initiates, and identify the one that produces the best performing model.

The following example script trains a logistic regression model using a --regularization argument to set the regularization rate hyperparameter, and logs the accuracy metric with the name Accuracy:

import argparse

import os

import joblib

from azureml.core import Run

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression


# Get regularization hyperparameter

parser = argparse.ArgumentParser()

parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01)

args = parser.parse_args()

reg = args.reg_rate


# Get the experiment run context

run = Run.get_context()


# load the training dataset

data = run.input_datasets['training_data'].to_pandas_dataframe()


# Separate features and labels, and split for training/validation

X = data[['feature1','feature2','feature3','feature4']].values

y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


# Train a logistic regression model with the reg hyperparameter

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)


# calculate and log accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

run.log('Accuracy', np.float(acc))


# Save the trained model

os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/model.pkl')


run.complete()

Configuring and running

from azureml.core import Experiment

from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal


# Assumes ws, script_config and param_sampling are already defined


hyperdrive = HyperDriveConfig(run_config=script_config,

hyperparameter_sampling=param_sampling,

policy=None,

primary_metric_name='Accuracy',

primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,

max_total_runs=6,

max_concurrent_runs=4)


experiment = Experiment(workspace = ws, name = 'hyperdrive_training')

hyperdrive_run = experiment.submit(config=hyperdrive)

Monitoring and reviewing hyperdrive runs

The experiment will initiate a child run for each hyperparameter combination to be tried, and you can retrieve the logged metrics for these runs using the following code:

for child_run in run.get_children():

print(child_run.id, child_run.get_metrics())

List all runs in descending order of performance like this

for child_run in hyperdrive_run.get_children_sorted_by_primary_metric():

print(child_run)

To retrieve the best performing run, you can use the following code:

best_run = hyperdrive_run.get_best_run_by_primary_metric()

Exercise

!pip install --upgrade azureml-sdk azureml-widgets


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Prep data


from azureml.core import Dataset


default_ds = ws.get_default_datastore()


if 'diabetes dataset' not in ws.datasets:

default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data

target_path='diabetes-data/', # Put it in a folder path in the datastore

overwrite=True, # Replace existing files of the same name

show_progress=True)


#Create a tabular dataset from the path on the datastore (this may take a short while)

tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))


# Register the tabular dataset

try:

tab_data_set = tab_data_set.register(workspace=ws,

name='diabetes dataset',

description='diabetes data',

tags = {'format':'CSV'},

create_new_version=True)

print('Dataset registered.')

except Exception as ex:

print(ex)

else:

print('Dataset already registered.')

Prepare a training script

import os


experiment_folder = 'diabetes_training-hyperdrive'

os.makedirs(experiment_folder, exist_ok=True)


print('Folder ready.')

Create the Python script to train the model. In this example, you'll use a Gradient Boosting algorithm to train a classification model. The script must include:

  • An argument for each hyperparameter you want to optimize (in this case, the learning rate and number of estimators for the Gradient Boosting algorithm)

  • Code to log the performance metric you want to optimize for (in this case, you'll log both AUC and accuracy, so you can choose to optimize the model for either of these)

%%writefile $experiment_folder/diabetes_training.py

# Import libraries

import argparse, joblib, os

from azureml.core import Run

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score, roc_curve


# Get the experiment run context

run = Run.get_context()


# Get script arguments

parser = argparse.ArgumentParser()


# Input dataset

parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')


# Hyperparameters

parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')

parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')


# Add arguments to args collection

args = parser.parse_args()


# Log Hyperparameter values

run.log('learning_rate', np.float(args.learning_rate))

run.log('n_estimators', np.int(args.n_estimators))


# load the diabetes dataset

print("Loading Data...")

diabetes = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input


# Separate features and labels

X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a Gradient Boosting classification model with the specified hyperparameters

print('Training a classification model')

model = GradientBoostingClassifier(learning_rate=args.learning_rate,

n_estimators=args.n_estimators).fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))

run.log('AUC', np.float(auc))


# Save the model in the run outputs

os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Create compute


from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


cluster_name = "your-compute-cluster"


try:

# Check for existing compute target

training_cluster = ComputeTarget(workspace=ws, name=cluster_name)

print('Found existing cluster, use it.')

except ComputeTargetException:

# If it doesn't already exist, create it

try:

compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)

training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

training_cluster.wait_for_completion(show_output=True)

except Exception as ex:

print(ex)

Run a hyperparameter tuning experiment

from azureml.core import Experiment, ScriptRunConfig, Environment

from azureml.core.conda_dependencies import CondaDependencies

from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice

from azureml.widgets import RunDetails


# Create a Python environment for the experiment

sklearn_env = Environment("sklearn-env")


# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)

packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],

pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])

sklearn_env.python.conda_dependencies = packages


# Get the training dataset

diabetes_ds = ws.datasets.get("diabetes dataset")


# Create a script config

script_config = ScriptRunConfig(source_directory=experiment_folder,

script='diabetes_training.py',

# Add non-hyperparameter arguments - in this case, the training dataset

arguments = ['--input-data', diabetes_ds.as_named_input('training_data')],

environment=sklearn_env,

compute_target = training_cluster)


# Sample a range of parameter values

params = GridParameterSampling(

{

# Hyperdrive will try 6 combinations, adding these as script arguments

'--learning_rate': choice(0.01, 0.1, 1.0),

'--n_estimators' : choice(10, 100)

}

)


# Configure hyperdrive settings

hyperdrive = HyperDriveConfig(run_config=script_config,

hyperparameter_sampling=params,

policy=None, # No early stopping policy

primary_metric_name='AUC', # Find the highest AUC metric

primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,

max_total_runs=6, # Restrict the experiment to 6 iterations

max_concurrent_runs=2) # Run up to 2 iterations in parallel


# Run the experiment

experiment = Experiment(workspace=ws, name='mslearn-diabetes-hyperdrive')

run = experiment.submit(config=hyperdrive)


# Show the status in the notebook as the experiment runs

RunDetails(run).show()

run.wait_for_completion()

Determine the best performing run

# Print all child runs, sorted by the primary metric

for child_run in run.get_children_sorted_by_primary_metric():

print(child_run)


# Get the best run, and its metrics and arguments

best_run = run.get_best_run_by_primary_metric()

best_run_metrics = best_run.get_metrics()

script_arguments = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)

print(' -AUC:', best_run_metrics['AUC'])

print(' -Accuracy:', best_run_metrics['Accuracy'])

print(' -Arguments:',script_arguments)

Register the model


from azureml.core import Model


# Register model

best_run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',

tags={'Training context':'Hyperdrive'},

properties={'AUC': best_run_metrics['AUC'], 'Accuracy': best_run_metrics['Accuracy']})


# List registered models

for model in Model.list(ws):

print(model.name, 'version:', model.version)

for tag_name in model.tags:

tag = model.tags[tag_name]

print ('\t',tag_name, ':', tag)

for prop_name in model.properties:

prop = model.properties[prop_name]

print ('\t',prop_name, ':', prop)

print('\n')

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

Automate machine learning model selection with Azure Machine Learning

https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml

Automated ML can apply scaling and normalization to numeric data automatically. It can also perform the following preprocessing (a customization sketch follows the list):

  • Missing value imputation to eliminate nulls in the training dataset.

  • Categorical encoding to convert categorical features to numeric indicators.

  • Dropping high-cardinality features, such as record IDs.

  • Feature engineering (for example, deriving individual date parts from DateTime features)

  • Others...
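
A sketch of customizing featurization rather than leaving it on 'auto' (the column name is hypothetical; pass the object as the featurization argument of AutoMLConfig):

from azureml.automl.core.featurization import FeaturizationConfig

featurization_config = FeaturizationConfig()

# Hypothetical column: tell automated ML to treat it as numeric

featurization_config.add_column_purpose('CustomerAge', 'Numeric')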

Configure

from azureml.core.runconfig import RunConfiguration

from azureml.train.automl import AutoMLConfig


automl_run_config = RunConfiguration(framework='python')

automl_config = AutoMLConfig(name='Automated ML Experiment',

task='classification',

primary_metric = 'AUC_weighted',

compute_target=aml_compute,

training_data = train_dataset,

validation_data = test_dataset,

label_column_name='Label',

featurization='auto',

iterations=12,

max_concurrent_iterations=4)

Specify the primary metric

from azureml.train.automl.utilities import get_primary_metrics


get_primary_metrics('classification')

See metrics here https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml

submitting

from azureml.core.experiment import Experiment


automl_experiment = Experiment(ws, 'automl_experiment')

automl_run = automl_experiment.submit(automl_config)

Retrieving best run and its model

best_run, fitted_model = automl_run.get_output()

best_run_metrics = best_run.get_metrics()

for metric_name in best_run_metrics:

metric = best_run_metrics[metric_name]

print(metric_name, metric)

Explore preprocessing steps

for step in fitted_model.named_steps:

print(step)

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/12%20-%20Use%20Automated%20Machine%20Learning.ipynb

!pip install --upgrade azureml-sdk azureml-widgets

import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Prepare data


from azureml.core import Dataset


default_ds = ws.get_default_datastore()


if 'diabetes dataset' not in ws.datasets:

default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data

target_path='diabetes-data/', # Put it in a folder path in the datastore

overwrite=True, # Replace existing files of the same name

show_progress=True)


#Create a tabular dataset from the path on the datastore (this may take a short while)

tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))


# Register the tabular dataset

try:

tab_data_set = tab_data_set.register(workspace=ws,

name='diabetes dataset',

description='diabetes data',

tags = {'format':'CSV'},

create_new_version=True)

print('Dataset registered.')

except Exception as ex:

print(ex)

else:

print('Dataset already registered.')



# Split the dataset into training and validation subsets

diabetes_ds = ws.datasets.get("diabetes dataset")

train_ds, test_ds = diabetes_ds.random_split(percentage=0.7, seed=123)

print("Data ready!")

Prepare compute


from azureml.core.compute import ComputeTarget, AmlCompute

from azureml.core.compute_target import ComputeTargetException


cluster_name = "your-compute-cluster"


try:

# Check for existing compute target

training_cluster = ComputeTarget(workspace=ws, name=cluster_name)

print('Found existing cluster, use it.')

except ComputeTargetException:

# If it doesn't already exist, create it

try:

compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)

training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

training_cluster.wait_for_completion(show_output=True)

except Exception as ex:

print(ex)

Configure AutoML

import azureml.train.automl.utilities as automl_utils


for metric in automl_utils.get_primary_metrics('classification'):

print(metric)

Configure the AutoML run


from azureml.train.automl import AutoMLConfig


automl_config = AutoMLConfig(name='Automated ML Experiment',

task='classification',

compute_target=training_cluster,

training_data = train_ds,

validation_data = test_ds,

label_column_name='Diabetic',

iterations=4,

primary_metric = 'AUC_weighted',

max_concurrent_iterations=2,

featurization='auto'

)


print("Ready for Auto ML run.")

Run AutoML experiment

from azureml.core.experiment import Experiment

from azureml.widgets import RunDetails


print('Submitting Auto ML experiment...')

automl_experiment = Experiment(ws, 'mslearn-diabetes-automl-sdk')

automl_run = automl_experiment.submit(automl_config)

RunDetails(automl_run).show()

automl_run.wait_for_completion(show_output=True)

Determine the best model

best_run, fitted_model = automl_run.get_output()

print(best_run)

print(fitted_model)

best_run_metrics = best_run.get_metrics()

for metric_name in best_run_metrics:

metric = best_run_metrics[metric_name]

print(metric_name, metric)

See steps

for step in fitted_model.named_steps:

print(step)

Register best model

from azureml.core import Model


# Register model

best_run.register_model(model_path='outputs/model.pkl', model_name='diabetes_model_automl',

tags={'Training context':'Auto ML'},

properties={'AUC': best_run_metrics['AUC_weighted'], 'Accuracy': best_run_metrics['accuracy']})


# List registered models

for model in Model.list(ws):

print(model.name, 'version:', model.version)

for tag_name in model.tags:

tag = model.tags[tag_name]

print ('\t',tag_name, ':', tag)

for prop_name in model.properties:

prop = model.properties[prop_name]

print ('\t',prop_name, ':', prop)

print('\n')

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train

Explore differential privacy

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/13%20-%20Explore%20Differential%20Privacy.ipynb

https://smartnoise.org/

!pip install opendp-smartnoise

Load data

import pandas as pd


data_path = 'data/diabetes.csv'

diabetes = pd.read_csv(data_path)

diabetes.describe()

  • Upper and lower bounds: Clamping is used to set upper and lower bounds on values for a variable. This is required to ensure that the noise generated by SmartNoise is consistent with the expected distribution of the original data.

  • Sample size: To generate consistent differentially private data for some aggregations, SmartNoise needs to know the size of the data sample to be generated.

  • Epsilon: Put simply, epsilon is a non-negative value that provides an inverse measure of the amount of noise added to the data. A low epsilon results in a dataset with a greater level of privacy, while a high epsilon results in a dataset that is closer to the original data. Generally, you should use epsilon values between 0 and 1. Epsilon is correlated with another value named delta, which indicates the probability that a report generated by an analysis is not fully private.


import opendp.smartnoise.core as sn


cols = list(diabetes.columns)

age_range = [0.0, 120.0]

samples = len(diabetes)


with sn.Analysis() as analysis:

# load data

data = sn.Dataset(path=data_path, column_names=cols)

# Convert Age to float

age_dt = sn.to_float(data['Age'])

# get mean of age

age_mean = sn.dp_mean(data = age_dt,

privacy_usage = {'epsilon': .50},

data_lower = age_range[0],

data_upper = age_range[1],

data_rows = samples

)

analysis.release()


# print differentially private estimate of mean age

print("Private mean age:",age_mean.value)


# print actual mean age

print("Actual mean age:",diabetes.Age.mean())

Explore data distributions with histograms

import matplotlib.pyplot as plt

import numpy as np

%matplotlib inline


ages = list(range(0, 130, 10))

age = diabetes.Age


# Plot a histogram with 10-year bins

n_age, bins, patches = plt.hist(age, bins=ages, color='blue', alpha=0.7, rwidth=0.85)

plt.grid(axis='y', alpha=0.75)

plt.xlabel('Age')

plt.ylabel('Frequency')

plt.title('True Age Distribution')

plt.show()

print(n_age.astype(int))


import matplotlib.pyplot as plt


with sn.Analysis() as analysis:

data = sn.Dataset(path = data_path, column_names = cols)


age_histogram = sn.dp_histogram(

sn.to_int(data['Age'], lower=0, upper=120),

edges = ages,

upper = 10000,

null_value = -1,

privacy_usage = {'epsilon': 0.5}

)

analysis.release()


plt.ylim([0,7000])

width=4

agecat_left = [x + width for x in ages]

agecat_right = [x + 2*width for x in ages]

plt.bar(list(range(0,120,10)), n_age, width=width, color='blue', alpha=0.7, label='True')

plt.bar(agecat_left, age_histogram.value, width=width, color='orange', alpha=0.7, label='Private')

plt.legend()

plt.title('Histogram of Age')

plt.xlabel('Age')

plt.ylabel('Frequency')

plt.show()

print(age_histogram.value)

Covariance

with sn.Analysis() as analysis:

sn_data = sn.Dataset(path = data_path, column_names = cols)


age_bp_cov_scalar = sn.dp_covariance(

left = sn.to_float(sn_data['Age']),

right = sn.to_float(sn_data['DiastolicBloodPressure']),

privacy_usage = {'epsilon': 1.0},

left_lower = 0.,

left_upper = 120.,

left_rows = 10000,

right_lower = 0.,

right_upper = 150.,

right_rows = 10000)

analysis.release()

print('Differentially private covariance: {0}'.format(age_bp_cov_scalar.value[0][0]))

print('Actual covariance', diabetes.Age.cov(diabetes.DiastolicBloodPressure))

Use SQL queries

from opendp.smartnoise.metadata import CollectionMetadata


meta = CollectionMetadata.from_file('metadata/diabetes.yml')

print (meta)

from opendp.smartnoise.sql import PandasReader, PrivateReader


reader = PandasReader(meta, diabetes)

private_reader = PrivateReader(meta, reader)

print('Readers ready.')

query = 'SELECT Diabetic, AVG(Age) AS AvgAge FROM diabetes.diabetes GROUP BY Diabetic'


result_dp = private_reader.execute_typed(query)

print(result_dp)

Try a reader with a high epsilon (low privacy) value, and another with a low epsilon (high privacy) value:

low_privacy_reader = PrivateReader(meta, reader, 5.0) # large epsilon, less privacy

result = low_privacy_reader.execute_typed(query)

print(result)

print()


high_privacy_reader = PrivateReader(meta, reader, 0.1) # smaller epsilon, more privacy

result = high_privacy_reader.execute_typed(query)

print(result)

https://docs.microsoft.com/en-us/azure/machine-learning/concept-differential-privacy

Explain machine learning models with Azure Machine Learning

Install the azureml-interpret package:
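
For example, in a notebook:

!pip install --upgrade azureml-interpret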

Explainer types

# MimicExplainer

from interpret.ext.blackbox import MimicExplainer

from interpret.ext.glassbox import DecisionTreeExplainableModel


mim_explainer = MimicExplainer(model=loan_model,

initialization_examples=X_test,

explainable_model = DecisionTreeExplainableModel,

features=['loan_amount','income','age','marital_status'],

classes=['reject', 'approve'])


# TabularExplainer

from interpret.ext.blackbox import TabularExplainer


tab_explainer = TabularExplainer(model=loan_model,

initialization_examples=X_test,

features=['loan_amount','income','age','marital_status'],

classes=['reject', 'approve'])



# PFIExplainer

from interpret.ext.blackbox import PFIExplainer


pfi_explainer = PFIExplainer(model = loan_model,

features=['loan_amount','income','age','marital_status'],

classes=['reject', 'approve'])

Get global importance. You can call explain_global() then get_feature_importance_dict().

# MimicExplainer

global_mim_explanation = mim_explainer.explain_global(X_train)

global_mim_feature_importance = global_mim_explanation.get_feature_importance_dict()



# TabularExplainer

global_tab_explanation = tab_explainer.explain_global(X_train)

global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()



# PFIExplainer

global_pfi_explanation = pfi_explainer.explain_global(X_train, y_train)

global_pfi_feature_importance = global_pfi_explanation.get_feature_importance_dict()

To get local feature importance values, use the get_ranked_local_names() and get_ranked_local_values() methods:

# MimicExplainer

local_mim_explanation = mim_explainer.explain_local(X_test[0:5])

local_mim_features = local_mim_explanation.get_ranked_local_names()

local_mim_importance = local_mim_explanation.get_ranked_local_values()



# TabularExplainer

local_tab_explanation = tab_explainer.explain_local(X_test[0:5])

local_tab_features = local_tab_explanation.get_ranked_local_names()

local_tab_importance = local_tab_explanation.get_ranked_local_values()

Creating an explanation in the experiment script

Ensure that the azureml-interpret and azureml-contrib-interpret packages are installed in the run environment.
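
For example, a sketch of adding them to the run environment (same pattern as the exercise later in these notes):

from azureml.core.conda_dependencies import CondaDependencies

packages = CondaDependencies.create(conda_packages=['scikit-learn','pip'],

pip_packages=['azureml-defaults','azureml-interpret','azureml-contrib-interpret'])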

# Import Azure ML run library

from azureml.core.run import Run

from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient

from interpret.ext.blackbox import TabularExplainer

# other imports as required


# Get the experiment run context

run = Run.get_context()


# code to train model goes here


# Get explanation

explainer = TabularExplainer(model, X_train, features=features, classes=labels)

explanation = explainer.explain_global(X_test)


# Get an Explanation Client and upload the explanation

explain_client = ExplanationClient.from_run(run)

explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')


# Complete the run

run.complete()

View the explanation in the Explanations tab in Azure Machine Learning studio, or retrieve it with an ExplanationClient object:

from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient


client = ExplanationClient.from_run_id(workspace=ws,

experiment_name=experiment.experiment_name,

run_id=run.id)

explanation = client.download_model_explanation()

feature_importances = explanation.get_feature_importance_dict()

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/14%20-%20Interpret%20Models.ipynb

!pip install --upgrade azureml-sdk azureml-widgets azureml-explain-model

!pip install --upgrade azureml-interpret

Create a model

import pandas as pd

import numpy as np

import joblib

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# load the diabetes dataset

print("Loading Data...")

data = pd.read_csv('data/diabetes.csv')


# Separate features and labels

features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']

labels = ['not-diabetic', 'diabetic']

X, y = data[features].values, data['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a decision tree model

print('Training a decision tree model')

model = DecisionTreeClassifier().fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

print('Accuracy:', acc)


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))


print('Model trained.')

Get an explainer


from interpret.ext.blackbox import TabularExplainer


# "features" and "classes" fields are optional

tab_explainer = TabularExplainer(model,

X_train,

features=features,

classes=labels)

print(tab_explainer, "ready!")

Get global feature explanation


# you can use the training data or the test data here

global_tab_explanation = tab_explainer.explain_global(X_train)

# Get the top features by importance

global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()

for feature, importance in global_tab_feature_importance.items():

print(feature,":", importance)

Get local feature importances

X_explain = X_test[0:2]


# Get predictions

predictions = model.predict(X_explain)


# Get local explanations

local_tab_explanation = tab_explainer.explain_local(X_explain)


# Get feature names and importance for each possible label

local_tab_features = local_tab_explanation.get_ranked_local_names()

local_tab_importance = local_tab_explanation.get_ranked_local_values()


for l in range(len(local_tab_features)):

print('Support for', labels[l])

label = local_tab_features[l]

for o in range(len(label)):

print("\tObservation", o + 1)

feature_list = label[o]

total_support = 0

for f in range(len(feature_list)):

print("\t\t", feature_list[f], ':', local_tab_importance[l][o][f])

total_support += local_tab_importance[l][o][f]

print("\t\t ----------\n\t\t Total:", total_support, "Prediction:", labels[predictions[o]])

Add explainability to a model training run


import azureml.core

from azureml.core import Workspace


# Load the workspace from the saved config file

ws = Workspace.from_config()

print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))


import os, shutil

from azureml.core import Experiment


# Create a folder for the experiment files

experiment_folder = 'diabetes_train_and_explain'

os.makedirs(experiment_folder, exist_ok=True)


# Copy the data file into the experiment folder

shutil.copy('data/diabetes.csv', os.path.join(experiment_folder, "diabetes.csv"))

Write the training script, with explanation code included:

%%writefile $experiment_folder/diabetes_training.py

# Import libraries

import pandas as pd

import numpy as np

import joblib

import os

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve


# Import Azure ML run library

from azureml.core.run import Run


# Import libraries for model explanation

from azureml.interpret import ExplanationClient

from interpret.ext.blackbox import TabularExplainer


# Get the experiment run context

run = Run.get_context()


# load the diabetes dataset

print("Loading Data...")

data = pd.read_csv('diabetes.csv')


features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']

labels = ['not-diabetic', 'diabetic']


# Separate features and labels

X, y = data[features].values, data['Diabetic'].values


# Split data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


# Train a decision tree model

print('Training a decision tree model')

model = DecisionTreeClassifier().fit(X_train, y_train)


# calculate accuracy

y_hat = model.predict(X_test)

acc = np.average(y_hat == y_test)

run.log('Accuracy', np.float(acc))


# calculate AUC

y_scores = model.predict_proba(X_test)

auc = roc_auc_score(y_test,y_scores[:,1])

run.log('AUC', np.float(auc))


os.makedirs('outputs', exist_ok=True)

# note file saved in the outputs folder is automatically uploaded into experiment record

joblib.dump(value=model, filename='outputs/diabetes.pkl')


# Get explanation

explainer = TabularExplainer(model, X_train, features=features, classes=labels)

explanation = explainer.explain_global(X_test)


# Get an Explanation Client and upload the explanation

explain_client = ExplanationClient.from_run(run)

explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')


# Complete the run

run.complete()

Run the experiment


from azureml.core import Experiment, ScriptRunConfig, Environment

from azureml.core.conda_dependencies import CondaDependencies

from azureml.widgets import RunDetails



# Create a Python environment for the experiment

explain_env = Environment("explain-env")


# Create a set of package dependencies (including the azureml-interpret package)

packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas','pip'],

pip_packages=['azureml-defaults','azureml-interpret'])

explain_env.python.conda_dependencies = packages


# Create a script config

script_config = ScriptRunConfig(source_directory=experiment_folder,

script='diabetes_training.py',

environment=explain_env)


# submit the experiment

experiment_name = 'mslearn-diabetes-explain'

experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)

RunDetails(run).show()

run.wait_for_completion()

Retrieve feature importance


from azureml.interpret import ExplanationClient


# Get the feature explanations

client = ExplanationClient.from_run(run)

engineered_explanations = client.download_model_explanation()

feature_importances = engineered_explanations.get_feature_importance_dict()


# Overall feature importance

print('Feature\tImportance')

for key, value in feature_importances.items():

print(key, '\t', value)

In Azure Machine Learning studio, go to the experiment run's Explanations tab.

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability

Detect and mitigate unfairness in models with Azure Machine Learning

One way of evaluating the fairness of a model is to compare predictions for each group within a sensitive feature.

For example, group the data based on the sensitive feature (Age) and measure the predictive performance metric (recall) for those groups. Then compare the metric scores to determine the disparity between them.
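
A minimal sketch with Fairlearn (assuming test labels y_test, predictions y_hat, and a DataFrame S_test that holds the Age groups; fairlearn 0.5.0 takes the metric as the first positional argument):

from fairlearn.metrics import MetricFrame

from sklearn.metrics import recall_score

recall_frame = MetricFrame(recall_score, y_test, y_hat, sensitive_features=S_test['Age'])

print(recall_frame.overall) # recall across all cases

print(recall_frame.by_group) # recall per age group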


Potential causes of disparity include:

  • Data imbalance.

  • Indirect correlation.

  • Societal biases.

Mitigating bias

  • Balance training and validation data. You can apply over-sampling or under-sampling techniques to balance data and use stratified splitting algorithms to maintain representative proportions for training and validation.

  • Perform extensive feature selection and engineering analysis. Make sure you fully explore the interconnected correlations in your data to try to differentiate features that are directly predictive from features that encapsulate more complex, nuanced relationships. You can use the model interpretability support in Azure Machine Learning to understand how individual features influence predictions.

  • Evaluate models for disparity based on significant features. You can't easily address the bias in a model if you can't quantify it.

  • Trade-off overall predictive performance for the lower disparity in predictive performance between sensitive feature groups. A model that is 99.5% accurate with comparable performance across all groups is often more desirable than a model that is 99.9% accurate but discriminates against a particular subset of cases.
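As a rough sketch of the first point, class balancing and stratified splitting can be done with scikit-learn alone. The resample call and the use of the Diabetic column are illustrative, based on the diabetes data used in the exercise below, and this assumes class 1 is the minority class:

import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/diabetes.csv')

# Over-sample the minority class so both classes are equally represented
majority = data[data['Diabetic'] == 0]
minority = data[data['Diabetic'] == 1]
minority_upsampled = resample(minority, replace=True,
                              n_samples=len(majority), random_state=0)
balanced = pd.concat([majority, minority_upsampled])

# A stratified split maintains the class proportions in both halves
X = balanced.drop('Diabetic', axis=1).values
y = balanced['Diabetic'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                  stratify=y, random_state=0)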

Fairlearn

Fairlearn is a Python package that you can use to analyze models and evaluate disparity between predictions and prediction performance for one or more sensitive features.


Fairlearn integrates with Azure Machine Learning by enabling you to run an experiment in which the dashboard metrics are uploaded to your Azure Machine Learning workspace

The mitigation support in Fairlearn is based on the use of algorithms to create alternative models that apply parity constraints to produce comparable metrics across sensitive feature groups

The choice of parity constraint depends on the technique being used and the specific fairness criteria you want to apply. Constraints in Fairlearn include (a usage sketch follows the list):

  • Demographic parity: Use this constraint with any of the mitigation algorithms to minimize disparity in the selection rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that an equal number of positive predictions are made in each group.

  • True positive rate parity: Use this constraint with any of the mitigation algorithms to minimize disparity in true positive rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of true positive predictions.

  • False-positive rate parity: Use this constraint with any of the mitigation algorithms to minimize disparity in false_positive_rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of false-positive predictions.

  • Equalized odds: Use this constraint with any of the mitigation algorithms to minimize disparity in combined true positive rate and false_positive_rate across sensitive feature groups. For example, in a binary classification scenario, this constraint tries to ensure that each group contains a comparable ratio of true positive and false-positive predictions.

  • Error rate parity: Use this constraint with any of the reduction-based mitigation algorithms (Exponentiated Gradient and Grid Search) to ensure that the error for each sensitive feature group does not deviate from the overall error rate by more than a specified amount.

  • Bounded group loss: Use this constraint with any of the reduction-based mitigation algorithms to restrict the loss for each sensitive feature group in a regression model.
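A minimal usage sketch, assuming fairlearn 0.5.0 (as pinned in the exercise below) and existing X_train, y_train, X_test arrays plus a sensitive-feature series A_train. The Exponentiated Gradient algorithm with a demographic parity constraint is just one possible combination:

from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from sklearn.tree import DecisionTreeClassifier

# Train a mitigated model that trades some accuracy for lower selection-rate disparity
mitigator = ExponentiatedGradient(DecisionTreeClassifier(),
                                  constraints=DemographicParity())
mitigator.fit(X_train, y_train, sensitive_features=A_train)
y_pred_mitigated = mitigator.predict(X_test)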

There is a trade-off between raw predictive performance and fairness.

Fairness is measured either by a reduction in the disparity of selection rate (for example, ensuring that an equal proportion of members from each gender group is approved for a bank loan) or by a reduction in the disparity of a performance metric (for example, ensuring that a model is equally accurate at identifying repayers and defaulters in each age group).

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/15%20-%20Detect%20Unfairness.ipynb

!pip install --upgrade azureml-sdk azureml-widgets azureml-contrib-fairness

!pip install --upgrade fairlearn==0.5.0

Train a model and define a sensitive feature by splitting Age into two groups (over 50, and 50 or younger).


import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier


# load the diabetes dataset

print("Loading Data...")

data = pd.read_csv('data/diabetes.csv')


# Separate features and labels

features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']

X, y = data[features].values, data['Diabetic'].values


# Get sensitive features

S = data[['Age']].astype(int)

# Change value to represent age groups

S['Age'] = np.where(S.Age > 50, 'Over 50', '50 or younger')


# Split data into training set and test set

X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(X, y, S, test_size=0.20, random_state=0, stratify=y)


# Train a classification model

print("Training model...")

diabetes_model = DecisionTreeClassifier().fit(X_train, y_train)


print("Model trained.")

Use the Fairlearn package to compare the model's behavior for different sensitive feature values:

  • Use the fairlearn selection_rate function to return the selection rate (percentage of positive predictions) for the overall population.

  • Use scikit-learn metric functions to calculate overall accuracy, recall, and precision metrics.

  • Use a MetricFrame to calculate selection rate, accuracy, recall, and precision for each age group in the Age sensitive feature. Note that a mix of fairlearn and scikit-learn metric functions is used to calculate the performance values.

from fairlearn.metrics import selection_rate, MetricFrame

from sklearn.metrics import accuracy_score, recall_score, precision_score


# Get predictions for the withheld test data

y_hat = diabetes_model.predict(X_test)


# Get overall metrics

print("Overall Metrics:")

# Get selection rate from fairlearn

overall_selection_rate = selection_rate(y_test, y_hat)

print("\tSelection Rate:", overall_selection_rate)

# Get standard metrics from scikit-learn

overall_accuracy = accuracy_score(y_test, y_hat)

print("\tAccuracy:", overall_accuracy)

overall_recall = recall_score(y_test, y_hat)

print("\tRecall:", overall_recall)

overall_precision = precision_score(y_test, y_hat)

print("\tPrecision:", overall_precision)


# Get metrics by sensitive group from fairlearn

print('\nMetrics by Group:')

metrics = {'selection_rate': selection_rate,

'accuracy': accuracy_score,

'recall': recall_score,

'precision': precision_score}


group_metrics = MetricFrame(metrics,

y_test, y_hat,

sensitive_features=S_test['Age'])


print(group_metrics.by_group)

Note that a larger proportion of the older patients is predicted to be diabetic.

  1. When the widget is displayed, use the Get started link to start configuring your visualization.

  2. Select the sensitive features you want to compare (in this case, there's only one: Age).

  3. Select the model performance metric you want to compare (in this case, it's a binary classification model so the options are Accuracy, Balanced accuracy, Precision, and Recall). Start with Recall.

  4. View the dashboard visualization, which shows:

    • Disparity in performance - how the selected performance metric compares for the subpopulations, including underprediction (false negatives) and overprediction (false positives).

    • Disparity in predictions - A comparison of the number of positive cases per subpopulation.

  5. Edit the configuration to compare the predictions based on different performance metrics.

from fairlearn.widget import FairlearnDashboard


# View this model in Fairlearn's fairness dashboard, and see the disparities which appear:

FairlearnDashboard(sensitive_features=S_test,

sensitive_feature_names=['Age'],

y_true=y_test,

y_pred={"diabetes_model": diabetes_model.predict(X_test)})

To try to reduce the disparity, exclude the Age feature when training the model:

# Separate features and labels

ageless = features.copy()

ageless.remove('Age')

X2, y2 = data[ageless].values, data['Diabetic'].values


# Split data into training set and test set

X_train2, X_test2, y_train2, y_test2, S_train2, S_test2 = train_test_split(X2, y2, S, test_size=0.20, random_state=0, stratify=y2)


# Train a classification model

print("Training model...")

ageless_model = DecisionTreeClassifier().fit(X_train2, y_train2)

print("Model trained.")


# View this model in Fairlearn's fairness dashboard, and see the disparities which appear:

FairlearnDashboard(sensitive_features=S_test2,

sensitive_feature_names=['Age'],

y_true=y_test2,

y_pred={"ageless_diabetes_model": ageless_model.predict(X_test2)})

Register the trained model in the workspace so that fairness metrics can be uploaded and associated with it.


from azureml.core import Workspace, Experiment, Model

import joblib

import os


# Load the Azure ML workspace from the saved config file

ws = Workspace.from_config()

print('Ready to work with', ws.name)


# Save the trained model

model_file = 'diabetes_model.pkl'

joblib.dump(value=diabetes_model, filename=model_file)


# Register the model

print('Registering model...')

registered_model = Model.register(model_path=model_file,

model_name='diabetes_classifier',

workspace=ws)

model_id = registered_model.id



print('Model registered.', model_id)

Upload fairlearn metrics

from fairlearn.metrics._group_metric_set import _create_group_metric_set

from azureml.contrib.fairness import upload_dashboard_dictionary, download_dashboard_by_upload_id


# Create a dictionary of model(s) you want to assess for fairness

sf = { 'Age': S_test.Age}

ys_pred = { model_id:diabetes_model.predict(X_test) }

dash_dict = _create_group_metric_set(y_true=y_test,

predictions=ys_pred,

sensitive_features=sf,

prediction_type='binary_classification')


exp = Experiment(ws, 'mslearn-diabetes-fairness')

print(exp)


run = exp.start_logging()


# Upload the dashboard to Azure Machine Learning

try:
    dashboard_title = "Fairness insights of Diabetes Classifier"
    upload_id = upload_dashboard_dictionary(run,
                                            dash_dict,
                                            dashboard_name=dashboard_title)
    print("\nUploaded to id: {0}\n".format(upload_id))

    # To test the dashboard, you can download it
    downloaded_dict = download_dashboard_by_upload_id(run, upload_id)
    print(downloaded_dict)
finally:
    run.complete()

In Azure Machine Learning studio, open the run and click its Fairness tab, or view the run details in the notebook:

from azureml.widgets import RunDetails

RunDetails(run).show()

Mitigate unfairness

Use the GridSearch feature, which trains multiple models in an attempt to minimize the disparity of predictive performance for the sensitive features in the dataset (in this case, the age groups). Optimize the models by applying the EqualizedOdds parity constraint, which tries to ensure that models exhibit similar true and false positive rates for each sensitive feature grouping.

from fairlearn.reductions import GridSearch, EqualizedOdds

import joblib

import os


print('Finding mitigated models...')


# Train multiple models

sweep = GridSearch(DecisionTreeClassifier(),

constraints=EqualizedOdds(),

grid_size=20)


sweep.fit(X_train, y_train, sensitive_features=S_train.Age)

models = sweep.predictors_


# Save the models and get predictions from them (plus the original unmitigated one for comparison)

model_dir = 'mitigated_models'

os.makedirs(model_dir, exist_ok=True)

model_name = 'diabetes_unmitigated'

print(model_name)

joblib.dump(value=diabetes_model, filename=os.path.join(model_dir, '{0}.pkl'.format(model_name)))

predictions = {model_name: diabetes_model.predict(X_test)}

i = 0

for model in models:
    i += 1
    model_name = 'diabetes_mitigated_{0}'.format(i)
    print(model_name)
    joblib.dump(value=model, filename=os.path.join(model_dir, '{0}.pkl'.format(model_name)))
    predictions[model_name] = model.predict(X_test)

Use the dashboard wizard to visualize Age by Recall:


FairlearnDashboard(sensitive_features=S_test,

sensitive_feature_names=['Age'],

y_true=y_test,

y_pred=predictions)

Upload the dashboard to the workspace

# Register the models

registered_model_predictions = dict()

for model_name, prediction_data in predictions.items():
    model_file = os.path.join(model_dir, model_name + ".pkl")
    registered_model = Model.register(model_path=model_file,
                                      model_name=model_name,
                                      workspace=ws)
    registered_model_predictions[registered_model.id] = prediction_data


# Create a group metric set for binary classification based on the Age feature for all of the models

sf = { 'Age': S_test.Age}

dash_dict = _create_group_metric_set(y_true=y_test,

predictions=registered_model_predictions,

sensitive_features=sf,

prediction_type='binary_classification')


exp = Experiment(ws, "mslearn-diabetes-fairness")

print(exp)


run = exp.start_logging()

RunDetails(run).show()


# Upload the dashboard to Azure Machine Learning

try:
    dashboard_title = "Fairness Comparison of Diabetes Models"
    upload_id = upload_dashboard_dictionary(run,
                                            dash_dict,
                                            dashboard_name=dashboard_title)
    print("\nUploaded to id: {0}\n".format(upload_id))
finally:
    run.complete()


Monitor models with Azure Machine Learning

Enable Application Insights

from azureml.core import Workspace


ws = Workspace.from_config()

ws.get_details()['applicationInsights']

Enable at deployment time in the service configuration:

from azureml.core.webservice import AciWebservice

dep_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                memory_gb=1,
                                                enable_app_insights=True)

Or update an existing service:

service = ws.webservices['my-svc']

service.update(enable_app_insights=True)

Capture and view telemetry

Log data by writing it to standard output in the scoring script; the printed output is captured as trace telemetry:

import json
import joblib
from azureml.core.model import Model

def init():
    global model
    model = joblib.load(Model.get_model_path('my_model'))

def run(raw_data):
    data = json.loads(raw_data)['data']
    predictions = model.predict(data)
    log_txt = 'Data:' + str(data) + ' - Predictions:' + str(predictions)
    print(log_txt)
    return predictions.tolist()

Query the logs in Application Insights:

traces
|where message == "STDOUT"
  and customDimensions.["Service Name"] == "my-svc"
| project timestamp, customDimensions.Content

Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/16%20-%20Monitor%20a%20Model.ipynb

Monitor data drift with Azure Machine Learning

from azureml.datadrift import DataDriftDetector


monitor = DataDriftDetector.create_from_datasets(workspace=ws,

name='dataset-drift-detector',

baseline_data_set=train_ds,

target_data_set=new_data_ds,

compute_target='aml-cluster',

frequency='Week',

feature_list=['age','height', 'bmi'],

latency=24)

To backfill the monitor over a chosen time frame:

import datetime as dt


backfill = monitor.backfill( dt.datetime.now() - dt.timedelta(weeks=6), dt.datetime.now())

Configure alerts

The following schedules the data drift monitor to run every week and send an email alert if the drift magnitude is greater than 0.3:

from azureml.datadrift import AlertConfiguration

alert_email = AlertConfiguration('data_scientists@contoso.com')

monitor = DataDriftDetector.create_from_datasets(ws, 'dataset-drift-detector',
                                                 baseline_data_set, target_data_set,
                                                 compute_target=cpu_cluster,
                                                 frequency='Week', latency=2,
                                                 drift_threshold=.3,
                                                 alert_configuration=alert_email)


Exercise

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/17%20-%20Monitor%20Data%20Drift.ipynb

02 AutoML

https://microsoftlearning.github.io/mslearn-dp100/instructions/02-automated-ml.html


Build a Dataset

Create a new dataset from web files, using the following settings:

  • Basic Info:

    • Web URL: https://aka.ms/diabetes-data

    • Name: diabetes dataset

    • Dataset type: Tabular

    • Description: Diabetes data

  • Settings and preview:

    • File format: Delimited

    • Delimiter: Comma

    • Encoding: UTF-8

    • Column headers: Use headers from first file

    • Skip rows: None

  • Schema:

    • Include all columns other than Path

    • Review the automatically detected types

  • Confirm details:

    • Do not profile the dataset after creation

After the dataset has been created, open it and view the Explore page to see a sample of the data.
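The same dataset can also be created and registered from the SDK rather than the studio wizard. A minimal sketch, assuming the ws workspace object used earlier:

from azureml.core import Dataset

# Create a tabular dataset from the web file and register it in the workspace
web_path = 'https://aka.ms/diabetes-data'
diabetes_ds = Dataset.Tabular.from_delimited_files(path=web_path)
diabetes_ds = diabetes_ds.register(workspace=ws,
                                   name='diabetes dataset',
                                   description='Diabetes data',
                                   create_new_version=True)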

View the Automated ML page (under Author).

Create a new Automated ML run with the following settings (an equivalent SDK sketch follows the list):

  • Select dataset:

    • Dataset: diabetes dataset

  • Configure run:

    • New experiment name: mslearn-automl-diabetes

    • Target column: Diabetic (this is the label the model will be trained to predict)

    • Select compute cluster: the compute cluster you created previously

  • Task type and settings:

    • Task type: Classification

    • Additional configuration settings:

      • Primary metric: Select AUC_Weighted (more about this metric later!)

      • Explain best model: Selected - this option causes automated machine learning to calculate feature importance for the best model, making it possible to determine the influence of each feature on the predicted label.

      • Blocked algorithms: Leave all algorithms selected

      • Exit criterion:

        • Training job time (hours): 0.25 - this causes the experiment to end after a maximum of 15 minutes.

        • Metric score threshold: 0.90 - this causes the experiment to end if a model achieves a weighted AUC metric of 90% or higher.

    • Featurization settings:

      • Enable featurization: Selected - this causes Azure Machine Learning to automatically preprocess the features before training.
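An equivalent SDK configuration would look roughly like the following sketch; the diabetes_ds dataset and the 'aml-cluster' compute name are assumptions carried over from earlier steps:

from azureml.core import Experiment
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task='classification',
                             training_data=diabetes_ds,     # registered tabular dataset
                             label_column_name='Diabetic',
                             compute_target='aml-cluster',  # assumed cluster name
                             primary_metric='AUC_weighted',
                             experiment_timeout_hours=0.25, # end after 15 minutes
                             experiment_exit_score=0.90,    # end at weighted AUC >= 0.90
                             model_explainability=True,     # explain the best model
                             featurization='auto')          # automatic preprocessing

automl_experiment = Experiment(ws, 'mslearn-automl-diabetes')
automl_run = automl_experiment.submit(automl_config)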


Review the best model

  1. On the Details tab of the automated machine learning run, note the best model summary.

  2. Select the Algorithm name for the best model to view the child run that produced it.

  3. Next to the AUC_Weighted value, select View all other metrics to see values of other possible evaluation metrics for a classification model.

  4. Select the Metrics tab and review the performance metrics you can view for the model. These include a confusion_matrix visualization showing the confusion matrix for the validated model, and an accuracy_table visualization that includes the ROC chart.

  5. Select the Explanations tab, and view the Global Importance chart. This shows the extent to which each feature in the dataset influences the label prediction. (An SDK sketch for retrieving the best run follows this list.)
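If the run was submitted through the SDK (as in the sketch above), the best child run and its fitted model can also be retrieved programmatically. A minimal sketch, assuming the automl_run object from the previous example:

# Retrieve the best child run and its fitted model, then list its metrics
best_run, fitted_model = automl_run.get_output()
print(best_run)
for metric_name, metric_value in best_run.get_metrics().items():
    print(metric_name, ':', metric_value)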

Deploy a predictive service

Note: In Azure Machine Learning, you can deploy a service as an Azure Container Instance (ACI) or to an Azure Kubernetes Service (AKS) cluster. AKS is recommended for production use; ACI is fine for testing.

  1. Select the Details tab for the run that produced the best model.

  2. Use the Deploy button to deploy the model with the following settings:

    • Name: auto-predict-diabetes

    • Description: Predict diabetes

    • Compute type: ACI

    • Enable authentication: Selected

  3. Wait for the deployment to start - this may take a few seconds. Then, on the Model tab, in the Model summary section, observe the Deploy status for the auto-predict-diabetes service, which should be Running. Wait for this status to change to Successful. You may need to select ↻ Refresh periodically.

In Azure Machine Learning studio, view the Endpoints page (under Assets) and select the auto-predict-diabetes real-time endpoint. Then select the Consume tab and note its REST endpoint and primary key values.

Test the deployed service

  1. With the Consume page for the auto-predict-diabetes service open in your browser, open a new browser tab and open a second instance of Azure Machine Learning studio. Then in the new tab, view the Notebooks page.

  2. In the Notebooks page, under My files, browse to the Users/mslearn-dp100 folder where you cloned the notebook repository, and open the Get AutoML Prediction notebook.

https://github.com/MicrosoftLearning/mslearn-dp100/blob/main/02%20-%20Get%20AutoML%20Prediction.ipynb

  3. When the notebook has opened, ensure that the compute instance you created previously is selected in the Compute box, and that it has a status of Running.

  4. In the notebook, replace the ENDPOINT and PRIMARY_KEY placeholders with the values for your service, which you can copy from the Consume tab on the page for your endpoint (a sketch of the request follows).
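The call the notebook makes looks roughly like this sketch. The endpoint URL, key, and feature values are placeholders to replace with your own, and the {'data': ...} payload shape follows the scoring-script pattern used earlier in these notes:

import requests

endpoint = 'ENDPOINT'    # REST endpoint from the Consume tab (placeholder)
key = 'PRIMARY_KEY'      # primary key from the Consume tab (placeholder)

# A hypothetical patient record with the eight feature values used by the model
x_new = {'data': [[2, 180, 74, 24, 21, 23.9, 1.4, 22]]}

headers = {'Content-Type': 'application/json',
           'Authorization': 'Bearer ' + key}

response = requests.post(endpoint, json=x_new, headers=headers)
print(response.json())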