1. Introduction to Machine Learning Toolkit

import pandas as pd

import numpy as np

# Allow up to 100 columns to be shown when a DataFrame is printed.
pd.set_option("display.max_columns", 100)

# Location of the CSV file analysed in the rest of this walkthrough.
url = "https://raw.githubusercontent.com/nyp-sit/data/master/Iris_Data.csv"

# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(url)
print(data.head(5))

sepal_length sepal_width petal_length petal_width species

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

Number of rows, Column names, and Data types

# Number of rows in the DataFrame.
print("Number of rows:", len(data))

# Column labels, as a plain Python list.
print(list(data.columns))

# The dtype pandas assigned to each column.
print(data.dtypes)

Number of rows: 150

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

sepal_length float64

sepal_width float64

petal_length float64

petal_width float64

species object

dtype: object

Removing the "Iris-" prefix from the species names

# Remove the "Iris-" text from every species label (e.g. "Iris-setosa"
# becomes "setosa"). regex=False requests a literal substring replacement
# explicitly, so the result does not depend on the pandas version: the default
# for `regex` changed from True to False in pandas 2.0.
data["species"] = data["species"].str.replace("Iris-", "", regex=False)

print(data.head())

sepal_length sepal_width petal_length petal_width species

0 5.1 3.5 1.4 0.2 setosa

1 4.9 3.0 1.4 0.2 setosa

2 4.7 3.2 1.3 0.2 setosa

3 4.6 3.1 1.5 0.2 setosa

4 5.0 3.6 1.4 0.2 setosa

Counting the species and summarising the dataset

# Count how many rows belong to each species. The result is printed
# explicitly: a bare expression is only displayed inside a notebook cell and
# produces no output when this file runs as a script.
print(data["species"].value_counts())

versicolor 50

virginica 50

setosa 50

Name: species, dtype: int64

# Start from the summary table produced by `describe` and show it unchanged.
stats_df = data.describe()
print(stats_df)

# Append a "range" row: the "max" row minus the "min" row, column by column.
stats_df.loc["range"] = stats_df.loc["max"] - stats_df.loc["min"]

# Keep only the five statistics (rows) to present and relabel the "50%" row
# as "median".
out_fields = ["mean", "25%", "50%", "75%", "range"]
stats_df = stats_df.loc[out_fields].rename(index={"50%": "median"})

print(stats_df)

sepal_length sepal_width petal_length petal_width

count 150.000000 150.000000 150.000000 150.000000

mean 5.843333 3.054000 3.758667 1.198667

std 0.828066 0.433594 1.764420 0.763161

min 4.300000 2.000000 1.000000 0.100000

25% 5.100000 2.800000 1.600000 0.300000

50% 5.800000 3.000000 4.350000 1.300000

75% 6.400000 3.300000 5.100000 1.800000

max 7.900000 4.400000 6.900000 2.500000

sepal_length sepal_width petal_length petal_width

mean 5.843333 3.054 3.758667 1.198667

25% 5.100000 2.800 1.600000 0.300000

median 5.800000 3.000 4.350000 1.300000

75% 6.400000 3.300 5.100000 1.800000

range 3.600000 2.400 5.900000 2.400000

Groupby

# Mean sepal length and sepal width for each species. The two columns are
# selected with a list (double brackets): selecting several groupby columns
# with a bare tuple of names was deprecated and raises an error from
# pandas 2.0 onwards.
print(data.groupby("species")[["sepal_length", "sepal_width"]].mean())

sepal_length sepal_width

species

setosa 5.006 3.418

versicolor 5.936 2.770

virginica 6.588 2.974

# Per-species mean of every column other than the grouping key.
print(data.groupby(by="species").mean())

sepal_length sepal_width petal_length petal_width

species

setosa 5.006 3.418 1.464 0.244

versicolor 5.936 2.770 4.260 1.326

virginica 6.588 2.974 5.552 2.026

# Per-species median of every column other than the grouping key.
print(data.groupby(by="species").median())

sepal_length sepal_width petal_length petal_width

species

setosa 5.0 3.4 1.50 0.2

versicolor 5.9 2.8 4.35 1.3

virginica 6.5 3.0 5.55 2.0

# Compute the mean and the median of every feature in a single pass.
# The aggregations are named by string rather than passed as np.mean /
# np.median: recent pandas versions emit a FutureWarning for NumPy callables
# in `agg` and recommend the string names to keep the current behaviour.
# The resulting column labels ("mean", "median") are unchanged.
print(data.groupby("species").agg(["mean", "median"]))

sepal_length sepal_width petal_length petal_width

mean median mean median mean median mean median

species

setosa 5.006 5.0 3.418 3.4 1.464 1.50 0.244 0.2

versicolor 5.936 5.9 2.770 2.8 4.260 4.35 1.326 1.3

virginica 6.588 6.5 2.974 3.0 5.552 5.55 2.026 2.0

# pprint "pretty-prints" Python data structures in a readable layout.
from pprint import pprint

# When fields need different aggregations, describe them in a dict that maps
# each column name to the aggregation(s) to apply. Every column except
# 'species' gets mean and median; 'petal_length' is then overridden to use
# only the maximum.
agg_dict = {
    **{name: ['mean', 'median'] for name in data.columns if name != 'species'},
    'petal_length': 'max',
}

pprint(agg_dict)

{'petal_length': 'max',

'petal_width': ['mean', 'median'],

'sepal_length': ['mean', 'median'],

'sepal_width': ['mean', 'median']}

# Apply the per-column aggregations from agg_dict to each species group.
print(data.groupby(by='species').aggregate(agg_dict))

sepal_length sepal_width petal_length petal_width

mean median mean median max mean median

species

setosa 5.006 5.0 3.418 3.4 1.9 0.244 0.2

versicolor 5.936 5.9 2.770 2.8 5.1 1.326 1.3

virginica 6.588 6.5 2.974 3.0 6.9 2.026 2.0

# Draw box plots of the measurement columns, grouped by species.
# Disabled alternative that names the columns and rotates the tick labels:
# data.boxplot(column=["petal_width", "petal_length", "sepal_width", "sepal_length"], by="species", rot=90)
data.boxplot(by="species")

# plt.show()

# Goal: a single boxplot with the features along the x-axis and the species
# distinguished by hue. That needs the data in long form, so reshape it to
# one row per (species, measurement) pair with the value in a "size" column.
# The closing parenthesis of the chained expression was missing, which made
# this statement a SyntaxError.
plot_data = (
    data
    .set_index("species")
    .stack()       # move the column labels into a second index level
    .to_frame()    # the stacked Series becomes a one-column frame named 0
    .reset_index()
    .rename(columns={0: "size", "level_1": "measurement"})
)

print(plot_data.head())

# Plot the reshaped frame with Seaborn: one group of boxes per measurement,
# one box per species within each group.
# NOTE(review): `sns` and `plt` are not imported in the visible part of this
# file; presumably `import seaborn as sns` and
# `import matplotlib.pyplot as plt` are intended. Confirm before running.
sns.set_style("white")
sns.set_context("notebook")
sns.set_palette("dark")

f = plt.figure(figsize=(6, 4))
sns.boxplot(data=plot_data, x="measurement", y="size", hue="species")

# plt.show()