import pandas as pd
import numpy as np
# Allow up to 100 columns to be shown when a DataFrame is printed.
pd.options.display.max_columns = 100

# Load the Iris dataset straight from its hosted CSV and preview the first rows.
url = 'https://raw.githubusercontent.com/nyp-sit/data/master/Iris_Data.csv'
data = pd.read_csv(filepath_or_buffer=url)
print(data.head(n=5))
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
# Basic structure of the dataset: row count, column labels and per-column dtypes.
row_count = len(data.index)
print("Number of rows:", row_count)
print(list(data.columns))
print(data.dtypes)
Number of rows: 150
['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
sepal_length float64
sepal_width float64
petal_length float64
petal_width float64
species object
dtype: object
# Drop the "Iris-" text from every species label (e.g. "Iris-setosa" -> "setosa").
# regex=False forces a literal match so the result does not depend on the
# pandas version's default for the `regex` argument.
data['species'] = data['species'].str.replace("Iris-", "", regex=False)
print(data.head())
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
# Count how many rows belong to each species.
# Printed explicitly: a bare expression only displays in a notebook cell and
# would be silently discarded when this file is run as a script.
print(data["species"].value_counts())
versicolor 50
virginica 50
setosa 50
Name: species, dtype: int64
# Build a compact summary table: start from `describe()`, add a "range" row,
# keep only the rows of interest and label the 50th percentile as "median".
stats_df = data.describe()
print(stats_df)

# Range of each numeric column = max - min.
stats_df.loc["range"] = stats_df.loc["max"].sub(stats_df.loc["min"])

# Rows (statistics) to present, in display order.
out_fields = ["mean", "25%", "50%", "75%", "range"]

# Select those rows and relabel "50%" as "median" in one step.
stats_df = stats_df.loc[out_fields].rename(index={"50%": "median"})
print(stats_df)
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
sepal_length sepal_width petal_length petal_width
mean 5.843333 3.054 3.758667 1.198667
25% 5.100000 2.800 1.600000 0.300000
median 5.800000 3.000 4.350000 1.300000
75% 6.400000 3.300 5.100000 1.800000
range 3.600000 2.400 5.900000 2.400000
# Mean sepal length and width for each species.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names is deprecated in pandas 1.x and raises in pandas 2.x.
print(data.groupby("species")[["sepal_length", "sepal_width"]].mean())
sepal_length sepal_width
species
setosa 5.006 3.418
versicolor 5.936 2.770
virginica 6.588 2.974
# Per-species mean of every remaining (non-grouping) column.
species_means = data.groupby(by='species').mean()
print(species_means)
sepal_length sepal_width petal_length petal_width
species
setosa 5.006 3.418 1.464 0.244
versicolor 5.936 2.770 4.260 1.326
virginica 6.588 2.974 5.552 2.026
# Per-species median of every remaining (non-grouping) column.
species_medians = data.groupby(by="species").median()
print(species_medians)
sepal_length sepal_width petal_length petal_width
species
setosa 5.0 3.4 1.50 0.2
versicolor 5.9 2.8 4.35 1.3
virginica 6.5 3.0 5.55 2.0
# Mean and median of every feature per species, computed in a single pass.
# The aggregations are named by string rather than passed as numpy callables:
# recent pandas versions deprecate passing np.mean / np.median to `agg`, and
# the string form yields the same "mean" / "median" column labels.
print(data.groupby('species').agg(['mean', 'median']))
sepal_length sepal_width petal_length petal_width
mean median mean median mean median mean median
species
setosa 5.006 5.0 3.418 3.4 1.464 1.50 0.244 0.2
versicolor 5.936 5.9 2.770 2.8 4.260 4.35 1.326 1.3
virginica 6.588 6.5 2.974 3.0 5.552 5.55 2.026 2.0
# Different columns can be aggregated differently by passing `agg` a dict
# that maps each column name to the aggregation(s) wanted for it.
# `pprint` is used to display the resulting dict in a readable layout.
from pprint import pprint

# Default: mean and median for every column except the grouping key.
agg_dict = {}
for column in data.columns:
    if column == 'species':
        continue
    agg_dict[column] = ['mean', 'median']

# Override: only the maximum is wanted for petal_length.
agg_dict['petal_length'] = 'max'
pprint(agg_dict)
{'petal_length': 'max',
'petal_width': ['mean', 'median'],
'sepal_length': ['mean', 'median'],
'sepal_width': ['mean', 'median']}
# Apply the per-column aggregation mapping to each species group.
per_species = data.groupby(by='species')
print(per_species.agg(agg_dict))
sepal_length sepal_width petal_length petal_width
mean median mean median max mean median
species
setosa 5.006 5.0 3.418 3.4 1.9 0.244 0.2
versicolor 5.936 5.9 2.770 2.8 5.1 1.326 1.3
virginica 6.588 6.5 2.974 3.0 6.9 2.026 2.0
# Draw a boxplot of each petal and sepal measurement, grouped by species.
# No `column=` list is passed, so pandas chooses the columns to plot itself;
# pass e.g. column=[...] and rot=90 to control the selection and label rotation.
# Outside a notebook the figure still has to be shown explicitly (plt.show()).
data.boxplot(by='species')
# Prepare data for a single boxplot with the features along the x-axis and the
# species distinguished by hue. This needs "long" format: one measurement per
# row, with columns for the species, the measurement name and its value.
stacked = data.set_index('species').stack()   # index becomes (species, feature name)
long_frame = stacked.to_frame().reset_index()  # value column is labelled 0, feature column 'level_1'
plot_data = long_frame.rename(columns={0: 'size', 'level_1': 'measurement'})
print(plot_data.head())
# Draw the long-format data with Seaborn: one group of boxes per measurement,
# coloured by species.
# NOTE(review): `sns` and `plt` are not imported in the visible part of this
# file -- presumably `import seaborn as sns` and
# `import matplotlib.pyplot as plt` are expected; confirm and add them.
sns.set_style('white')
sns.set_context('notebook')
sns.set_palette('dark')

f = plt.figure(figsize=(6, 4))
sns.boxplot(data=plot_data, x='measurement', y='size', hue='species')
# Outside a notebook the figure still has to be shown explicitly (plt.show()).