import pandas as pd
import numpy as np

# Show up to 100 columns when a DataFrame is printed.
pd.set_option("display.max_columns", 100)

# Load the Iris data set straight from the remote CSV file.
url = 'https://raw.githubusercontent.com/nyp-sit/data/master/Iris_Data.csv'
data = pd.read_csv(url)

# Preview the first five rows.
print(data.head())

# Recorded output:
#    sepal_length  sepal_width  petal_length  petal_width      species
# 0           5.1          3.5           1.4          0.2  Iris-setosa
# 1           4.9          3.0           1.4          0.2  Iris-setosa
# 2           4.7          3.2           1.3          0.2  Iris-setosa
# 3           4.6          3.1           1.5          0.2  Iris-setosa
# 4           5.0          3.6           1.4          0.2  Iris-setosa
# Number of rows (first element of the DataFrame's shape).
print("Number of rows:", data.shape[0])

# Column names as a plain Python list.
print(data.columns.tolist())

# Data type of every column.
print(data.dtypes)

# Recorded output:
# Number of rows: 150
# ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
# sepal_length    float64
# sepal_width     float64
# petal_length    float64
# petal_width     float64
# species          object
# dtype: object
# Remove the literal "Iris-" prefix from every species name.
# regex=False is passed explicitly because the default for `regex` differs
# between pandas versions; the pattern is meant as plain text, not a regex.
data['species'] = data['species'].str.replace("Iris-", "", regex=False)
print(data.head())

# Recorded output:
#    sepal_length  sepal_width  petal_length  petal_width species
# 0           5.1          3.5           1.4          0.2  setosa
# 1           4.9          3.0           1.4          0.2  setosa
# 2           4.7          3.2           1.3          0.2  setosa
# 3           4.6          3.1           1.5          0.2  setosa
# 4           5.0          3.6           1.4          0.2  setosa
# Count how many rows belong to each species.
# The result is printed explicitly: a bare expression is only displayed by a
# notebook and would be silently discarded when this runs as a script.
print(data["species"].value_counts())

# Recorded output:
# versicolor    50
# virginica     50
# setosa        50
# Name: species, dtype: int64
# Build a compact summary table from `describe()`: keep only the rows of
# interest, add a "range" row and relabel the 50% quantile as the median.
stats_df = data.describe()
print(stats_df)

# Range of each measurement = max - min, stored as an extra row.
stats_df.loc["range"] = stats_df.loc["max"] - stats_df.loc["min"]

# The five summary rows to present, in display order.
out_fields = ["mean", "25%", "50%", "75%", "range"]
stats_df = stats_df.loc[out_fields]

# The 50% quantile is the median; relabel that row accordingly.
stats_df = stats_df.rename(index={"50%": "median"})
print(stats_df)

# Recorded output:
#        sepal_length  sepal_width  petal_length  petal_width
# count    150.000000   150.000000    150.000000   150.000000
# mean       5.843333     3.054000      3.758667     1.198667
# std        0.828066     0.433594      1.764420     0.763161
# min        4.300000     2.000000      1.000000     0.100000
# 25%        5.100000     2.800000      1.600000     0.300000
# 50%        5.800000     3.000000      4.350000     1.300000
# 75%        6.400000     3.300000      5.100000     1.800000
# max        7.900000     4.400000      6.900000     2.500000
#         sepal_length  sepal_width  petal_length  petal_width
# mean        5.843333        3.054      3.758667     1.198667
# 25%         5.100000        2.800      1.600000     0.300000
# median      5.800000        3.000      4.350000     1.300000
# 75%         6.400000        3.300      5.100000     1.800000
# range       3.600000        2.400      5.900000     2.400000
# Mean sepal length and width for each species.
# The columns are selected with a list (double brackets): selecting several
# group-by columns with a bare tuple is deprecated and rejected by newer
# pandas releases.
print(data.groupby("species")[["sepal_length", "sepal_width"]].mean())

# Recorded output:
#             sepal_length  sepal_width
# species
# setosa             5.006        3.418
# versicolor         5.936        2.770
# virginica          6.588        2.974
# Mean of every measurement, computed separately for each species.
print(data.groupby("species").mean())

# Recorded output:
#             sepal_length  sepal_width  petal_length  petal_width
# species
# setosa             5.006        3.418         1.464        0.244
# versicolor         5.936        2.770         4.260        1.326
# virginica          6.588        2.974         5.552        2.026
# Median of every measurement, computed separately for each species.
print(data.groupby('species').median())

# Recorded output:
#             sepal_length  sepal_width  petal_length  petal_width
# species
# setosa               5.0          3.4          1.50          0.2
# versicolor           5.9          2.8          4.35          1.3
# virginica            6.5          3.0          5.55          2.0
# Mean and median of every measurement in a single aggregation.
# The aggregations are named by string rather than passed as NumPy callables
# (np.mean / np.median): newer pandas releases warn about the callable form,
# while the string form yields the same 'mean' / 'median' column labels.
print(data.groupby('species').agg(['mean', 'median']))

# Recorded output:
#            sepal_length        sepal_width        petal_length        petal_width
#                    mean median        mean median         mean median        mean median
# species
# setosa            5.006    5.0       3.418    3.4        1.464   1.50       0.244    0.2
# versicolor        5.936    5.9       2.770    2.8        4.260   4.35       1.326    1.3
# virginica         6.588    6.5       2.974    3.0        5.552   5.55       2.026    2.0
# pprint "pretty-prints" arbitrary Python data structures in a form that can
# be used as input to the interpreter.
from pprint import pprint

# When fields need different aggregations, describe them in a dict:
# every non-species column gets both mean and median ...
agg_dict = {
    field: ['mean', 'median']
    for field in data.columns
    if field != 'species'
}
# ... except petal_length, which is overridden to report only its maximum.
agg_dict['petal_length'] = 'max'
pprint(agg_dict)

# Recorded output:
# {'petal_length': 'max',
#  'petal_width': ['mean', 'median'],
#  'sepal_length': ['mean', 'median'],
#  'sepal_width': ['mean', 'median']}
# Apply the per-column aggregation mapping held in `agg_dict` to each species.
print(data.groupby("species").agg(agg_dict))

# Recorded output:
#            sepal_length        sepal_width        petal_length petal_width
#                    mean median        mean median          max        mean median
# species
# setosa            5.006    5.0       3.418    3.4          1.9       0.244    0.2
# versicolor        5.936    5.9       2.770    2.8          5.1       1.326    1.3
# virginica         6.588    6.5       2.974    3.0          6.9       2.026    2.0
# NOTE(review): `sns` and `plt` are not imported anywhere in the visible
# source. Presumably `import seaborn as sns` and
# `import matplotlib.pyplot as plt` were run earlier - confirm, otherwise the
# seaborn/matplotlib calls below raise NameError.

# Make a boxplot of each petal and sepal measurement, grouped by species.
# Alternative call with explicit columns and rotated tick labels:
# data.boxplot(column=["petal_width", "petal_length", "sepal_width", "sepal_length"], by="species", rot=90)
# (The trailing semicolons are kept from the notebook, where they presumably
# suppress the echoed return value of the last expression in a cell.)
data.boxplot(by='species');
# plt.show()

# Create a single boxplot where the features are separated on the x-axis and
# the species are coloured with different hues.
# First reshape the data to long form so that each row holds exactly one
# measurement: species, measurement name, and its size.
plot_data = (
    data
    .set_index('species')
    .stack()
    .to_frame()
    .reset_index()
    .rename(columns={0: 'size', 'level_1': 'measurement'})
)
print(plot_data.head())

# Plot the long-form dataframe with Seaborn.
sns.set_style('white')
sns.set_context('notebook')
sns.set_palette('dark')
f = plt.figure(figsize=(6, 4))
sns.boxplot(x='measurement', y='size', hue='species', data=plot_data);
# plt.show()