2. Introduction to Machine Learning

# Basic Data Analysis

import pandas as pd

# Raw-CSV location of the Iris dataset hosted on GitHub.
url = 'https://raw.githubusercontent.com/nyp-sit/data/master/Iris_Data.csv'

# Download and parse the CSV into a DataFrame, then show the first five
# rows as a quick sanity check that the load worked.
df = pd.read_csv(url)
print(df.head(n=5))

sepal_length sepal_width petal_length petal_width species

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

# Check the sample size
# df.shape is a (rows, columns) tuple; the heading and the tuple are
# emitted on separate lines by a single print call.
print("Check the sample size:", df.shape, sep="\n")

Check the sample size:

(150, 5)

# Check the features
# One print call emits, each on its own line(s): the heading, the summary
# statistics table from describe(), and the list of all column names.
print(
    "Check the features:",
    df.describe(),
    df.columns.tolist(),  # print the column names
    sep="\n",
)

Check the features:

sepal_length sepal_width petal_length petal_width

count 150.000000 150.000000 150.000000 150.000000

mean 5.843333 3.054000 3.758667 1.198667

std 0.828066 0.433594 1.764420 0.763161

min 4.300000 2.000000 1.000000 0.100000

25% 5.100000 2.800000 1.600000 0.300000

50% 5.800000 3.000000 4.350000 1.300000

75% 6.400000 3.300000 5.100000 1.800000

max 7.900000 4.400000 6.900000 2.500000

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# Check for missing values
print("Check for missing values:")

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called as a plain statement; wrapping it in print() would append
# a stray "None" line after the report.
# The "Non-Null Count" column of the report shows how many entries are
# present in each column; compare it with the total number of entries.
# (df.isnull().sum() is an alternative that gives explicit per-column
# counts of missing entries.)
df.info()

Check for missing values:

RangeIndex: 150 entries, 0 to 149

Data columns (total 5 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 sepal_length 150 non-null float64

1 sepal_width 150 non-null float64

2 petal_length 150 non-null float64

3 petal_width 150 non-null float64

4 species 150 non-null object

dtypes: float64(4), object(1)

Google Sites

Report abuse