3. Machine Learning Process

import pandas as pd

import matplotlib.pyplot as plt

# Raw-file URL of the housing dataset (CSV) in the ageron/handson-ml GitHub repository.
url = (
    'https://raw.githubusercontent.com/ageron/handson-ml/master/'
    'datasets/housing/housing.csv'
)

# pandas fetches and parses the CSV straight from the URL, so this step needs network access.
df = pd.read_csv(url)

# Preview the first five rows to confirm the data loaded as expected.
print(df.head(n=5))

longitude latitude ... median_house_value ocean_proximity

0 -122.23 37.88 ... 452600.0 NEAR BAY

1 -122.22 37.86 ... 358500.0 NEAR BAY

2 -122.24 37.85 ... 352100.0 NEAR BAY

3 -122.25 37.85 ... 341300.0 NEAR BAY

4 -122.25 37.85 ... 342200.0 NEAR BAY

# Report the DataFrame's dimensions as a (rows, columns) tuple, under a heading line.
print('>>> Check the sample size:', df.shape, sep='\n')

>>> Check the sample size:

(20640, 10)

print('>>> Check for the features: ')

# describe() computes count/mean/std/min/quartiles/max per column; by default
# only the numeric columns are summarised, so the text column is left out.
feature_stats = df.describe()

print(feature_stats)

>>> Check for the features:

longitude latitude ... median_income median_house_value

count 20640.000000 20640.000000 ... 20640.000000 20640.000000

mean -119.569704 35.631861 ... 3.870671 206855.816909

std 2.003532 2.135952 ... 1.899822 115395.615874

min -124.350000 32.540000 ... 0.499900 14999.000000

25% -121.800000 33.930000 ... 2.563400 119600.000000

50% -118.490000 34.260000 ... 3.534800 179700.000000

75% -118.010000 37.710000 ... 4.743250 264725.000000

max -114.310000 41.950000 ... 15.000100 500001.000000

[8 rows x 9 columns]

print('>>> Check for missing values')

# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own: wrapping it in print() only appends a stray "None" line
# after the report.
# A column whose non-null count is lower than the number of entries contains
# missing values.
df.info()

>>> Check for missing values

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 20640 entries, 0 to 20639

Data columns (total 10 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 longitude 20640 non-null float64

1 latitude 20640 non-null float64

2 housing_median_age 20640 non-null float64

3 total_rooms 20640 non-null float64

4 total_bedrooms 20433 non-null float64

5 population 20640 non-null float64

6 households 20640 non-null float64

7 median_income 20640 non-null float64

8 median_house_value 20640 non-null float64

9 ocean_proximity 20640 non-null object

dtypes: float64(9), object(1)

memory usage: 1.6+ MB

None

import matplotlib.pyplot as plt

# Draw histograms of the DataFrame's columns, 50 bins each, on a 10x10-inch figure.
df.hist(figsize=(10, 10), bins=50)

# Render the figure (opens a window or inline output, depending on the backend).
plt.show()