import pandas as pd
import matplotlib.pyplot as plt
# Remote location of the housing CSV in the handson-ml GitHub repository.
url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'

# Fetch the CSV over HTTP and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to sanity-check the load.
print(df.head(n=5))
longitude latitude ... median_house_value ocean_proximity
0 -122.23 37.88 ... 452600.0 NEAR BAY
1 -122.22 37.86 ... 358500.0 NEAR BAY
2 -122.24 37.85 ... 352100.0 NEAR BAY
3 -122.25 37.85 ... 341300.0 NEAR BAY
4 -122.25 37.85 ... 342200.0 NEAR BAY
# Report the dataset dimensions as a (rows, columns) tuple, preceded by a
# header line; sep='\n' keeps the header and the value on separate lines.
print('>>> Check the sample size:', df.shape, sep='\n')
>>> Check the sample size:
(20640, 10)
# Print summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, preceded by a header line on its own row.
print('>>> Check for the features: ', df.describe(), sep='\n')
>>> Check for the features:
longitude latitude ... median_income median_house_value
count 20640.000000 20640.000000 ... 20640.000000 20640.000000
mean -119.569704 35.631861 ... 3.870671 206855.816909
std 2.003532 2.135952 ... 1.899822 115395.615874
min -124.350000 32.540000 ... 0.499900 14999.000000
25% -121.800000 33.930000 ... 2.563400 119600.000000
50% -118.490000 34.260000 ... 3.534800 179700.000000
75% -118.010000 37.710000 ... 4.743250 264725.000000
max -114.310000 41.950000 ... 15.000100 500001.000000
[8 rows x 9 columns]
# Summarize each column's dtype and non-null count; a non-null count lower
# than the number of rows means that column contains missing values.
print('>>> Check for missing values')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
>>> Check for missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per numeric column on a single 10x10-inch figure.
df.hist(figsize=(10, 10), bins=50)

# Render the figure (blocks until the window is closed in interactive backends).
plt.show()