import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (for example a pandas DataFrame).
        test_ratio: Fraction of rows to put in the test set, between 0 and 1.
            The test size is ``int(len(data) * test_ratio)``, i.e. rounded
            down.
        random_state: Optional seed. With ``None`` (the default) NumPy's
            global random state is used, so repeated calls give different
            splits. With a seed, a dedicated ``RandomState`` is used and the
            split is reproducible.

    Returns:
        A ``(train_set, test_set)`` tuple of disjoint row selections.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would make the two slices below overlap, leaking
    # test rows into the training set.
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")

    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        rng = np.random.RandomState(random_state)
        shuffled_indices = rng.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
# Hold out 20% of the rows using the purely random splitter.
train_set, test_set = split_train_test(df, test_ratio=0.2)
print(f"{len(train_set)} train {len(test_set)} test")
# print(test_set.head())
16512 train 4128 test
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
# Use pd.cut() to bucket median_income into five categories labelled 1-5,
# then print how many rows fall into each category.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df["income_cat"] = pd.cut(
    df["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)
print(df["income_cat"].value_counts().sort_index())
1 822
2 6581
3 7236
4 3639
5 2362
Name: income_cat, dtype: int64
from sklearn.model_selection import train_test_split
# Stratified 80/20 split on income_cat with a fixed seed.
# (An earlier unseeded train_test_split(..., stratify=...) call was removed:
# its results were overwritten by the loop below before ever being used.)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# For reference, the overall proportions: df['income_cat'].value_counts() / len(df)
for train_index, test_index in split.split(df, df['income_cat']):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # only pick the right rows when df has a default 0..n-1 index.
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]
print(len(strat_train_set), 'train', len(strat_test_set), 'test')
16512 train 4128 test
# Show the first rows of the stratified test set as a quick sanity check.
print(strat_test_set.head())
longitude latitude ... ocean_proximity income_cat
5241 -118.39 34.12 ... <1H OCEAN 5
10970 -117.86 33.77 ... <1H OCEAN 4
20351 -119.05 34.21 ... <1H OCEAN 3
6568 -118.15 34.20 ... INLAND 3
13285 -117.68 34.07 ... INLAND 3
[5 rows x 11 columns]
def income_cat_props(data):
    """Return the fraction of rows of ``data`` in each ``income_cat`` value.

    Args:
        data: DataFrame-like object with an ``income_cat`` column.

    Returns:
        A Series indexed by category value; each entry is that category's
        row count divided by ``len(data)``.
    """
    return data['income_cat'].value_counts() / len(data)
# Compare income-category proportions in the full data, the stratified test
# set and a purely random test set.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
proportions_by_sampling = {
    'Overall': income_cat_props(df),
    'Stratified': income_cat_props(strat_test_set),
    'Random': income_cat_props(test_set),
}
compare_props = pd.DataFrame(proportions_by_sampling).sort_index()
# Deviation of each test set from the overall proportion, in percent.
compare_props['Rand. %error'] = (
    100 * compare_props['Random'] / compare_props['Overall'] - 100
)
compare_props['Strat. %error'] = (
    100 * compare_props['Stratified'] / compare_props['Overall'] - 100
)
print(compare_props)
Overall Stratified Random Rand. %error Strat. %error
1 0.039826 0.039729 0.040213 0.973236 -0.243309
2 0.318847 0.318798 0.324370 1.732260 -0.015195
3 0.350581 0.350533 0.358527 2.266446 -0.013820
4 0.176308 0.176357 0.167393 -5.056334 0.027480
5 0.114438 0.114583 0.109496 -4.318374 0.127011
# Import pyplot before its first use; it was previously imported only after
# the first plt.show() call.
import matplotlib.pyplot as plt

# Work on a copy so exploration cannot modify the stratified training set.
housing = strat_train_set.copy()

# scatter plot of all districts
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)
plt.show()

# Same map with marker size set to population / 100 and marker colour set
# to median_house_value.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population'] / 100, label='population',
             c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()
plt.show()
# Correlate numeric columns only: housing also contains the text column
# ocean_proximity, which DataFrame.corr() rejects on pandas >= 2.0.
# select_dtypes keeps this working on older pandas versions as well.
corr_matrix = housing.select_dtypes(include='number').corr()
print(corr_matrix['median_house_value'].sort_values(ascending=False))
median_house_value 1.000000
median_income 0.687160
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population -0.026920
longitude -0.047432
latitude -0.142724
Name: median_house_value, dtype: float64
# Pairwise scatter plots for a handful of attributes.
from pandas.plotting import scatter_matrix
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

# Median income against median house value.
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)
plt.show()

# Rows whose median_house_value lies strictly between 340000 and 360000.
house_value = housing['median_house_value']
print(housing[(house_value > 340000) & (house_value < 360000)])
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity income_cat
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 340600.0 <1H OCEAN 5
1844 -122.28 37.91 38.0 2501.0 348.0 805.0 329.0 6.5576 358500.0 NEAR BAY 5
16969 -122.31 37.54 42.0 1159.0 261.0 465.0 247.0 3.1842 352800.0 NEAR OCEAN 3
8709 -118.35 33.85 34.0 1770.0 291.0 916.0 289.0 5.0000 354200.0 <1H OCEAN 4
18684 -121.82 36.95 16.0 2599.0 430.0 1417.0 445.0 4.6611 349300.0 <1H OCEAN 4
... ... ... ... ... ... ... ... ... ... ... ...
15829 -122.43 37.75 52.0 1970.0 495.0 871.0 474.0 4.0625 355600.0 NEAR BAY 3
8725 -118.36 33.83 35.0 2828.0 487.0 1439.0 490.0 5.6013 350200.0 <1H OCEAN 4
1545 -121.93 37.73 8.0 831.0 231.0 404.0 224.0 3.3750 350000.0 <1H OCEAN 3
5618 -118.23 33.78 20.0 59.0 24.0 69.0 23.0 2.5588 350000.0 NEAR OCEAN 2
15270 -117.29 33.08 18.0 3225.0 515.0 1463.0 476.0 5.7787 346700.0 NEAR OCEAN 4
[427 rows x 11 columns]