# Separate the predictors from the target column of the stratified training set.
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels=strat_train_set['median_house_value'].copy()
# Three alternative ways to deal with the missing total_bedrooms values.
# None of the calls below is assigned back or uses inplace=True, so they only
# demonstrate each option and leave `housing` itself unchanged.
# Option 1: get rid of districts with missing values
housing.dropna(subset=['total_bedrooms'])
# Option 2: get rid of the attribute
housing.drop('total_bedrooms', axis = 1)
# Option 3: set the missing values to the median of the column
median = housing['total_bedrooms'].median()
print(housing['total_bedrooms'].fillna(median))
17606 351.0
18632 108.0
14650 471.0
3230 371.0
3555 1525.0
...
6563 236.0
12053 294.0
13908 872.0
11159 380.0
15775 682.0
Name: total_bedrooms, Length: 16512, dtype: float64
# Boolean mask flagging every row that has at least one missing value.
rows_with_missing = housing.isnull().any(axis=1)
# Keep the first few such rows as a sample to inspect before/after imputation.
sample_incomplete_rows = housing.loc[rows_with_missing].head()
print(sample_incomplete_rows)
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat
4629 -118.30 34.07 18.0 3759.0 NaN 3296.0 1462.0 2.2708 <1H OCEAN 2
6068 -117.86 34.01 16.0 4632.0 NaN 3038.0 727.0 5.1762 <1H OCEAN 4
17923 -121.97 37.35 30.0 1955.0 NaN 999.0 386.0 4.6328 <1H OCEAN 4
13656 -117.30 34.05 6.0 2155.0 NaN 1039.0 391.0 1.6675 INLAND 2
19252 -122.79 38.48 7.0 6837.0 NaN 3468.0 1405.0 3.1662 <1H OCEAN 3
from sklearn.impute import SimpleImputer
# Set the text column aside: the median strategy applies to numeric columns.
housing_num = housing.drop(columns='ocean_proximity')
# Learn the per-column medians from the numeric features (fit returns self).
imputer = SimpleImputer(strategy='median').fit(housing_num)
print(imputer.statistics_)
[-118.51 34.26 29. 2119.5 433. 1164. 408. 3.5409 3. ]
# Replace missing values with the learned medians; the result is a plain array.
X = imputer.transform(housing_num)
# Wrap the array back into a DataFrame, restoring column names and row labels.
housing_tr = pd.DataFrame(
    X,
    columns=housing_num.columns,
    index=housing.index,
)
# Show the rows that previously contained missing values.
print(housing_tr.loc[sample_incomplete_rows.index])
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income income_cat
4629 -118.30 34.07 18.0 3759.0 433.0 3296.0 1462.0 2.2708 2.0
6068 -117.86 34.01 16.0 4632.0 433.0 3038.0 727.0 5.1762 4.0
17923 -121.97 37.35 30.0 1955.0 433.0 999.0 386.0 4.6328 4.0
13656 -117.30 34.05 6.0 2155.0 433.0 1039.0 391.0 1.6675 2.0
19252 -122.79 38.48 7.0 6837.0 433.0 3468.0 1405.0 3.1662 3.0
# Select the categorical column with a list of labels so the result stays a
# one-column DataFrame (2-D) rather than a Series.
housing_cat = housing.loc[:, ['ocean_proximity']]
print(housing_cat.head(5))
ocean_proximity
17606 <1H OCEAN
18632 <1H OCEAN
14650 NEAR OCEAN
3230 INLAND
3555 <1H OCEAN
from sklearn.preprocessing import OrdinalEncoder
# Map each category to an ordinal code: learn the categories, then encode.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(housing_cat)
housing_cat_encoded = ordinal_encoder.transform(housing_cat)
print(housing_cat_encoded)
[[0.]
[0.]
[4.]
...
[1.]
[0.]
[3.]]
from sklearn.preprocessing import LabelEncoder
# LabelEncoder works on 1-D label arrays, so pass the single column as a
# Series instead of the (n_samples, 1) DataFrame; the resulting integer codes
# are the same, without relying on implicit column-vector flattening.
encoder = LabelEncoder()
housing_cat_encoded = encoder.fit_transform(housing_cat['ocean_proximity'])
print(housing_cat_encoded[:10])
[0 0 4 1 0 1 0 1 0 0]
# Show the categories the encoder learned; the integer codes printed above are
# positions in this array.
print(encoder.classes_)
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer category codes.
encoder = OneHotEncoder()
# The encoder is fed a single column here, so reshape the codes to (n, 1).
codes_as_column = housing_cat_encoded.reshape(-1, 1)
housing_cat_1hot = encoder.fit_transform(codes_as_column)
# Convert to a regular array with toarray() for printing.
print(housing_cat_1hot.toarray())
[[1. 0. 0. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]
...
[0. 1. 0. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 1. 0.]]
from sklearn.preprocessing import LabelBinarizer
# LabelBinarizer goes from text categories to one-hot rows in a single step.
# It is a label utility, so hand it the 1-D column rather than the
# (n_samples, 1) DataFrame; the binarized output is the same.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat['ocean_proximity'])
print(housing_cat_1hot)
[[1 0 0 0 0]
[1 0 0 0 0]
[0 0 0 0 1]
...
[0 1 0 0 0]
[1 0 0 0 0]
[0 0 0 1 0]]
from sklearn.base import BaseEstimator, TransformerMixin
# Positions (0-based) of the source columns inside the feature array handed to
# the transformer; they follow the column order of the housing data.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features derived from existing columns.

    Adds ``rooms_per_household`` and ``population_per_household`` and,
    optionally, ``bedrooms_per_room`` as extra trailing columns. Source
    columns are located through the module-level ``*_ix`` indices.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to also append the ``bedrooms_per_room`` column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the transformer learns nothing."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support 2-D positional indexing (``X[:, i]``), e.g. a
        NumPy array; ``y`` is ignored.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
# Append the two household ratios (bedrooms_per_room is switched off here).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
extra_columns = ["rooms_per_household", "population_per_household"]
# NOTE(review): no index= is passed, so the resulting frame is labelled
# 0..n-1 rather than with housing's own row labels.
housing_extra_attribs = pd.DataFrame(
    attr_adder.transform(housing.values),
    columns=[*housing.columns, *extra_columns],
)
print(housing_extra_attribs)
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat rooms_per_household population_per_household
0 -121.89 37.29 38 1568 351 710 339 2.7042 <1H OCEAN 2 4.62537 2.0944
1 -121.93 37.05 14 679 108 306 113 6.4214 <1H OCEAN 5 6.00885 2.70796
2 -117.2 32.77 31 1952 471 936 462 2.8621 NEAR OCEAN 2 4.22511 2.02597
3 -119.61 36.31 25 1847 371 1460 353 1.8839 INLAND 2 5.23229 4.13598
4 -118.59 34.23 17 6592 1525 4459 1463 3.0347 <1H OCEAN 3 4.50581 3.04785
... ... ... ... ... ... ... ... ... ... ... ... ...
16507 -118.13 34.2 46 1271 236 573 210 4.9312 INLAND 4 6.05238 2.72857
16508 -117.56 33.88 40 1196 294 1052 258 2.0682 INLAND 2 4.63566 4.07752
16509 -116.4 34.09 9 4855 872 2098 765 3.2723 INLAND 3 6.34641 2.74248
16510 -118.01 33.82 31 1960 380 1356 356 4.0625 <1H OCEAN 3 5.50562 3.80899
16511 -122.45 37.77 52 3095 682 1269 639 3.575 NEAR BAY 3 4.84351 1.98592
[16512 rows x 12 columns]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing, applied in order: fill missing values with the
# median, append the combined ratio features, then standardize every column.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)
from sklearn.compose import ColumnTransformer
# Column names routed to each branch of the combined transformer.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]
# Numeric columns go through num_pipeline; the categorical column is one-hot
# encoded. The outputs of both branches are concatenated column-wise.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)
[[-1.15604281 0.77194962 0.74333089 ... 0. 0. 0. ]
[-1.17602483 0.6596948 -1.1653172 ... 0. 0. 0. ]
[ 1.18684903 -1.34218285 0.18664186 ... 0. 0. 1. ]
...
[ 1.58648943 -0.72478134 -1.56295222 ... 0. 0. 0. ]
[ 0.78221312 -0.85106801 0.18664186 ... 0. 0. 0. ]
[-1.43579109 0.99645926 1.85670895 ... 0. 1. 0. ]]
# Report the (rows, columns) dimensions of the prepared feature matrix.
print(housing_prepared.shape)
(16512, 17)