# Separate the predictors from the target column.
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

# Three ways of dealing with the missing `total_bedrooms` values.
# None of the calls below modifies `housing`: each one returns a new object
# that is discarded (options 1 and 2) or only printed (option 3).

# Option 1: get rid of districts with missing values.
housing.dropna(subset=['total_bedrooms'])

# Option 2: get rid of the attribute.
housing.drop('total_bedrooms', axis=1)

# Option 3: set the missing values to the median.
median = housing['total_bedrooms'].median()
print(housing['total_bedrooms'].fillna(median))
# Recorded output:
# 17606 351.0
# 18632 108.0
# 14650 471.0
# 3230 371.0
# 3555 1525.0
# ...
# 6563 236.0
# 12053 294.0
# 13908 872.0
# 11159 380.0
# 15775 682.0
# Name: total_bedrooms, Length: 16512, dtype: float64
# Keep the first few rows that contain at least one missing value; they are
# used further down to inspect the result of imputation.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
print(sample_incomplete_rows)
# Recorded output:
#        longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat
# 4629 -118.30 34.07 18.0 3759.0 NaN 3296.0 1462.0 2.2708 <1H OCEAN 2
# 6068 -117.86 34.01 16.0 4632.0 NaN 3038.0 727.0 5.1762 <1H OCEAN 4
# 17923 -121.97 37.35 30.0 1955.0 NaN 999.0 386.0 4.6328 <1H OCEAN 4
# 13656 -117.30 34.05 6.0 2155.0 NaN 1039.0 391.0 1.6675 INLAND 2
# 19252 -122.79 38.48 7.0 6837.0 NaN 3468.0 1405.0 3.1662 <1H OCEAN 3
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

# The text attribute is left out before fitting, since a median is only
# defined for numerical columns.
housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)

# `statistics_` holds one fitted value per column of `housing_num`.
print(imputer.statistics_)
# Recorded output:
# [-118.51 34.26 29. 2119.5 433. 1164. 408. 3.5409 3. ]
# `transform` returns a plain array, so the column names and row labels are
# re-attached by hand to get a DataFrame aligned with `housing`.
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)

# Show the rows that previously had a missing value.
print(housing_tr.loc[sample_incomplete_rows.index])
# Recorded output:
#        longitude latitude housing_median_age total_rooms total_bedrooms population households median_income income_cat
# 4629 -118.30 34.07 18.0 3759.0 433.0 3296.0 1462.0 2.2708 2.0
# 6068 -117.86 34.01 16.0 4632.0 433.0 3038.0 727.0 5.1762 4.0
# 17923 -121.97 37.35 30.0 1955.0 433.0 999.0 386.0 4.6328 4.0
# 13656 -117.30 34.05 6.0 2155.0 433.0 1039.0 391.0 1.6675 2.0
# 19252 -122.79 38.48 7.0 6837.0 433.0 3468.0 1405.0 3.1662 3.0
# Double brackets select a one-column DataFrame (2-D) rather than a Series.
housing_cat = housing[['ocean_proximity']]
print(housing_cat.head())
# Recorded output:
#       ocean_proximity
# 17606 <1H OCEAN
# 18632 <1H OCEAN
# 14650 NEAR OCEAN
# 3230 INLAND
# 3555 <1H OCEAN
from sklearn.preprocessing import OrdinalEncoder

# Map each category of the single text column to a numeric code.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
# Recorded output:
# [[0.]
#  [0.]
#  [4.]
#  ...
#  [1.]
#  [0.]
#  [3.]]
from sklearn.preprocessing import LabelEncoder

# LabelEncoder is documented for 1-D input, so pass the column itself rather
# than the one-column DataFrame `housing_cat`.
encoder = LabelEncoder()
housing_cat_encoded = encoder.fit_transform(housing_cat['ocean_proximity'])
print(housing_cat_encoded[:10])
# Recorded output:
# [0 0 4 1 0 1 0 1 0 0]

# The position of each class in `classes_` is the integer code it was given.
print(encoder.classes_)
# Recorded output:
# ['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
from sklearn.preprocessing import OneHotEncoder

# The integer codes are 1-D; reshape them into a single column because the
# encoder works on 2-D input.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))

# Convert to a dense array for printing.
print(housing_cat_1hot.toarray())
# Recorded output:
# [[1. 0. 0. 0. 0.]
#  [1. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 1.]
#  ...
#  [0. 1. 0. 0. 0.]
#  [1. 0. 0. 0. 0.]
#  [0. 0. 0. 1. 0.]]
from sklearn.preprocessing import LabelBinarizer

# Text categories to one-hot rows in a single step.  The 1-D column is passed
# (not the one-column DataFrame) to match the documented label input shape.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat['ocean_proximity'])
print(housing_cat_1hot)
# Recorded output:
# [[1 0 0 0 0]
#  [1 0 0 0 0]
#  [0 0 0 0 1]
#  ...
#  [0 1 0 0 0]
#  [1 0 0 0 0]
#  [0 0 0 1 0]]
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions, in the array handed to `transform`, of the attributes the
# ratios are built from.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes computed from existing columns.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third column (bedrooms / rooms) is appended after the
        two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing with the module-level
        ``*_ix`` positions; ``y`` is ignored.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]


attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# The transformer returns a bare array, so pass `housing`'s index explicitly;
# otherwise the rows are relabelled 0..N-1 and no longer line up with
# `housing` / `housing_labels`.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ["rooms_per_household", "population_per_household"],
    index=housing.index,
)
print(housing_extra_attribs)
# Recorded output (captured before `index=housing.index` was passed, so the
# row labels shown here are positional):
#        longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat rooms_per_household population_per_household
# 0 -121.89 37.29 38 1568 351 710 339 2.7042 <1H OCEAN 2 4.62537 2.0944
# 1 -121.93 37.05 14 679 108 306 113 6.4214 <1H OCEAN 5 6.00885 2.70796
# 2 -117.2 32.77 31 1952 471 936 462 2.8621 NEAR OCEAN 2 4.22511 2.02597
# 3 -119.61 36.31 25 1847 371 1460 353 1.8839 INLAND 2 5.23229 4.13598
# 4 -118.59 34.23 17 6592 1525 4459 1463 3.0347 <1H OCEAN 3 4.50581 3.04785
# ... ... ... ... ... ... ... ... ... ... ... ... ...
# 16507 -118.13 34.2 46 1271 236 573 210 4.9312 INLAND 4 6.05238 2.72857
# 16508 -117.56 33.88 40 1196 294 1052 258 2.0682 INLAND 2 4.63566 4.07752
# 16509 -116.4 34.09 9 4855 872 2098 765 3.2723 INLAND 3 6.34641 2.74248
# 16510 -118.01 33.82 31 1960 380 1356 356 4.0625 <1H OCEAN 3 5.50562 3.80899
# 16511 -122.45 37.77 52 3095 682 1269 639 3.575 NEAR BAY 3 4.84351 1.98592
# [16512 rows x 12 columns]
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numerical branch: impute missing values, append the ratio attributes, then
# standardise every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Iterating a DataFrame yields its column labels.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Route the numerical columns through `num_pipeline` and one-hot encode the
# categorical column.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)
# Recorded output:
# [[-1.15604281 0.77194962 0.74333089 ... 0. 0. 0. ]
#  [-1.17602483 0.6596948 -1.1653172 ... 0. 0. 0. ]
#  [ 1.18684903 -1.34218285 0.18664186 ... 0. 0. 1. ]
#  ...
#  [ 1.58648943 -0.72478134 -1.56295222 ... 0. 0. 0. ]
#  [ 0.78221312 -0.85106801 0.18664186 ... 0. 0. 0. ]
#  [-1.43579109 0.99645926 1.85670895 ... 0. 1. 0. ]]

print(housing_prepared.shape)
# Recorded output:
# (16512, 17)