2. Transforming features for better clustering

In K-Means, a feature's variance determines its influence on the clustering: features with larger variance have more influence
StandardScaler transforms each feature to have mean 0 and variance 1
Two steps are needed: StandardScaler to standardize the data, and then KMeans to cluster the standardized data
- Use sklearn pipeline to combine multiple steps
Example:

# Example: standardize the features and cluster them in a single pipeline,
# then compare the resulting cluster labels against the known species.
# NOTE: `samples` and `species` are expected to be defined before this runs.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd

# Pipeline steps: the scaler runs first, then KMeans with 4 clusters.
scaler = StandardScaler()
kmeans = KMeans(n_clusters=4)
pipeline = make_pipeline(scaler, kmeans)

# Fit the whole pipeline on the samples, then compute a cluster label
# for each of those same samples.
pipeline.fit(samples)
labels = pipeline.predict(samples)

# Pair each label with its species and cross-tabulate the two columns
# (rows: cluster labels, columns: species).
df = pd.DataFrame({'labels': labels, 'species': species})
ct = pd.crosstab(index=df['labels'], columns=df['species'])
print(ct)

# Example: chain a Normalizer with KMeans and list the companies by cluster.
# NOTE: `movements` (daily price movements) and `companies` are expected to
# be defined before this runs.
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
import pandas as pd

# Pipeline steps: the normalizer runs first, then KMeans with 10 clusters.
normalizer = Normalizer()
kmeans = KMeans(n_clusters=10)
pipeline = make_pipeline(normalizer, kmeans)

# Fit on the daily price movements, then predict a cluster label for each row.
pipeline.fit(movements)
labels = pipeline.predict(movements)

# Align every label with its company and show the table ordered by cluster.
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values(by="labels"))

Google Sites

Report abuse