df.loc[df['#col'] == #criterion, '#new_col'] = 1 #Create new categorical variable based on criterion
import numpy as np
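df['#new_col'] = pd.cut(df['#col'], bins = #num_bin OR #[-np.inf, #bin1, #bin2, np.inf], labels = [#1, #2, #3]) #Bin a numeric column: pass an integer for equal-width bins or a list of bin edges; the number of labels must equal the number of bins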
Converting categorical variables
pd.get_dummies(#df, columns = ['#col']) #One-hot encoding with n categories for explainable features but can cause collinearity and model instability due to duplication of information
pd.get_dummies(#df, columns = ['#col'], drop_first = True, prefix = '#letter') #Dummy encoding with n-1 categories for non-duplication of information
counts = df['#col'].value_counts() #Count number of categories
mask = df['#col'].isin(counts[counts < #5].index) #Boolean mask of rows whose category occurs fewer than #5 times
df.loc[mask, '#col'] = 'Others' #Group the infrequent categories under 'Others' (.loc avoids chained-assignment problems)
Missing values
#df.isnull()
#sub_df = #df[['#col1', '#col2']] #Sub-setting
print(#sub_df.info()) #Print the non-null count and dtype of each column