Installation using pip for Python 3
pip install pandas
import pandas as pd
pd.set_option('display.max_columns', None) #allow Jupyter to show all columns
Read data from file
df = pd.read_csv(csv_file_path)
or
df = pd.read_csv(csv_file_path, index_col = 0)
Save data to file
df.to_csv(csv_file_path,index=False)
Constructing DataFrame from a list
d= [7,8,9]
df = pd.DataFrame(data={'col1':d})
df
col1
0 7
1 8
2 9
Constructing DataFrame from a dictionary
d= {"col1":[1,2,3], "col2":[4,5,6], "col3":[7,8,9]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 1 4 7
1 2 5 8
2 3 6 9
Constructing DataFrame from a NumPy ndarray
import numpy as np
arr = np.array([[1,4,7],[2,5,8],[3,6,9]])
df = pd.DataFrame(arr,columns=["col1","col2","col3"],index=["row1","row2","row3"])
df
col1 col2 col3
row1 1 4 7
row2 2 5 8
row3 3 6 9
DataFrame Attributes
df.columns #all column names
df.index #all row names
df.shape #dataframe dimension
df.size
df.dtypes #column data types
DataFrame methods
df.head(n=5)
df.count()
df.copy()
df.any(axis=1) #check if any value along the axis is True; axis=0 checks down each column, axis=1 checks across each row
df[column_name or condition].min()
df["col1"].min()
1
df[column_name or condition].max()
df["col2"].max()
6
df[column_name or condition].mean()
df[column_name or condition].median()
df[column_name or condition].std()
df.quantile(0.25)
df.describe()
col1 col2 col3
count 3.0 3.0 3.0
mean 2.0 5.0 8.0
std 1.0 1.0 1.0
min 1.0 4.0 7.0
25% 1.5 4.5 7.5
50% 2.0 5.0 8.0
75% 2.5 5.5 8.5
max 3.0 6.0 9.0
df.describe(include='all')
df.info()
df['col1'].nunique() #number of unique values
df['col1'].count() #number of non-null values
df['col1'].size #number of all values (size is an attribute, not a method)
df['col1'].map(dictionary)
df[col_name].value_counts(normalize=True) #show counts as percentages
df.replace('a','b')
df.replace(dictionary)
df.isin(list)
df.astype({'col1':'int32'})
df=df.rename(columns={'col1':'colA'})
df.corr(method='pearson') #show correlation matrix
pd.unique(df['col1'])
df.sort_values(by=['col1','col2'])
Adding a column
df["col4"] = df["col3"] + 10
df
col1 col2 col3 col4
row1 1 4 7 17
row2 2 5 8 18
row3 3 6 9 19
Deleting a column
df.drop(columns=["col3","col4"]) #returns a new DataFrame; df itself is unchanged unless you assign the result back
result of the drop expression:
col1 col2
row1 1 4
row2 2 5
row3 3 6
df.drop(index="row2") #also returns a new DataFrame; note col3 and col4 are still present because the earlier drop was never assigned back
result of the drop expression:
col1 col2 col3 col4
row1 1 4 7 17
row3 3 6 9 19
concat()
axis=0 stacks vertically (rows appended downward), axis=1 concatenates horizontally (columns appended to the right)
d= {"col1":["A","B","C"], "col2":[1,2,3]}
df1 = pd.DataFrame(data=d)
df1
col1 col2
0 A 1
1 B 2
2 C 3
d= {"col1":["D","E","F"], "col2":[4,5,6]}
df2 = pd.DataFrame(data=d)
df2
col1 col2
0 D 4
1 E 5
2 F 6
df3 = pd.concat([df1,df2],axis=0)
df3
col1 col2
0 A 1
1 B 2
2 C 3
0 D 4
1 E 5
2 F 6
df3 = df3.reset_index(drop=True)
df3
col1 col2
0 A 1
1 B 2
2 C 3
3 D 4
4 E 5
5 F 6
merge()
d = {"col1":["B","D","F","G","H"], "col3":["bb","dd","ff","gg","hh"]}
df4 = pd.DataFrame(data=d)
df4
col1 col3
0 B bb
1 D dd
2 F ff
3 G gg
4 H hh
inner join: Only the keys in both dataframes will be included in the merge.
inner = pd.merge(df3,df4,on="col1",how="inner")
inner
col1 col2 col3
0 B 2 bb
1 D 4 dd
2 F 6 ff
outer join: All keys from both dataframes will be included in the merge.
outer = pd.merge(df3,df4,on="col1",how="outer")
outer
col1 col2 col3
0 A 1.0 NaN
1 B 2.0 bb
2 C 3.0 NaN
3 D 4.0 dd
4 E 5.0 NaN
5 F 6.0 ff
6 G NaN gg
7 H NaN hh
left join: All keys from the left dataframe will be included in the merge; keys found only in the right dataframe are dropped.
left = pd.merge(df3,df4,on="col1",how="left")
left
col1 col2 col3
0 A 1 NaN
1 B 2 bb
2 C 3 NaN
3 D 4 dd
4 E 5 NaN
5 F 6 ff
right join: All keys from the right dataframe will be included in the merge; keys found only in the left dataframe are dropped.
right = pd.merge(df3,df4,on="col1",how="right")
right
col1 col2 col3
0 B 2.0 bb
1 D 4.0 dd
2 F 6.0 ff
3 G NaN gg
4 H NaN hh
d= {"col1":[1,2,3], "col2":[4,5,6], "col3":['a','b','c']}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 1 4 a
1 2 5 b
2 3 6 c
Select Columns
df[["col1","col2"]]
col1 col2
0 1 4
1 2 5
2 3 6
df.select_dtypes('object')
col3
0 a
1 b
2 c
Select Rows
df[0:2] #slice rows by position
col1 col2 col3
0 1 4 a
1 2 5 b
df.iloc[row_indices,col_indices]
df.iloc[0:2]
col1 col2 col3
0 1 4 a
1 2 5 b
df.iloc[:,0:2]
col1 col2
0 1 4
1 2 5
2 3 6
df.iloc[1,2]
'b'
df.iloc[1:3,0:2]
col1 col2
1 2 5
2 3 6
df.loc[row_indices,[col_names]]
df.loc[:,['col1']]
col1
0 1
1 2
2 3
df.loc[:,['col1','col3']]
col1 col3
0 1 a
1 2 b
2 3 c
df.loc[1:3,['col1','col3']]
col1 col3
1 2 b
2 3 c
df.loc[df['col1']>=2,['col2','col3']]
col2 col3
1 5 b
2 6 c
df.loc[df['col1']>0,'col1'] = 0
col1 col2 col3
0 0 4 a
1 0 5 b
2 0 6 c
df.loc[df['col1']==np.inf,'col1']=0 #equivalent to the following
df['col1']=np.where(df['col1']==np.inf, 0, df['col1'])
Boolean Mask
d= {"col1":["A","B","C","D","E"], "col2":[5,6,7,8,9], "col3":["aa","bb","cc","dd","ee"]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 A 5 aa
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
mask = df["col2"] >=6
mask
0 False
1 True
2 True
3 True
4 True
Name: col2, dtype: bool
df[mask]
col1 col2 col3
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
df[df["col2"] >=6]
col1 col2 col3
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
mask = (df["col2"] >=6) & (df["col2"] < 8) # & and, | or, ~ not
mask
0 False
1 True
2 True
3 False
4 False
Name: col2, dtype: bool
groupby()
d= {"col1":["A","B","C","D","E"], "col2":[5,6,7,8,9], "col3":["aa","bb","aa","bb","cc"]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 A 5 aa
1 B 6 bb
2 C 7 aa
3 D 8 bb
4 E 9 cc
df.groupby("col3").sum()
col2
col3
aa 12
bb 14
cc 9
df.groupby("col3").sum().sort_values("col2",ascending=False)
col2
col3
bb 14
aa 12
cc 9
agg()
df.agg(["mean","median"])
col2
mean 7.0
median 7.0
df.groupby("col3").agg(["mean","median"])
col2
mean median
col3
aa 6.0 6.0
bb 7.0 7.0
cc 9.0 9.0
df.groupby(["col3"]).agg(col4=pd.NamedAgg(column="col2",aggfunc='sum'))
col4
col3
aa 12
bb 14
cc 9
pd.get_dummies(df['col']) #get dummy variables; returns a DataFrame with one column per category
pd.get_dummies(df['col'],drop_first=True) #drop the first category to avoid collinearity
get_dummies returns a whole DataFrame, so assign the result back to df (not to a single column):
df = pd.get_dummies(df,drop_first=True,columns=['col1'])
pd.to_numeric()
df['col1'].astype('category').cat.codes
d= {"col1":[11,11,22,22,33,33], "col2":['a','a','a','b','b','c'], "col3":[1,1,2,3,4,4]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 11 a 1
1 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
5 33 c 4
df.duplicated()
0 False
1 True
2 False
3 False
4 False
5 False
dtype: bool
df.duplicated(keep=False)
0 True
1 True
2 False
3 False
4 False
5 False
dtype: bool
df.duplicated(subset=['col1'],keep='last')
0 True
1 False
2 True
3 False
4 True
5 False
dtype: bool
df.drop_duplicates()
col1 col2 col3
0 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
5 33 c 4
df.drop_duplicates(subset='col1')
col1 col2 col3
0 11 a 1
2 22 a 2
4 33 b 4
df.drop_duplicates(subset=['col1','col3'])
col1 col2 col3
0 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
Filter missing values
df[pd.isnull(df['col1'])]
df.isna() #check for null or missing value
df.isnull()
df.notna() #check for not null or not missing value; same as df.notnull()
df.notnull() #check for not null or not missing value; same as df.notna()
df.isna().sum() # number of missing values in each column
df.isna().any(axis=1) #rows with missing values show True
df.isna().any(axis=1).sum() # rows with missing values
Drop rows with missing data
df=df.dropna() #row-wise
df=df.reset_index(drop=True)
df.dropna(axis=1) #column-wise
df.dropna(subset=['col1'])
df.dropna(axis='index',subset=['col1']) #drop row if missing data is in col1
Filling missing data
df.fillna(1)
df.fillna(method='backfill') #backward fill; deprecated in newer pandas — use df.bfill() instead
df['col1'] = df['col1'].str.upper() #convert to uppercase