Installation using pip for Python 3
pip install pandas
import pandas as pd
pd.set_option('display.max_columns', None) #allow Jupyter to show all columns
Read data from file
df = pd.read_csv(csv_file_path)
or
df = pd.read_csv(csv_file_path, index_col = 0)
Save data to file
df.to_csv(csv_file_path,index=False)
Constructing DataFrame from a list
d= [7,8,9]
df = pd.DataFrame(data={'col1':d})
df
col1
0 7
1 8
2 9
Constructing DataFrame from a dictionary
d= {"col1":[1,2,3], "col2":[4,5,6], "col3":[7,8,9]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 1 4 7
1 2 5 8
2 3 6 9
Constructing DataFrame from a NumPy ndarray
import numpy as np
arr = np.array([[1,4,7],[2,5,8],[3,6,9]])
df = pd.DataFrame(arr,columns=["col1","col2","col3"],index=["row1","row2","row3"])
df
col1 col2 col3
row1 1 4 7
row2 2 5 8
row3 3 6 9
DataFrame Attributes
df.columns #all column names
df.index #all row names
df.shape #dataframe dimension
df.size
df.dtypes #column data types
DataFrame methods
df.head(n=5)
df.count()
df.copy()
df.any(axis=1) #check if any value along the axis is True; axis=0 checks down each column, axis=1 checks across each row
df[column_name or condition].min()
df["col1"].min()
1
df[column_name or condition].max()
df["col2"].max()
6
df[column_name or condition].mean()
df[column_name or condition].median()
df[column_name or condition].std()
df.quantile(0.25)
df.describe()
col1 col2 col3
count 3.0 3.0 3.0
mean 2.0 5.0 8.0
std 1.0 1.0 1.0
min 1.0 4.0 7.0
25% 1.5 4.5 7.5
50% 2.0 5.0 8.0
75% 2.5 5.5 8.5
max 3.0 6.0 9.0
df.describe(include='all')
df.info()
df['col1'].nunique() #number of unique values
df['col1'].count() #number of non-null values
df['col1'].size #number of all values (size is an attribute, not a method)
df['col1'].map(dictionary)
df[col_name].value_counts(normalize=True) #show counts as percentages
df.replace('a','b')
df.replace(dictionary)
df.isin(list)
df.astype({'col1':'int32'})
df=df.rename(columns={'col1':'colA'})
df.corr(method='pearson') #show correlation matrix
pd.unique(df['col1'])
df.sort_values(by=['col1','col2'])
Adding a column
df["col4"] = df["col3"] + 10
df
col1 col2 col3 col4
row1 1 4 7 17
row2 2 5 8 18
row3 3 6 9 19
Deleting a column
df.drop(columns=["col3","col4"]) #returns a new DataFrame; df itself is unchanged unless you assign the result back
result of the drop expression:
col1 col2
row1 1 4
row2 2 5
row3 3 6
df.drop(index="row2") #also returns a new DataFrame; note col3 and col4 are still present because the earlier drop was never assigned back
result of the drop expression:
col1 col2 col3 col4
row1 1 4 7 17
row3 3 6 9 19
concat()
axis=0 stacks vertically (rows appended downward), axis=1 concatenates horizontally (columns appended to the right)
d= {"col1":["A","B","C"], "col2":[1,2,3]}
df1 = pd.DataFrame(data=d)
df1
col1 col2
0 A 1
1 B 2
2 C 3
d= {"col1":["D","E","F"], "col2":[4,5,6]}
df2 = pd.DataFrame(data=d)
df2
col1 col2
0 D 4
1 E 5
2 F 6
df3 = pd.concat([df1,df2],axis=0)
df3
col1 col2
0 A 1
1 B 2
2 C 3
0 D 4
1 E 5
2 F 6
df3 = df3.reset_index(drop=True)
df3
col1 col2
0 A 1
1 B 2
2 C 3
3 D 4
4 E 5
5 F 6
merge()
d = {"col1":["B","D","F","G","H"], "col3":["bb","dd","ff","gg","hh"]}
df4 = pd.DataFrame(data=d)
df4
col1 col3
0 B bb
1 D dd
2 F ff
3 G gg
4 H hh
inner join: Only the keys in both dataframes will be included in the merge.
inner = pd.merge(df3,df4,on="col1",how="inner")
inner
col1 col2 col3
0 B 2 bb
1 D 4 dd
2 F 6 ff
outer join: All keys from both dataframes will be included in the merge.
outer = pd.merge(df3,df4,on="col1",how="outer")
outer
col1 col2 col3
0 A 1.0 NaN
1 B 2.0 bb
2 C 3.0 NaN
3 D 4.0 dd
4 E 5.0 NaN
5 F 6.0 ff
6 G NaN gg
7 H NaN hh
left join: All keys from the left dataframe will be included in the merge; keys found only in the right dataframe are dropped.
left = pd.merge(df3,df4,on="col1",how="left")
left
col1 col2 col3
0 A 1 NaN
1 B 2 bb
2 C 3 NaN
3 D 4 dd
4 E 5 NaN
5 F 6 ff
right join: All keys from the right dataframe will be included in the merge; keys found only in the left dataframe are dropped.
right = pd.merge(df3,df4,on="col1",how="right")
right
col1 col2 col3
0 B 2.0 bb
1 D 4.0 dd
2 F 6.0 ff
3 G NaN gg
4 H NaN hh
d= {"col1":[1,2,3], "col2":[4,5,6], "col3":['a','b','c']}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 1 4 a
1 2 5 b
2 3 6 c
Select Columns
df[["col1","col2"]]
col1 col2
0 1 4
1 2 5
2 3 6
df.select_dtypes('object')
col3
0 a
1 b
2 c
Select Rows
df[0:2] #slice rows by position
col1 col2 col3
0 1 4 a
1 2 5 b
df.iloc[row_indices,col_indices]
df.iloc[0:2]
col1 col2 col3
0 1 4 a
1 2 5 b
df.iloc[:,0:2]
col1 col2
0 1 4
1 2 5
2 3 6
df.iloc[1,2]
'b'
df.iloc[1:3,0:2]
col1 col2
1 2 5
2 3 6
df.loc[row_indices,[col_names]]
df.loc[:,['col1']]
col1
0 1
1 2
2 3
df.loc[:,['col1','col3']]
col1 col3
0 1 a
1 2 b
2 3 c
df.loc[1:3,['col1','col3']]
col1 col3
1 2 b
2 3 c
df.loc[df['col1']>=2,['col2','col3']]
col2 col3
1 5 b
2 6 c
df.loc[df['col1']>0,'col1'] = 0
col1 col2 col3
0 0 4 a
1 0 5 b
2 0 6 c
df.loc[df['col1']==np.inf,'col1']=0 #equivalent to the following
df['col1']=np.where(df['col1']==np.inf, 0, df['col1'])
Boolean Mask
d= {"col1":["A","B","C","D","E"], "col2":[5,6,7,8,9], "col3":["aa","bb","cc","dd","ee"]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 A 5 aa
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
mask = df["col2"] >=6
mask
0 False
1 True
2 True
3 True
4 True
Name: col2, dtype: bool
df[mask]
col1 col2 col3
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
df[df["col2"] >=6]
col1 col2 col3
1 B 6 bb
2 C 7 cc
3 D 8 dd
4 E 9 ee
mask = (df["col2"] >=6) & (df["col2"] < 8) # & and, | or, ~ not
mask
0 False
1 True
2 True
3 False
4 False
Name: col2, dtype: bool
groupby()
d= {"col1":["A","B","C","D","E"], "col2":[5,6,7,8,9], "col3":["aa","bb","aa","bb","cc"]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 A 5 aa
1 B 6 bb
2 C 7 aa
3 D 8 bb
4 E 9 cc
df.groupby("col3").sum()
col2
col3
aa 12
bb 14
cc 9
df.groupby("col3").sum().sort_values("col2",ascending=False)
col2
col3
bb 14
aa 12
cc 9
agg()
df.agg(["mean","median"])
col2
mean 7.0
median 7.0
df.groupby("col3").agg(["mean","median"])
col2
mean median
col3
aa 6.0 6.0
bb 7.0 7.0
cc 9.0 9.0
df.groupby(["col3"]).agg(col4=pd.NamedAgg(column="col2",aggfunc='sum'))
col4
col3
aa 12
bb 14
cc 9
pd.get_dummies(df['col']) #get dummy variables; returns a DataFrame with one column per category
pd.get_dummies(df['col'],drop_first=True) #drop the first category to avoid collinearity
get_dummies returns a whole DataFrame, so assign the result back to df (not to a single column):
df = pd.get_dummies(df,drop_first=True,columns=['col1'])
pd.to_numeric()
df['col1'].astype('category').cat.codes
d= {"col1":[11,11,22,22,33,33], "col2":['a','a','a','b','b','c'], "col3":[1,1,2,3,4,4]}
df = pd.DataFrame(data=d)
df
col1 col2 col3
0 11 a 1
1 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
5 33 c 4
df.duplicated()
0 False
1 True
2 False
3 False
4 False
5 False
dtype: bool
df.duplicated(keep=False)
0 True
1 True
2 False
3 False
4 False
5 False
dtype: bool
df.duplicated(subset=['col1'],keep='last')
0 True
1 False
2 True
3 False
4 True
5 False
dtype: bool
df.drop_duplicates()
col1 col2 col3
0 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
5 33 c 4
df.drop_duplicates(subset='col1')
col1 col2 col3
0 11 a 1
2 22 a 2
4 33 b 4
df.drop_duplicates(subset=['col1','col3'])
col1 col2 col3
0 11 a 1
2 22 a 2
3 22 b 3
4 33 b 4
Filter missing values
df[pd.isnull(df['col1'])]
df.isna() #check for null or missing value
df.isnull()
df.notna() #check for not null or not missing value; same as df.notnull()
df.notnull() #check for not null or not missing value; same as df.notna()
df.isna().sum() # number of missing values in each column
df.isna().any(axis=1) #rows with missing values show True
df.isna().any(axis=1).sum() # rows with missing values
Drop rows with missing data
df=df.dropna() #row-wise
df=df.reset_index(drop=True)
df.dropna(axis=1) #column-wise
df.dropna(subset=['col1'])
df.dropna(axis='index',subset=['col1']) #drop row if missing data is in col1
Filling missing data
df.fillna(1)
df.fillna(method='backfill') #backward fill; deprecated in newer pandas — use df.bfill() instead
df['col1'] = df['col1'].str.upper() #convert to uppercase