pandas

basics

name

pandas stands panel data

basis

pandas is based on numpy and matplotlib

install and import

install

python3 -m install pandas

import

import pandas as pd #pd is the typical alias

help

https://www.w3schools.com/python/pandas/default.asp

methods

.agg() #aggregates

.apply() #applies command on data frame value in pandas

.columns #gives header names of data frame

.concat() #cancatenates

.cummax() #cumulative max in pandas

.cummin() #cumulative min in pandas

.cumprod() #cumulative viz. running product in pandas

.cumsum() #cumulative viz. running sum in pandas

.DataFrame() #creates a table

.describe() #gives overview for numerical values of a dataframe

.drop_duplicates #deletes duplicates

.dt #datetime object viz. date column in a data frame

.dt.day #accesses the year of a date column in a data frame

.dt.year #accesses the year of a date column in a data frame

.dt.month #accesses the year of a date column in a data frame

.groupyby() #aggregates on groups

.head() #gives the first few rows of a dataframe viz. table

.index #gives row numbers or labels of data frame

.info() #gives column names of data frames

.isin # filters values in a data frame in pandas

.isna() #scans for missing values

.max() #gives the max

.mean() #gives the mean

.min() #gives the min

.mode() #gives the max

.pivot_table() #creates a pivot table in pandas

.plot() #plots charts.quanrile() #gives a quantile

.read_csv() #reads a csv into a dataframe viz. table

.read_csv(my_data.csv, chunksize = n) #sets a portion size n to split data into chunks

.reset_index() #resets a set index into a column

.set_index() #turn a column into the index

.sort_index() #sorts on the index

.sort_values("header") #sorts column "header" of a dataframe in pandas

.shape #gives the number of rows and columns of a data frame

.std() #give the standard deviation

.to_csv() #creates a csv file from a data frame

.sum() #calculates a sum

.values #gives values of a data frame

.value_counts() #counts values in a dataframe

.var() #gives the variance

data frame

explore

.columns #gives header names of data frame

.info() #gives column names of data frames

.shape #gives the number of rows and columns of a data frame

.index #gives row numbers or labels of data frame

.describe() #gives overview for numerical values of a dataframe

.head() #gives the first few rows of a dataframe viz. table

.values #gives values of a data frame

.isna() #scans for missing values

.isna().any() #scans for missing values by field

.isna().sum() #counts missing values by field

building

.read_csv("my_input.csv") #reads "my_input.csv" into a data frame

index_col #sets the index for a data frame

my_dataframe.index = row_labels #sets a list of row labels

.index() #sets named labels for the row index of a dataframe

.reset_index() #resets a set index into a column

.reset_index(drop=True) #deletes a set index

.set_index("header") #turn a column into the index

.set_index(["header1","header2"]) #turns columns into a multilevel hierachical index

from a dictionary of lists

column by column

each key will become a column header, each value will the the list of the column's values

my_df = pd.DataFrame({"key1":[value1,value2,…],"…":[…],…})

from a list of dictonaries

row by row

each key of the dictionaries will become a column header, each value will be a row entry

my_df = pd.DataFrame([{"key1":value1,"…":…,…},{"key1":value2,"…":…,…},{…}])

csv

my_df = pd.read_csv("my_data.csv") #reads directly into a data frame

pd.read_csv("my_data.csv",parse_date=["my_date_header"]) #makes a datetime object from column

pd.read_csv("my_data.csv",index_col="header") #sets column "header" to index

my_df.to_csv("my_exported_data.csv") #creates a csv file from a data frame

handling

aggregating

.agg(my_func) #aggregates on a function

.cummax() #cumulative max in pandas

.cummin() #cumulative min in pandas

.cumprod() #cumulative viz. running product in pandas

.cumsum() #cumulative viz. running sum in pandas

.dt #datetime object viz. date column in a data frame

.dt.day #accesses the year of a date column in a data frame

.dt.year #accesses the year of a date column in a data frame

.dt.month #accesses the year of a date column in a data frame

.groupyby() #aggregates on groups

.groupby("header1")["header2"].mean() #average of "header2" by "header1"

.groupby("header1")[["header2","header3"]].sum()) #sums "header2" and "header3" on "header1"

.groupby(["header1","header2"])[["header3","header4"]].min()) #min of "header1" and "header2" on "header3" and "header4"

.groupby("header1")["header2"].agg([func1,func2]) #aggregations "header2" by "header1"

.max() #gives the max

.mean() #gives the mean

.min() #gives the min

.mode() #gives the mode

.pivot_table() #creates a pivot table in pandas for means

.pivot_table("header") #creates a pivot table in pandas for means of "header"

.pivot_table(aggfunc=my_func) #creates a pivot table in pandas for my_func

.pivot_table(aggfunc=[my_func1,my_func2]) #creates a pivot table in pandas for functions

.pivot_table(index="header") #pivot on row "header"

.pivot_table(values="header") #pivot on value "header"

.pivot_table(columns="header") #split over column "header"

.pivot_table(margins=True) #have total values for rows and columns

.pivot_table(fill_value=0) #use 0 as fill value ofr n/a

.pivot_table(axis="index") #pviot table over the index

.pivot_table(axis="columns") #pviot table over the columns

.pivot_table( … ).loc["index0":"index1"]) #selects "index0" to "index1" from pivot table

.std() #give the standard deviation

.value_counts() #counts values in a dataframe

.value_counts(sort=True) #counts values in a dataframe and sorts

.value_counts(normalize=True) #counts values in a dataframe as fraction of 1

.var() #gives the variance

filtering

my_df["header"] #reads a column incl. row labels into a series object

my_df[["header_name1",… ]] #reads columns into data frame object

my_df[n:m] #gives the a slice from row n to m-1 from the data frame

my_df["header_name"] # subselects a data frame as a series object

my_df["header"] > n #gives True/False for values > n in column "header"

my_df[my_df["header"] == "match"] #gives rows where value in header is "match"

my_df[(my_df["header1"] == match2) & (my_df["header2"] == "match2")] #gives rows with two conditions

my_df[my_df["header"] > n] #gives rows with value > n in column "header"

my_df[np.logical_and(my_df["header"] > n, my_df["header"] < m)] #gives rows with value between n and m

my_df[my_df["header"].isin(["match1","match2"])] #filters for multiple matches in column "header"

.loc #accesses data frames by their names, writes data too

.loc(my_indices_list) #show rows where the index is in the list my_indices_list

.loc["label1", … ] #accesses rows by their labels as a series object

.loc[:, "header1"] #accesses all rows for the header as a series object

.loc[["label1", … ]] #accesses rows by their labels as a data frame

.loc[["label1", … ], ["header1",… ]] #accesses rows by their labels and columns by their headers

.loc[[n:m], ["header1",… ]] #accesses rows n to m-1 by their labels and columns by their headers

.loc[:, ["header1",… ]] #accesses all rows, and columns by their headers

.loc[["name1",… ]] #gets the data by indices

.loc["label0":"label1"] #gets everthing inside the labels in a sorted data frame

.loc[("name1","nameA"):("name2","nameB")]) #gets everthing inside the mulitlevel labels

.loc[[("name1","nameA"),("name2","nameB")]] #gets the data of multilevel hierachical indices

.loc[:,"header0":"header1"] #gets everything in the columns "header0" to "header1"

.pivot_table( … ).loc["index0":"index1"]) #selects "index0" to "index1" from pivot table

.iloc #accesses data frames by their indexes

cleaning

.drop_duplicates #deletes duplicates

.drop_duplicates(subset="header") #deletes duplicates for "header"

.drop_duplicates(subset=["header1","header2"]) #deletes duplicates for header1 and header2

.dropna() #deletes rows with missing data

.fillna() replaces null values

.fillna(0) replaces null values with 0

sorting

.sort_index() #sorts on the index

.sort_index(acscending=False) #sorts on the index in descending order

.sort_index(level="name1") #sorts on level "name1"

.sort_index(level=["name1","name2"]) #sorts on a multilevel hierarchical index

.sort_values() #sorts columns of a dataframe in pandas

.sort_values( … , ascending = False) #set sorting order descending

.sort_values( [ … , … ] ) #sort by multiple columns

.sort_values( [ … , … ] , ascending = [ False , … ]) #sort by multiple columns

for loop

my_df = pd.read_csv("my_data.csv", index_col = 0)

for value in my_df:

print(value) #gives you the column names of the data frame

for label, observation in my_df.iterrows():

print(label)

print(observation) #this is an entire series object, and memory resource demanding

print(observation["header"]) #show the values the column "header"

my_df.loc[label, "new_name"] = o["name"].upper() #adds column with upper case value

apply

my_df = pd.read_csv("my_data.csv", index_col = 0)

my_df["length"] = my_df["name"].apply(len) #gives length for name in column "length"

series

a series is like a one-dimensional array incl. row labels

a number of series build a dataframe

syntax examples

pd.DataFrame(my_object)

mydata_df["header_name"] # subselects a data frame as a series

mydata_df[["header1",… ]] # subselects a data frame as a data frame

import pandas as pd

mydata_df = pd.DataFrame(mydata)

print(mydata_df.head())

print(my_df["header"].agg(my_func))

print(my_df[["header1", "header2", "header3"]].agg([my_func1, my_func2]))

my_daf["header"].cumsum()

my_df.drop_duplicates(subset = "header")

my_df.drop_duplicates(subset = ["header1","header2"])

print(my_df.groupby("item")[["header1","header"]].sum()) #sums "header1" and "header2" over "item"

Page updated

Google Sites

Report abuse