python3 -m install pandas
import pandas as pd #pd is the typical alias
.agg() #aggregates
.apply() #applies command on data frame value in pandas
.columns #gives header names of data frame
.concat() #cancatenates
.cummax() #cumulative max in pandas
.cummin() #cumulative min in pandas
.cumprod() #cumulative viz. running product in pandas
.cumsum() #cumulative viz. running sum in pandas
.DataFrame() #creates a table
.describe() #gives overview for numerical values of a dataframe
.drop_duplicates #deletes duplicates
.dt #datetime object viz. date column in a data frame
.dt.day #accesses the year of a date column in a data frame
.dt.year #accesses the year of a date column in a data frame
.dt.month #accesses the year of a date column in a data frame
.groupyby() #aggregates on groups
.head() #gives the first few rows of a dataframe viz. table
.index #gives row numbers or labels of data frame
.info() #gives column names of data frames
.isin # filters values in a data frame in pandas
.isna() #scans for missing values
.max() #gives the max
.mean() #gives the mean
.min() #gives the min
.mode() #gives the max
.pivot_table() #creates a pivot table in pandas
.plot() #plots charts.quanrile() #gives a quantile
.read_csv() #reads a csv into a dataframe viz. table
.read_csv(my_data.csv, chunksize = n) #sets a portion size n to split data into chunks
.reset_index() #resets a set index into a column
.set_index() #turn a column into the index
.sort_index() #sorts on the index
.sort_values("header") #sorts column "header" of a dataframe in pandas
.shape #gives the number of rows and columns of a data frame
.std() #give the standard deviation
.to_csv() #creates a csv file from a data frame
.sum() #calculates a sum
.values #gives values of a data frame
.value_counts() #counts values in a dataframe
.var() #gives the variance
.columns #gives header names of data frame
.info() #gives column names of data frames
.shape #gives the number of rows and columns of a data frame
.index #gives row numbers or labels of data frame
.describe() #gives overview for numerical values of a dataframe
.head() #gives the first few rows of a dataframe viz. table
.values #gives values of a data frame
.isna() #scans for missing values
.isna().any() #scans for missing values by field
.isna().sum() #counts missing values by field
.read_csv("my_input.csv") #reads "my_input.csv" into a data frame
index_col #sets the index for a data frame
my_dataframe.index = row_labels #sets a list of row labels
.index() #sets named labels for the row index of a dataframe
.reset_index() #resets a set index into a column
.reset_index(drop=True) #deletes a set index
.set_index("header") #turn a column into the index
.set_index(["header1","header2"]) #turns columns into a multilevel hierachical index
column by column
each key will become a column header, each value will the the list of the column's values
my_df = pd.DataFrame({"key1":[value1,value2,…],"…":[…],…})
row by row
each key of the dictionaries will become a column header, each value will be a row entry
my_df = pd.DataFrame([{"key1":value1,"…":…,…},{"key1":value2,"…":…,…},{…}])
my_df = pd.read_csv("my_data.csv") #reads directly into a data frame
pd.read_csv("my_data.csv",parse_date=["my_date_header"]) #makes a datetime object from column
pd.read_csv("my_data.csv",index_col="header") #sets column "header" to index
my_df.to_csv("my_exported_data.csv") #creates a csv file from a data frame
.agg(my_func) #aggregates on a function
.cummax() #cumulative max in pandas
.cummin() #cumulative min in pandas
.cumprod() #cumulative viz. running product in pandas
.cumsum() #cumulative viz. running sum in pandas
.dt #datetime object viz. date column in a data frame
.dt.day #accesses the year of a date column in a data frame
.dt.year #accesses the year of a date column in a data frame
.dt.month #accesses the year of a date column in a data frame
.groupyby() #aggregates on groups
.groupby("header1")["header2"].mean() #average of "header2" by "header1"
.groupby("header1")[["header2","header3"]].sum()) #sums "header2" and "header3" on "header1"
.groupby(["header1","header2"])[["header3","header4"]].min()) #min of "header1" and "header2" on "header3" and "header4"
.groupby("header1")["header2"].agg([func1,func2]) #aggregations "header2" by "header1"
.max() #gives the max
.mean() #gives the mean
.min() #gives the min
.mode() #gives the mode
.pivot_table() #creates a pivot table in pandas for means
.pivot_table("header") #creates a pivot table in pandas for means of "header"
.pivot_table(aggfunc=my_func) #creates a pivot table in pandas for my_func
.pivot_table(aggfunc=[my_func1,my_func2]) #creates a pivot table in pandas for functions
.pivot_table(index="header") #pivot on row "header"
.pivot_table(values="header") #pivot on value "header"
.pivot_table(columns="header") #split over column "header"
.pivot_table(margins=True) #have total values for rows and columns
.pivot_table(fill_value=0) #use 0 as fill value ofr n/a
.pivot_table(axis="index") #pviot table over the index
.pivot_table(axis="columns") #pviot table over the columns
.pivot_table( … ).loc["index0":"index1"]) #selects "index0" to "index1" from pivot table
.std() #give the standard deviation
.value_counts() #counts values in a dataframe
.value_counts(sort=True) #counts values in a dataframe and sorts
.value_counts(normalize=True) #counts values in a dataframe as fraction of 1
.var() #gives the variance
my_df["header"] #reads a column incl. row labels into a series object
my_df[["header_name1",… ]] #reads columns into data frame object
my_df[n:m] #gives the a slice from row n to m-1 from the data frame
my_df["header_name"] # subselects a data frame as a series object
my_df["header"] > n #gives True/False for values > n in column "header"
my_df[my_df["header"] == "match"] #gives rows where value in header is "match"
my_df[(my_df["header1"] == match2) & (my_df["header2"] == "match2")] #gives rows with two conditions
my_df[my_df["header"] > n] #gives rows with value > n in column "header"
my_df[np.logical_and(my_df["header"] > n, my_df["header"] < m)] #gives rows with value between n and m
my_df[my_df["header"].isin(["match1","match2"])] #filters for multiple matches in column "header"
.loc #accesses data frames by their names, writes data too
.loc(my_indices_list) #show rows where the index is in the list my_indices_list
.loc["label1", … ] #accesses rows by their labels as a series object
.loc[:, "header1"] #accesses all rows for the header as a series object
.loc[["label1", … ]] #accesses rows by their labels as a data frame
.loc[["label1", … ], ["header1",… ]] #accesses rows by their labels and columns by their headers
.loc[[n:m], ["header1",… ]] #accesses rows n to m-1 by their labels and columns by their headers
.loc[:, ["header1",… ]] #accesses all rows, and columns by their headers
.loc[["name1",… ]] #gets the data by indices
.loc["label0":"label1"] #gets everthing inside the labels in a sorted data frame
.loc[("name1","nameA"):("name2","nameB")]) #gets everthing inside the mulitlevel labels
.loc[[("name1","nameA"),("name2","nameB")]] #gets the data of multilevel hierachical indices
.loc[:,"header0":"header1"] #gets everything in the columns "header0" to "header1"
.pivot_table( … ).loc["index0":"index1"]) #selects "index0" to "index1" from pivot table
.iloc #accesses data frames by their indexes
.drop_duplicates #deletes duplicates
.drop_duplicates(subset="header") #deletes duplicates for "header"
.drop_duplicates(subset=["header1","header2"]) #deletes duplicates for header1 and header2
.dropna() #deletes rows with missing data
.fillna() replaces null values
.fillna(0) replaces null values with 0
.sort_index() #sorts on the index
.sort_index(acscending=False) #sorts on the index in descending order
.sort_index(level="name1") #sorts on level "name1"
.sort_index(level=["name1","name2"]) #sorts on a multilevel hierarchical index
.sort_values() #sorts columns of a dataframe in pandas
.sort_values( … , ascending = False) #set sorting order descending
.sort_values( [ … , … ] ) #sort by multiple columns
.sort_values( [ … , … ] , ascending = [ False , … ]) #sort by multiple columns
my_df = pd.read_csv("my_data.csv", index_col = 0)
for value in my_df:
print(value) #gives you the column names of the data frame
for label, observation in my_df.iterrows():
print(label)
print(observation) #this is an entire series object, and memory resource demanding
print(observation["header"]) #show the values the column "header"
my_df.loc[label, "new_name"] = o["name"].upper() #adds column with upper case value
my_df = pd.read_csv("my_data.csv", index_col = 0)
my_df["length"] = my_df["name"].apply(len) #gives length for name in column "length"
a series is like a one-dimensional array incl. row labels
a number of series build a dataframe
pd.DataFrame(my_object)
mydata_df["header_name"] # subselects a data frame as a series
mydata_df[["header1",… ]] # subselects a data frame as a data frame
import pandas as pd
mydata_df = pd.DataFrame(mydata)
print(mydata_df.head())
print(my_df["header"].agg(my_func))
print(my_df[["header1", "header2", "header3"]].agg([my_func1, my_func2]))
my_daf["header"].cumsum()
my_df.drop_duplicates(subset = "header")
my_df.drop_duplicates(subset = ["header1","header2"])
print(my_df.groupby("item")[["header1","header"]].sum()) #sums "header1" and "header2" over "item"