Post date: Jun 18, 2015 9:18:43 PM
This notebook shows how to output pivot table such that the number intervals of row/columns are properly sorted.
In [14]:
import pandas as pd import numpy as np
In [85]:
df = pd.DataFrame(data={'x':np.random.randint(0,10000,1000), 'y':np.random.randint(0,1000,1000), 'z':[random.choice(['a','b','c']) for i in range(1000)]})
In [86]:
mybin_1 = [0, 10, 20, 50, 1e+2, 2e+2, 5e+2, 1e+3, 2e+3, 5e+3, 1e+4, 2e+4, 5e+4] mybin_2 = [0, 10, 20, 1e+2, 2e+2, 1e+3, 1e+4, 5e+4]
In [87]:
df['x_bin'] = pd.cut(x=df.x, bins=mybin_1) df['y_bin'] = pd.cut(x=df.y, bins=mybin_2)
In [88]:
df[:10]
Out[88]:
This is how you normally use crosstab, the row and column order is based on string-order.
In [89]:
pd.crosstab(df.x_bin, df.y_bin)
Out[89]:
This is how to make the order looks good.
In [122]:
def relabel(myitem, right_most='inf'): """ This function re-label the row/column of a data frame. Especially, the one that comes from pd.cut. [0, 1, 2, 3] --> ['[0,1)', '[1,2)', '[2,3)', '[3,right_most)'] """ new_item = ['']*len(myitem) for i in np.arange(len(myitem)-1): new_item[i] = "[%d,%d)" % (myitem[i], myitem[i+1]) i = len(myitem)-1 new_item[i] = "[%d,%s)" % (myitem[i], right_most) return new_item def crosstab2(X1, X2, mybin1=None, mybin2=None): """ Produce pivolt table similar to crosstab except that the order of row or column is by the interval. X1 and X2: TimeSeries. mybin1 and mybin2: list of binning boundary. kittipat@ June 18, 2015 """ if mybin1: bin1 = pd.cut(x=X1, bins=mybin1, labels=[s for s in mybin1[:-1]]) else: bin1 = X1 if mybin2: bin2 = pd.cut(x=X2, bins=mybin2, labels=[s for s in mybin2[:-1]]) else: bin2 = X2 df_pivot = pd.crosstab(bin1, bin2) if mybin1: df_pivot.index = relabel(df_pivot.index, str(mybin1[-1])) if mybin2: df_pivot.columns = relabel(df_pivot.columns, str(mybin2[-1])) return df_pivot
In [124]:
crosstab2(df.x, df.y, mybin_1, mybin_2)
Out[124]:
In [126]:
crosstab2(df.x, df.z, mybin_1)
Out[126]:
In [127]:
crosstab2(df.z, df.x, [], mybin_1)
Out[127]:
In [ ]: