Preprocessing

General examples

import pandas as pd

import numpy as np

from matplotlib import pyplot as plt

%matplotlib inline

data = pd.read_csv('xxx.csv', header=None)

len(data)

data.head(10)

data.tail()

row, col = data.shape

data.describe()

data['y'].describe()

data.info()

age_binning = pd.cut(data['age'], [0, 20, 30, 40, 50])

age = pd.crosstab(age_binning, data['y'], margins=True)

age['rate'] = age[1] / age['All']

data['year'] = data['datetime'].apply(lambda x: x.split('-')[0])

data['year'] = data['year'].astype(int)  # np.int was removed from NumPy; use the built-in int

# Missing values

data.isnull().any()

data.isnull().sum()

data.fillna(0)

data['kcal'].fillna(data['kcal'].mean())

data.dropna(subset=['kcal'])

data['kcal'].value_counts()

# Correlation

data[['y', 'week']].corr()

data['y'].mean()

data['y'].median()

data[data['y'] >= 100]

data[data['week'] == 'MON'].sort_values(by='y', ascending=False)

data[data['week'] == 'MON']['y'].mean()

data[data['temp'] >= data['temp'].mean()]

data[['y', 'week']].boxplot(by='week')

data.plot.scatter(x='kcal', y='y', figsize=(5, 5))

# or

plt.scatter(data['kcal'], data['y'], label='xxx')

plt.plot(data['kcal'], data['y'], label='xxx', color='red')

plt.legend()

plt.show()

ax = data['y'].plot(figsize=(12, 4), title='Graph')

ax.set_xlabel('time')

ax.set_ylabel('y')

plt.axvline(x=data['y'].mean(), color='red')

data['y'].plot.hist(figsize=(12, 4), title='Graph', grid=True)

plt.savefig('sample_fig.png')

train['pred'] = pred

train['res'] = train['y'] - train['pred']

train.sort_values(by='res')

★Random numbers

np.random.seed(0)

num = np.random.rand(5)  # uniform random numbers in [0, 1)

num = np.random.randn(5)  # samples from the standard normal distribution

num = np.random.binomial(100, 0.5, size=5)  # samples from a binomial distribution

df = pd.DataFrame(np.random.rand(10, 4))

b_list = np.random.choice(a_list, 5)  # sample 5 elements from a_list

★CSV-related

df = pd.read_csv("xxx.data", header=None)

df.columns = ["aaa", "bbb", ...]

import csv

with open("xxx.csv", "w") as csvfile:

writer = csv.writer(csvfile, lineterminator="¥n")

writer.writerow(["aaa", "bbb", ...])

data = {
    "city": ["aaa", "bbb"],
    "year": [1998, 2000]
}

df = pd.DataFrame(data)

df.to_csv("xxx.csv")

★3σ rule (outlier removal)

col = 'y'

mean = df.mean()

sigma = df.std()

low = mean[col] - 3 * sigma[col]

high = mean[col] + 3 * sigma[col]

df2 = df[(df[col] > low) & (df[col] < high)]

cols = df.columns

_df = df

for col in cols:
    low = mean[col] - 3 * sigma[col]
    high = mean[col] + 3 * sigma[col]
    _df = _df[(_df[col] > low) & (_df[col] < high)]

★Missing-value handling

df.replace('None', np.nan, inplace=True)

df.dropna(subset=['thickness'], inplace=True)

df['thickness'] = df['thickness'].fillna(1)  # assignment is preferred over inplace fillna on a column

df.replace('None', np.nan, inplace=True)

df['thickness'] = df['thickness'].astype('float64')

thickness_mean = df['thickness'].mean()

df['thickness'] = df['thickness'].fillna(thickness_mean)

★Pandas vs NumPy

NumPy → Pandas

array = np.arange(10)

series = pd.Series(array)

array = np.arange(20).reshape(4,5)

df = pd.DataFrame(array)

df = pd.DataFrame(array, index=np.linspace(0,10,4))

Pandas → NumPy

array = df["grade"].values

array.mean()

dummies = pd.get_dummies(df)

array = dummies.values

NumPy

Basic profiling

・Compute representative statistics (mean, variance, standard deviation, etc.)

・Plot a scatter matrix to look for trends

・Display the correlation matrix to look for trends (see the sketch below)
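A minimal sketch of this basic profiling flow, assuming a CSV of numeric columns (the file name is a placeholder):

import pandas as pd
from matplotlib import pyplot as plt

df = pd.read_csv('xxx.csv')  # placeholder file
print(df.describe())  # representative statistics per column
pd.plotting.scatter_matrix(df, figsize=(8, 8))  # pairwise scatter plots
plt.show()
print(df.corr())  # correlation matrix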

Preprocessing

・Handling and imputing missing values (drop or fill incomplete records)

・Name normalization (unify inconsistent spellings and notations)

・Normalization and standardization

・Changing the time granularity of time-series data

・Unifying image sizes, converting images to grayscale

・Tokenizing text, removing unneeded characters, unifying half-width and full-width characters

・To fix class imbalance, use imbalanced-learn, SMOTE, and the like (see the sketch below)
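For the class-imbalance item, a minimal oversampling sketch with imbalanced-learn's SMOTE; the toy 9:1 dataset and every parameter are illustrative:

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)  # toy 9:1 data
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_resample(X, y)  # synthesize new minority-class samples
print(Counter(y), Counter(y_res))  # class counts before / after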

Stop Feeding Garbage To Your Model

★EDA(Exploratory Data Analysis)

Statistics

・Type, value distribution, missing values, and outliers of each column

・Correlation and relationships between the target variable and each feature

・Mean, standard deviation, max, min, and quantiles of each variable

・Number of distinct values of each categorical variable

・Number of missing values in each variable

・Correlation coefficients between variables (see the sketch below)
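A compact sketch of these checks on a toy frame (column names are illustrative):

import pandas as pd

df = pd.DataFrame({'y': [10.0, 12.0, None], 'cat': ['a', 'b', 'a']})  # toy data
df.info()  # dtype and non-null count per column
print(df.describe())  # mean, std, min/max, quantiles
print(df.isnull().sum())  # missing values per column
print(df.select_dtypes('object').nunique())  # cardinality of categorical columns
print(df.select_dtypes('number').corr())  # correlations between numeric columns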

Visualization methods (a few are sketched after this list)

・Bar chart

・Box plot, violin plot

・Scatter plot

・Line chart

・Heatmap

・Histogram

・Q-Q plot

・t-SNE, UMAP
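A sketch of a few of these plots with matplotlib and seaborn (assumes seaborn 0.11+; the data is random toy data):

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

df = pd.DataFrame({'y': np.random.randn(100),
                   'grp': np.random.choice(['a', 'b'], 100)})  # toy data
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
sns.boxplot(data=df, x='grp', y='y', ax=axes[0])  # box plot per group
sns.histplot(df['y'], ax=axes[1])  # histogram
sns.heatmap(df.select_dtypes('number').corr(), ax=axes[2])  # correlation heatmap
plt.show()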

Preprocessing that targets data structure: extraction, aggregation, joining, splitting, generation, pivoting

Preprocessing that targets data content: numeric, categorical, datetime, text, geolocation

selection

df.iloc[:, 0:6] # not good

df[['reserve_id', 'hotel_id']]

df.loc[:, ['reserve_id', 'hotel_id']]

df.drop(['people_num', 'total_price'], axis=1, inplace=True)  # drop columns

df[(df['date'] >= '2018-12-24') & (df['date'] <= '2018-12-30')] # not good

df.loc[(df['date'] >= '2018-12-24') & (df['date'] <= '2018-12-30'), :] # not good

df.query('"2018-12-24" <= date <= "2018-12-30"')

df.sample(frac=0.5)  # 50% random sampling

target = pd.Series(df['customer_id'].unique()).sample(frac=0.5)  # sample 50% of customers

df[df['customer_id'].isin(target)]  # keep all rows of the sampled customers

aggregation

result = df \
    .groupby('hotel_id') \
    .agg({'reserve_id': 'count', 'customer_id': 'nunique'})
result.reset_index(inplace=True)
result.columns = ['hotel_id', 'rsv_cnt', 'cus_cnt']

result = df \
    .groupby(['hotel_id', 'people_num'])['total_price'] \
    .sum().reset_index()
result.rename(columns={'total_price': 'price_sum'}, inplace=True)

result = df \
    .groupby('hotel_id') \
    .agg({'total_price': ['max', 'min', 'mean', 'median',
                          lambda x: np.percentile(x, q=20)]}) \
    .reset_index()
result.columns = ['hotel_id', 'price_max', 'price_min', 'price_mean',
                  'price_median', 'price_20per']

result = df \
    .groupby('hotel_id') \
    .agg({'total_price': ['var', 'std']}).reset_index()
result.columns = ['hotel_id', 'price_var', 'price_std']
result.fillna(0, inplace=True)  # var/std are NaN for groups with a single row

df['total_price'].round(-3).mode()  # round to the nearest 1,000, then take the mode

df['reserve_datetime'] = pd.to_datetime(
    df['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)

df['log_no'] = df \
    .groupby('customer_id')['reserve_datetime'] \
    .rank(ascending=True, method='first')  # reservation order per customer, ascending

split

from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold

train_data, test_data, train_target, test_target = \
    train_test_split(df.drop('fault_flg', axis=1),
                     df[['fault_flg']],
                     test_size=0.2)

train_data.reset_index(inplace=True, drop=True)

test_data.reset_index(inplace=True, drop=True)

train_target.reset_index(inplace=True, drop=True)

test_target.reset_index(inplace=True, drop=True)

row_no_list = list(range(len(train_target)))

# data split for cross-validation

k_fold = KFold(n_splits=4, shuffle=True)

for train_cv_no, test_cv_no in k_fold.split(row_no_list):
    train_cv = train_data.iloc[train_cv_no, :]
    test_cv = train_data.iloc[test_cv_no, :]
    ...

★Strategies for class imbalance (two are sketched after this list)

・Resampling

・Oversampling: simple oversampling, SMOTE (Synthetic Minority Over-sampling Technique), ADASYN (Adaptive Synthetic Sampling Approach)

・Undersampling: simple undersampling, clustering, Tomek links

・Adjusting the loss function (e.g. class weights)

・Reframing the task as anomaly detection

・Turning binary classification into multi-class classification

・EasyEnsemble
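A minimal sketch of two of these strategies, simple undersampling via imbalanced-learn and an inverse-frequency class weight in scikit-learn; the dataset and all parameters are illustrative:

from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)  # toy 9:1 data

# strategy 1: randomly undersample the majority class
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, y)

# strategy 2: keep all rows but reweight the loss by inverse class frequency
clf = LogisticRegression(class_weight='balanced').fit(X, y)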

import numpy as np

from numpy import nan as NA

x = np.array([1, 2, 3])

xc = x - x.mean()  # centering

m = np.array([[1, 2], [3, 4]])

np.linalg.inv(m)  # matrix inverse (requires a square 2-D array; x above is 1-D)

np.dot(x, xc)  # dot product (inner product for 1-D arrays)

np.array([1, 2, 3])  # shape (3,)

-> np.array([[1, 2, 3]])  # shape (1, 3): a row vector

-> np.array([[1], [2], [3]])  # shape (3, 1): a column vector (the row vector's .T)

1-D arrays

arr = np.arange(4) # [0 1 2 3]

arr[0:2] = 9 # [9 9 2 3]

arr[arr % 2 == 0] # [2]

array_1d = np.array([1,2,3,4])

array_2d = np.array([ [1,2], [3,4] ])

array_3d = np.array([ [[1,2], [3,4]], [[5,6], [7,8]] ])

arr2 = arr1[:]  # a view (unlike Python list slicing, which copies)

arr2 = arr1.copy()

np.abs(arr)  # absolute value

np.exp(arr)  # e to the power of each element

np.sqrt(arr)  # square root

np.unique(arr)

np.union1d(arr1, arr2)  # union

np.intersect1d(arr1, arr2)  # intersection

np.setdiff1d(arr1, arr2)  # set difference

from numpy.random import randint

arr = randint(0, 11, (5, 2))  # integers in [0, 10], shape (5, 2)

arr = np.random.rand(3)  # floats in [0, 1)

2-D arrays

arr.shape  # array shape (not the element count; that is arr.size)

arr.reshape(4, 2)  # reshape into a 4x2 array

arr.sum()

arr.mean(axis=0)  # per column

arr.sum(axis=1)  # per row

arr.argmax(axis=0)  # index of the maximum in each column

arr = np.arange(25).reshape(5, 5)

arr[[1, 3, 0]]  # rows in the given order (fancy indexing)

Transpose

arr.T

np.transpose(arr)

arr.sort(0)  # sort each column in place

arr.sort(1)  # sort each row in place

new_arr = np.sort(arr)  # returns a sorted copy

x = np.arange(0, 11, 2) # [0 2 4 6 8 10]

x = np.linspace(0, 15, 4)  # [ 0.  5. 10. 15.]

array1 = np.array([[1], [1], [1]])  # column vector, shape (3, 1)

array2 = np.array([[1, 1, 1]])  # row vector, shape (1, 3)

array1 = np.array([1, 1, 1]).reshape((3, 1))  # column vector

array2 = np.array([1, 1, 1]).reshape((1, 3))  # row vector

Pandas

import pandas as pd

Series

fruits = {"banana": 3, "orange": 2}

series = pd.Series(fruits)

index = ["banana", "orange"]

data = [3, 2]

series = pd.Series(data, index=index)

index = series.index

data = series.values

series = series[(series >= 5) & (series < 10)]

series = pd.concat([series, pd.Series([9], index=["apple"])])  # Series.append was removed in pandas 2.0

series = pd.concat([series, pd.Series({"apple": 9})])

series.drop("apple")

series[0:2]

series[["banana", "orange"]]

items = series.sort_index()

items = series.sort_values()

DataFrame

data = {
    "fruits": ["banana", "orange"],
    "year": [2001, 2002]
}

df = pd.DataFrame(data)

series1 = pd.Series(data1, index=index)

series2 = pd.Series(data2, index=index)

df = pd.DataFrame([series1, series2])

df.index = [2, 3]

df = pd.concat([df, series.to_frame().T], ignore_index=True)  # add a row (DataFrame.append was removed in pandas 2.0)

df["mango"] = series # 列追加

pd.concat([df1, df2]).sort_values(by="ID", ascending=True).reset_index(drop=True)

df = df.loc[range(2, 6), ["banana", "orange"]]  # by index labels and column names

df = df.iloc[range(2, 6), [0, 2]]  # by row and column positions

df = df.iloc[2:6, [0, 2]]

df = df.drop(np.arange(2, 11, 3))  # drop rows

df = df.drop("banana", axis=1) # 列削除

df = df.sort_values(by="year", ascending=True)

df = df.sort_values(by=["year", "time"], ascending=True)

df = df[df.index % 2 == 0]

df = df.loc[df["apple"] >= 5][df["orange"] >= 6]

df = pd.concat([df1, df2], axis=0)  # vertically (stack rows)

df = pd.concat([df1, df2], axis=1, keys=["X", "Y"])  # horizontally, with a column MultiIndex

Y_banana = df["Y", "banana"]

df = pd.merge(df1, df2, on="fruits", how="inner")

df = pd.merge(df1, df2, on="fruits", how="outer")

df = pd.merge(df1, df2, left_on="c_id", right_on="id", how="inner")

df = pd.merge(df1, df2, left_on="c_id", right_index=True, how="inner")

df = df.head(3)

df = df.tail(3)

df = df * 2 # df + df

df = df ** 2 # df * df

df = np.sqrt(df)

df_dec = df.describe().loc[["mean", "max", "min"]]

df_diff = df.diff(-2, axis=0)  # difference from the row two positions ahead

df = df.groupby("Region")

mean_df = df.mean()

df.dropna()  # listwise deletion

df[[0, 1, 2]].dropna()  # pairwise deletion

df.fillna(0)

df.ffill()  # fill with the previous value (fillna(method="ffill") is deprecated)

df.fillna(df.mean())

df["xxx"].mean()

df.drop_duplicates()

df["region"] = df["city"].map(city_map)

year_bins = [1980, 1985, 1990]

cut_data = pd.cut(df.year, year_bins)

cut_data = pd.cut(df["year"], year_bins)

cut_data.value_counts()  # counts per bin (top-level pd.value_counts is deprecated)

group_name = ["aaa", "bbb"]

cut_data = pd.cut(df.year, year_bins, labels=group_name)

pd.cut(df.year, 2)  # number of bins

Improvements (derived features)

df["sum"] = df.sum(axis=1)

df["max"] = df.max(axis=1)

df["min"] = df.min(axis=1)

df["mean"] = df.mean(axis=1)

# the same thing as a loop, plus a custom row-wise function
f_diff = lambda row: row["xxx"] - row["yyy"]  # row-wise difference of two columns
for name, func in [("sum", "sum"), ("max", "max"), ("min", "min"),
                   ("mean", "mean"), ("diff", f_diff)]:
    df[name] = df[base_cols].apply(func, axis=1)