前処理
一般例
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
data = pd.read_csv('xxx.csv', header=None)
len(data)
data.head(10)
data.tail()
row, col = data.shape
data.describe()
data['y'].describe()
data.info()
age_bining = pd.cut(data['age'], [0,20,30,40,50])
age = pd.crosstab(age_bining, data['y'], margins=True)
age['rate'] = age[1] / age['All']
data['year'] = data['datetime'].apply(lambda x: x.split('-')[0])
data['year'] = data['year'].astype(np.int)
# Missing values
data.isnull().any()  # per column: does it contain any NaN?
data.isnull().sum()  # per column: how many NaN?
data.fillna(0)  # NOTE: returns a new frame — not in-place without inplace=True
data['kcal'].fillna(data['kcal'].mean())  # impute with the column mean
data.dropna(subset=['kcal'])  # drop rows where 'kcal' is NaN
data['kcal'].value_counts()  # frequency of each distinct value
# Correlation and simple aggregates
data[['y', 'week']].corr()
data['y'].mean()
data['y'].median()
data[data['y'] >= 100]  # fix: original had "> =" (space inside the operator -> SyntaxError)
data[data['week'] == 'MON'].sort_values(by='y', ascending=False)
data[data['week'] == 'MON']['y'].mean()
data[data['temp'] >= data['temp'].mean()]
# Visualisation
data[['y', 'week']].boxplot(by='week')
data.plot.scatter(x='kcal', y='y', figsize=(5, 5))
# or, equivalently, with pyplot directly
plt.scatter(data['kcal'], data['y'], label='xxx')
plt.plot(data['kcal'], data['y'], label='xxx', color='red')
plt.legend()
plt.show()
ax = data['y'].plot(figsize=(12, 4), title='Graph')
ax.set_xlabel('time')
ax.set_ylabel('y')
plt.axvline(x=data['y'].mean(), color='red')  # vertical line at the mean
data['y'].plot.hist(figsize=(12, 4), title='Graph', grid=True)
plt.savefig('sample_fig.png')
# Residual check (assumes `train` and `pred` exist — TODO confirm in the caller)
train['pred'] = pred
train['res'] = train['y'] - train['pred']
train.sort_values(by='res')
★乱数
# Random numbers.
# Fix: np.random is a module, not a callable — np.random() raises TypeError.
np.random.seed(0)
num = np.random.rand(5)                       # uniform on [0, 1)
num = np.random.randn(5)                      # standard normal
num = np.random.binomial(100, 0.5, size=(5))  # binomial distribution
df = pd.DataFrame(np.random.rand(10, 4))
b_list = np.random.choice(a_list, 5)          # sample 5 items (assumes a_list exists)
★CSV関連
# CSV input / output
df = pd.read_csv("xxx.data", header=None)
df.columns = ["aaa", "bbb", ...]  # placeholder list from the notes
import csv
# Fix: the line terminator was a mojibake'd yen sign ("¥n") — must be "\n".
# NOTE(review): the csv docs recommend open(..., newline="") instead — confirm.
with open("xxx.csv", "w") as csvfile:
    writer = csv.writer(csvfile, lineterminator="\n")
    writer.writerow(["aaa", "bbb", ...])
# Build a DataFrame from a dict of columns and save it
data = {
    "city": ["aaa", "bbb"],
    "year": [1998, 2000]
}
df = pd.DataFrame(data)
df.to_csv("xxx.csv")
★3σ法
# 3-sigma rule: keep only rows within mean ± 3*std for one column
col = 'y'
mean = df.mean()
sigma = df.std()
low = mean[col] - 3 * sigma[col]
high = mean[col] + 3 * sigma[col]
df2 = df[(df[col] > low) & (df[col] < high)]
# Same filter applied to every column in turn
# (fix: the loop body had lost its indentation; `col` deliberately shadows the one above)
cols = df.columns
_df = df
for col in cols:
    low = mean[col] - 3 * sigma[col]
    high = mean[col] + 3 * sigma[col]
    _df = _df[(_df[col] > low) & (_df[col] < high)]
★欠損値処理
# Turn the literal string 'None' into a real NaN so pandas treats it as missing
df.replace('None', np.nan, inplace=True)
df.dropna(subset=['thickness'], inplace=True)  # option 1: drop rows missing 'thickness'
df['thickness'].fillna(1, inplace=True)  # option 2: impute with a constant
df.replace('None', np.nan, inplace=True)
df['thickness'] = df['thickness'].astype('float64')  # cast so mean() works
thickness_mean = df['thickness'].mean()
df['thickness'].fillna(thickness_mean, inplace=True)  # option 3: impute with the mean
★Pandas vs NumPy
NumPy → Pandas
array = np.arange(10)
series = pd.Series(array)  # 1-D NumPy array -> Series
array = np.arange(20).reshape(4,5)
df = pd.DataFrame(array)  # 2-D NumPy array -> DataFrame
df = pd.DataFrame(array, index=np.linspace(0,10,4))  # with a custom (float) index
Pandas → NumPy
array = df["grade"].values  # Series -> underlying NumPy array
array.mean()
dummies = pd.get_dummies(df)  # one-hot encode categorical columns
array = dummies.values  # DataFrame -> NumPy array
Numpy
基礎集計
・各種代表値(平均、分散、標準偏差など)を計算
・散布図行列をプロットして傾向を調べる
・相関行列を表示して傾向を調べる
前処理
・欠損値の処理、補間(空欄のあるデータを消去・補間)
・名寄せ(表記の揺れを統一)
・正規化や標準化
・時系列データの時間粒度の変更
・画像データのサイズ統一化、グレースケール化
・テキストデータの分かち書き、不要文字の削除、半角全角の統一
・データの不均衡を解消するのには、imbalanced-learnとかSMOTEとか利用
Stop Feeding Garbage To Your Model
★EDA(Exploratory Data Analysis)
統計量
・各カラムの型や値の分布、欠損値、外れ値
・目的関数と各変数の相関や関係性
・変数の平均、標準偏差、最大、最小、分位点
・カテゴリ変数の値の種類数
・変数の欠損値の数
・変数間の相関係数
可視化手法
・棒グラフ
・箱ひげ図、バイオリンプロット
・散布図
・折れ線グラフ
・ヒートマップ
・ヒストグラム
・Q-Qプロット
・t-SNE、UMAP
データ構造を対象とした前処理:抽出、集約、結合、分割、生成、展開
データ内容を対象とした前処理:数値、カテゴリ、日時、文字、位置情報
selection
# Column / row selection (prefer name-based over position-based selection)
df.iloc[:, 0:6] # not good: positional, breaks if columns move
df[['reserve_id', 'hotel_id']]
df.loc[:, ['reserve_id', 'hotel_id']]
df.drop(['people_num', 'total_price'], axis=1, inplace=True) # drop columns
df[(df['date'] >= '2018-12-24') & (df['date'] <= '2018-12-30')] # not good
df.loc[(df['date'] >= '2018-12-24') & (df['date'] <= '2018-12-30'), :] # not good
df.query('"2018-12-24" <= date <= "2018-12-30"')  # preferred form
df.sample(frac=0.5) # 50% random row sampling
# Sample by customer (unit = customer, not row)
target = pd.Series(df['customer_id'].unique()).sample(frac=0.5)
df[df['customer_id'].isin(target)]
aggregation
# Count reservations and distinct customers per hotel
result = df \
.groupby('hotel_id') \
.agg({'reserve_id': 'count', 'customer_id': 'nunique'})
result.reset_index(inplace=True)
result.columns = ['hotel_id', 'rsv_cnt', 'cus_cnt']
# Total price per (hotel, party size)
result = df \
.groupby(['hotel_id', 'people_num'])['total_price'] \
.sum().reset_index()
result.rename(columns={'total_price': 'price_sum'}, inplace=True)
# Several statistics at once, incl. the 20th percentile via a lambda
result = df \
.groupby('hotel_id') \
.agg({'total_price': ['max', 'min', 'mean', 'median',
lambda x: np.percentile(x, q=20)]}) \
.reset_index()
result.columns = ['hotel_id', 'price_max', 'price_min', 'price_mean',
'price_median', 'price_20per']
result = df \
.groupby('hotel_id') \
.agg({'total_price': ['var', 'std']}).reset_index()
result.columns = ['hotel_id', 'price_var', 'price_std']
result.fillna(0, inplace=True)  # presumably for 1-row groups whose var/std is NaN
df['total_price'].round(-3).mode() # round to the nearest 1000, then take the mode
df['reserve_datetime'] = pd.to_datetime(
df['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)
# Per-customer sequence number of each reservation, by datetime
df['log_no'] = df \
.groupby('customer_id')['reserve_datetime'] \
.rank(ascending=True, method='first') # ascending order
split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Hold-out split: 80% train / 20% test
train_data, test_data, train_target, test_target = \
    train_test_split(df.drop('fault_flg', axis=1),
                     df[['fault_flg']],
                     test_size=0.2)
train_data.reset_index(inplace=True, drop=True)
test_data.reset_index(inplace=True, drop=True)
train_target.reset_index(inplace=True, drop=True)
test_target.reset_index(inplace=True, drop=True)
row_no_list = list(range(len(train_target)))
# Cross-validation splits (fix: the loop body had lost its indentation)
k_fold = KFold(n_splits=4, shuffle=True)
for train_cv_no, test_cv_no in k_fold.split(row_no_list):
    train_cv = train_data.iloc[train_cv_no, :]
    test_cv = train_data.iloc[test_cv_no, :]
    ...
★应对样本不均衡的策略
・重采样resampling
・上采样:简单上采样,SMOTE(Synthetic Minority Over-sampling Technique),ADASYN(Adaptive Synthetic Sampling Approach)
・下采样:简单下采样,聚类Cluster,Tomek links
・调整损失函数
・异常值检测框架
・二分类变成多分类
・EasyEnsemble
import numpy as np
from numpy import nan as NA

x = np.array([1, 2, 3])
xc = x - x.mean()                     # centering (subtract the mean)
# Fix: np.linalg.inv requires a square 2-D matrix; the original passed the 1-D x,
# which raises LinAlgError. Demonstrate with a 2x2 matrix instead.
m = np.array([[1.0, 2.0], [3.0, 4.0]])
m_inv = np.linalg.inv(m)              # matrix inverse
np.dot(x, xc)                         # dot product
# Shape notation (these lines were bare '->' prose, not Python — kept as comments):
# np.array([1,2,3])         -> shape (3,)
# np.array([[1,2,3]])       -> shape (1, 3)   row vector
# np.array([[1],[2],[3]])   -> shape (3, 1)   column vector, == np.array([[1,2,3]]).T
1次元
arr = np.arange(4) # [0 1 2 3]
arr[0:2] = 9 # [9 9 2 3]  (slice assignment broadcasts the scalar)
arr[arr % 2 == 0] # [2]  (boolean-mask selection)
array_1d = np.array([1,2,3,4])
array_2d = np.array([ [1,2], [3,4] ])
array_3d = np.array([ [[1,2], [3,4]], [[5,6], [7,8]] ])
arr2 = arr1[:] # a VIEW — unlike Python list slicing, numpy slices do not copy
arr2 = arr1.copy()  # independent copy
np.abs(arr) # absolute value
np.exp(arr) # e raised to each element
np.sqrt(arr) # square root
np.unique(arr)
np.union1d(arr1, arr2) # set union
np.intersect1d(arr1, arr2) # set intersection
np.setdiff1d(arr1, arr2) # set difference
from numpy.random import randint
arr = randint(0, 11, (5, 2)) # integers 0-10, shape (5, 2)
arr = np.random.rand(3) # floats in [0, 1)
2次元
arr.shape # dimensions, e.g. (rows, cols) — not the element count (that is arr.size)
arr.reshape(4, 2) # reshape to 4x2
arr.sum()
arr.mean(axis=0) # per column
arr.sum(axis=1) # per row
arr.argmax(axis=0) # index of the maximum within each column
arr = np.arange(25).reshape(5, 5)
arr[[1, 3, 0]] # fancy indexing: rows in the given order
転置
arr.T  # transpose (view)
np.transpose(arr)  # same as arr.T
arr.sort(0) # in-place sort along axis 0 (column-wise)
arr.sort(1) # in-place sort along axis 1 (row-wise)
new_arr = np.sort(arr)  # sorted copy (arr unchanged)
x = np.arange(0, 11, 2) # [0 2 4 6 8 10]
x = np.linspace(0, 15, 4) # [ 0.  5. 10. 15.]  — 4 evenly spaced points incl. both endpoints
array1 = np.array([[1], [1], [1]]) # column vector, shape (3, 1)
array2 = np.array([[1, 1, 1]]) # row vector, shape (1, 3)
array1 = np.array([1, 1, 1]).reshape((3, 1)) # column vector
array2 = np.array([1, 1, 1]).reshape((1, 3)) # row vector
Pandas
import pandas as pd
Series
fruits = {"banana": 3, "orange": 2}
series = pd.Series(fruits)  # from a dict: keys become the index
index = ["banana", "orange"]
data = [3, 2]
series = pd.Series(data, index=index)  # from parallel data/index lists
index = series.index
data = series.values
series = series[series >= 5][series < 10]  # chained boolean filters
# NOTE(review): Series.append was removed in pandas 2.0 — use pd.concat on modern pandas
series.append( pd.Series([9], index=["apple"]) )
series.append( pd.Series({"apple": 9}) )
series.drop("apple")  # returns a copy without that label
series[0:2]  # positional slice
series[["banana", "orange"]]  # select by labels
items = series.sort_index()
items = series.sort_values()
DataFrame
# DataFrame construction
data = {
    "fruits": ["banana", "orange"],
    "year": [2001, 2002]
}
df = pd.DataFrame(data)
series1 = pd.Series(data1, index=index)
series2 = pd.Series(data2, index=index)
df = pd.DataFrame([series1, series2])  # one row per Series
df.index = [2, 3]
# NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat there
df = df.append(series, ignore_index=True)  # append a row
df["mango"] = series  # add a column
df1.append(df2).sort_values(by="ID", ascending=True).reset_index(drop=True)
# Selection
df = df.loc[range(2, 6), ["banana", "orange"]]  # by index labels / column names
df = df.iloc[range(2, 6), [0, 2]]  # by row numbers / column numbers
df = df.iloc[2:6, [0, 2]]
df = df.drop(np.arange(2, 11, 3))  # drop rows (fix: original had np.arrange, an AttributeError)
df = df.drop("banana", axis=1)  # drop a column
df = df.sort_values(by="year", ascending=True)
df = df.sort_values(by=["year", "time"], ascending=True)
df = df[df.index % 2 == 0]
df = df.loc[df["apple"] >= 5][df["orange"] >= 6]  # chained boolean filters
# Combining frames
df = pd.concat([df1, df2], axis=0)  # stack vertically
df = pd.concat([df1, df2], axis=1, keys=["X", "Y"])  # side by side, MultiIndex columns
Y_banana = df["Y", "banana"]  # select from the MultiIndex columns
df = pd.merge(df1, df2, on="fruits", how="inner")
df = pd.merge(df1, df2, on="fruits", how="outer")
df = pd.merge(df1, df2, left_on="c_id", right_on="id", how="inner")
df = pd.merge(df1, df2, left_on="c_id", right_index=True, how="inner")
df = df.head(3)
df = df.tail(3)
# Element-wise arithmetic
df = df * 2  # df + df
df = df ** 2  # df * df
df = np.sqrt(df)
df_dec = df.describe().loc[["mean", "max", "min"]]
df_diff = df.diff(-2, axis=0)  # difference from the row 2 positions ahead
df = df.groupby("Region")
mean_df = df.mean()
# Missing values
df.dropna()  # listwise deletion
df[[0, 1, 2]].dropna()  # pairwise deletion
df.fillna(0)
df.fillna(method="ffill")  # forward fill with the previous value
df.fillna(df.mean())
df["xxx"].mean()
df.drop_duplicates()
df["region"] = df["city"].map(city_map)  # lookup via a mapping dict
# Binning
year_bins = [1980, 1985, 1990]
cut_data = pd.cut(df.year, year_bins)
cut_data = pd.cut(df["year"], year_bins)
pd.value_counts(cut_data)
group_name = ["aaa", "bbb"]
cut_data = pd.cut(df.year, year_bins, labels=group_name)
pd.cut(df.year, 2)  # split into 2 equal-width bins
改善
df["sum"] = df.sum(axis=1)
df["max"] = df.max(axis=1)
df["min"] = df.min(axis=1)
df["mean"] = df.mean(axis=1)
# Improved: drive all the per-row aggregations from one loop.
# Fix: the lambda must use the row `x` that apply(axis=1) passes in;
# the original closed over the whole df, yielding a full column per row.
f_diff = lambda x: x["xxx"] - x["yyy"]
func_list = ["sum", "max", "min", "mean", f_diff]
for func in func_list:
    df[func] = df.apply(func, axis=1)  # NOTE: for f_diff the function object itself becomes the column label