Hello, this is 코북.
For about two weeks, a Kaggle competition ran at my academy: the task was predicting whether e-commerce shipments would arrive on time. I took part enthusiastically early on, but I never quite managed to wrap it up properly, so some regret lingers. Still, it was a rewarding chance to put the machine learning models I had just learned into practice.
Below is the data processing source code.
# Problem definition
# Data collection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.read_csv('KaggleCompetition/Train.csv')
test = pd.read_csv('KaggleCompetition/test.csv')
submission = pd.read_csv('KaggleCompetition/sampleSubmission.csv')
# Data preprocessing
train.drop('ID',axis = 1, inplace = True)
test.drop('ID',axis = 1, inplace = True)
sns.countplot(data=train,
              x='Gender',
              hue='Reached.on.Time_Y.N')
plt.show()
train.drop('Gender', axis=1, inplace=True)   # the plot above shows little difference by gender, so drop it
test.drop('Gender', axis=1, inplace=True)
# Filling missing values
train.info()   # quite a few missing values
test.info()
train.corr()
train.describe()
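# A quicker way to see the missing counts per column:
print(train.isnull().sum())
print(test.isnull().sum())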
train['Customer_care_calls'].fillna(4, inplace=True)   # fill values chosen from the describe() output above
test['Customer_care_calls'].fillna(4, inplace=True)
train['Prior_purchases'].fillna(3.6, inplace=True)
test['Prior_purchases'].fillna(3.6, inplace=True)
# Discount_offered has a high standard deviation and an extreme value (65), so use the median
train['Discount_offered'].fillna(7, inplace = True)
test['Discount_offered'].fillna(7, inplace = True)
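# Alternative sketch: compute the median from the data instead of hard-coding it
disc_median = train['Discount_offered'].median()   # the 7 above was read off describe()
train['Discount_offered'].fillna(disc_median, inplace=True)
test['Discount_offered'].fillna(disc_median, inplace=True)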
# There are too many missing values after all, so let's just try dropping the column
train.drop('Discount_offered',axis=1, inplace = True)
test.drop('Discount_offered',axis=1, inplace = True)
train.columns
train['Warehouse_block '].unique()   # note: this column name ends with a trailing space in the dataset
train['Mode_of_Shipment'].unique()
train['Product_importance'].unique()
train['Weight_in_gms'][train['Weight_in_gms']=='?']
train['Mode_of_Shipment'].value_counts()
# Mode_of_Shipment
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Ship','Ship') # strip leading space
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Road','Road') # strip leading space
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Flight','Flight') # strip leading space
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Shipzk','Ship') # fix typo
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Roadzk','Road') # fix typo
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace(' Flightzk','Flight') # fix typo
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].replace('?','Ship') # fill '?' with the most frequent class
train['Mode_of_Shipment'].value_counts()
test['Mode_of_Shipment'].unique()
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Ship','Ship') # strip leading space
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Road','Road') # strip leading space
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Flight','Flight') # strip leading space
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Shipzk','Ship') # fix typo
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Roadzk','Road') # fix typo
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace(' Flightzk','Flight') # fix typo
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].replace('?','Ship') # fill '?' with the most frequent class
test['Mode_of_Shipment'].value_counts()
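# The repeated replace() calls above can be collapsed into one mapping applied to
# both frames (a sketch assuming the same set of typos and '?' markers):
shipment_map = {' Ship': 'Ship', ' Road': 'Road', ' Flight': 'Flight',
                ' Shipzk': 'Ship', ' Roadzk': 'Road', ' Flightzk': 'Flight',
                '?': 'Ship'}
for df in (train, test):
    df['Mode_of_Shipment'] = df['Mode_of_Shipment'].replace(shipment_map)
# (the same pattern would work for Product_importance below)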
train['Product_importance'].value_counts()
# Product_importance
train['Product_importance'] = train['Product_importance'].replace('loww','low') # fix typo
train['Product_importance'] = train['Product_importance'].replace('highh','high') # fix typo
train['Product_importance'] = train['Product_importance'].replace('mediumm','medium') # fix typo
train['Product_importance'] = train['Product_importance'].replace('?','low') # fill '?' with the most frequent class
train['Product_importance'].value_counts()
test['Product_importance'] = test['Product_importance'].replace('loww','low') # fix typo
test['Product_importance'] = test['Product_importance'].replace('highh','high') # fix typo
test['Product_importance'] = test['Product_importance'].replace('mediumm','medium') # fix typo
test['Product_importance'] = test['Product_importance'].replace('?','low') # fill '?' with the most frequent class
test['Product_importance'].value_counts()
# Weight_in_gms
train['Weight_in_gms'].unique()
train['Weight_in_gms'].value_counts()
train['Weight_in_gms'][train['Weight_in_gms']=='?']
# '?' can't be cast to a number, so temporarily replace it with 0
train['Weight_in_gms'] = train['Weight_in_gms'].replace('?',0)
test['Weight_in_gms'] = test['Weight_in_gms'].replace('?',0)
train['Weight_in_gms'][train['Weight_in_gms']=='?']
# convert to numeric
train['Weight_in_gms'] = pd.to_numeric(train['Weight_in_gms'])
test['Weight_in_gms'] = pd.to_numeric(test['Weight_in_gms'])
train.describe()
# descriptive statistics excluding the 0 placeholders
train['Weight_in_gms'][train['Weight_in_gms']!=0].describe()
# replace the temporary 0s with the mean computed excluding zeros
train['Weight_in_gms'] = train['Weight_in_gms'].replace(0,3657)
test['Weight_in_gms'] = test['Weight_in_gms'].replace(0,3657)
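# The 3657 above is that zero-excluded mean; a sketch computing it rather than typing it:
weight_mean = train['Weight_in_gms'][train['Weight_in_gms'] != 0].mean()
train['Weight_in_gms'] = train['Weight_in_gms'].replace(0, round(weight_mean))
test['Weight_in_gms'] = test['Weight_in_gms'].replace(0, round(weight_mean))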
train['Customer_rating'].value_counts()
train['Customer_rating'] = train['Customer_rating'].replace(99,3) # 99 is a sentinel outside the rating scale; use the middle rating
train['Customer_rating'].value_counts()
test['Customer_rating'] = test['Customer_rating'].replace(99,3)
test['Customer_rating'].value_counts()
# Removing outliers
train['Cost_of_the_Product'][train['Cost_of_the_Product']==9999]
# mean excluding the outlier
train['Cost_of_the_Product'][train['Cost_of_the_Product']!=9999].mean()
# replace the outlier with the mean
train['Cost_of_the_Product'] = train['Cost_of_the_Product'].replace(9999,210)
test['Cost_of_the_Product'] = test['Cost_of_the_Product'].replace(9999,210)
# Exploratory data analysis
sns.countplot(data=train,
              x='Warehouse_block ',
              hue='Reached.on.Time_Y.N')
plt.show()
sns.countplot(data=train,
              x='Mode_of_Shipment',
              hue='Reached.on.Time_Y.N')
plt.show()
plt.figure(figsize=(28,5))
sns.countplot(data=train,
              x='Product_importance',
              hue='Reached.on.Time_Y.N')
plt.show()
plt.figure(figsize=(28,5))
sns.countplot(data=train,
              x='Customer_care_calls',
              hue='Reached.on.Time_Y.N')
plt.show()
plt.figure(figsize=(28,5))
sns.countplot(data=train,
              x='Customer_rating',
              hue='Reached.on.Time_Y.N')
plt.show()
plt.figure(figsize=(28,5))
sns.countplot(data=train,
              x='Prior_purchases',
              hue='Reached.on.Time_Y.N')
plt.show()
# Recheck correlations after filling the missing values
train.corr()
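# The correlation matrix is easier to scan as a heatmap (sketch; numeric columns only):
plt.figure(figsize=(10, 8))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()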
pt1 = train.pivot_table(values='Customer_care_calls',
                        index=['Weight_in_gms'],
                        aggfunc='mean')
pt1
pt2 = train.pivot_table(values='Weight_in_gms',
                        index=['Customer_care_calls'],
                        aggfunc='mean')
pt2
# Modeling
categorical_features = ['Warehouse_block ', 'Mode_of_Shipment', 'Product_importance']
categorical_features
train['Warehouse_block ']
pd.get_dummies(train['Warehouse_block '], prefix = 'Warehouse_block ')
for feature_name in categorical_features:
    one_hot = pd.get_dummies(train[feature_name], prefix=feature_name)  # one-hot encoding
    train.drop(feature_name, axis=1, inplace=True)  # drop the original string column
    train = pd.concat([train, one_hot], axis=1)  # append the encoded numeric columns to train
for feature_name in categorical_features:
    one_hot = pd.get_dummies(test[feature_name], prefix=feature_name)
    test.drop(feature_name, axis=1, inplace=True)
    test = pd.concat([test, one_hot], axis=1)
train.shape, test.shape
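# One-hot encoding train and test separately can leave mismatched columns if a
# category shows up in only one frame; a quick check (sketch):
train_only = set(train.columns) - set(test.columns) - {'Reached.on.Time_Y.N'}
test_only = set(test.columns) - set(train.columns)
print(train_only, test_only)   # both should be empty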
# First, separate the features and the target in train
X = train.drop('Reached.on.Time_Y.N', axis=1) # split off the target by dropping it
y = train['Reached.on.Time_Y.N']
X.shape, y.shape
# Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
X_train.shape, y_train.shape
X_test.shape, y_test.shape
# Model selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
knn_model = KNeighborsClassifier()
tree_model = DecisionTreeClassifier()
# Model training and evaluation
knn_model.fit(X_train,y_train)
tree_model.fit(X_train,y_train)
knn_model.score(X_test,y_test)
tree_model.score(X_test,y_test)
# Cross-validation
from sklearn.model_selection import cross_val_score
result = cross_val_score(knn_model,X_train,y_train)
result.mean()
tree_result = cross_val_score(tree_model,X_train,y_train)
tree_result.mean()
# Hyperparameter tuning
knn_list = []
for k in range(1, 150, 2):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    result = knn_model.score(X_test, y_test)
    knn_list.append(result)
# Visualization
plt.figure(figsize=(28,5))
plt.plot(range(1,150,2), knn_list)
plt.xticks(range(1,150,2))
plt.grid()
plt.xlabel('n_neighbors')
plt.ylabel('knn_score')
plt.show()
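# Read the best k off the sweep instead of eyeballing the plot (sketch):
ks = list(range(1, 150, 2))
best_k = ks[int(np.argmax(knn_list))]
print(best_k, max(knn_list))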
# RandomForest (ensemble)
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(max_depth=5, max_features=0.7, min_samples_leaf=10,
                                  n_estimators=1000, n_jobs=-1)
rf_model
rf_model.fit(X_train,y_train)
rf_pre = cross_val_score(rf_model, X_train, y_train, cv = 5)
rf_pre.mean()
# RandomForest - Grid Search
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators': [1000, 2000],
    'max_depth': [3, 5, 7],
    'max_features': [0.4, 0.5, 0.6],
    'min_samples_leaf': [10, 20, 30],
    'n_jobs': [-1]
}
grid = GridSearchCV(rf_model,params,cv=3, verbose=2)
grid.fit(X_train, y_train)
grid.best_params_
# {'max_depth': 5,
# 'max_features': 0.7,
# 'min_samples_leaf': 10,
# 'n_estimators': 1000,
# 'n_jobs': -1}
grid.best_score_
rf_best_model = grid.best_estimator_
rf_best_cross = cross_val_score(rf_best_model, X_train, y_train, cv=3)
rf_best_cross.mean()
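# Sanity check: which features the tuned forest actually relies on (sketch):
importances = pd.Series(rf_best_model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))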
rf_best_model.predict(test)
submission['Reached.on.Time_Y.N'] = rf_best_model.predict(test)
submission
submission.to_csv('KaggleCompetition/rf_grid12.csv', index = False)
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(n_estimators=1000,
                                      max_depth=5,
                                      learning_rate=0.1)  # how strongly each new tree corrects the previous errors
gb_model.fit(X_train, y_train)
print(gb_model.score(X_train,y_train))
print(gb_model.score(X_test,y_test))
print(cross_val_score(gb_model,X_train,y_train, cv=5).mean())
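# With 1000 deep trees, gradient boosting overfits easily; sklearn can stop adding
# trees once a held-out fraction stops improving (sketch, assuming sklearn >= 0.20):
gb_es = GradientBoostingClassifier(n_estimators=1000, max_depth=5, learning_rate=0.1,
                                   validation_fraction=0.1, n_iter_no_change=10)
gb_es.fit(X_train, y_train)
print(gb_es.n_estimators_, gb_es.score(X_test, y_test))   # trees actually grown, held-out score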
# XGB
from xgboost import XGBClassifier
xgb_model = XGBClassifier(booster='gbtree',  # tree-based booster
                          objective='binary:logistic',
                          n_estimators=1000,
                          max_depth=5,
                          learning_rate=0.01)
xgb_model.fit(X_train,y_train)
print(xgb_model.score(X_train,y_train))
print(xgb_model.score(X_test,y_test))
print(cross_val_score(xgb_model,X_train,y_train, cv=5).mean())
params3 = {
    'booster': ['gbtree'],
    'objective': ['binary:logistic'],
    'n_estimators': [1000, 2000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_jobs': [-1]
}
grid3 = GridSearchCV(xgb_model,params3,cv=3, verbose=2)
grid3.fit(X_train, y_train)
grid3.best_params_
grid3.best_score_
grid_best_model = grid3.best_estimator_
grid_best_cross = cross_val_score(grid_best_model, X_train, y_train, cv=3)
grid_best_cross.mean()
submission['Reached.on.Time_Y.N'] = grid_best_model.predict(test)
submission
submission.to_csv('KaggleCompetition/xgb2.csv', index = False)
# SVM
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)
print(svc.score(X_train,y_train))
print(svc.score(X_test,y_test))
cross_val_score(svc,X_train,y_train).mean()
svc_params = {
    'C': [0.1, 1, 10],
    'gamma': [0.0001, 0.01, 1, 10]
}
grid = GridSearchCV(svc,svc_params,cv=3, verbose=2)
grid.fit(X_train, y_train)
grid.best_params_
grid.best_score_
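# SVMs are sensitive to feature scale, which motivates the scaling below; a Pipeline
# keeps the scaler and the model together so each CV fold is scaled correctly (sketch):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svc_pipe = make_pipeline(StandardScaler(), SVC())
print(cross_val_score(svc_pipe, X_train, y_train, cv=3).mean())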
# Using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_data = train[['Weight_in_gms', 'Cost_of_the_Product']]   # continuous columns to scale
scale_data2 = test[['Weight_in_gms', 'Cost_of_the_Product']]
scaler.fit(scale_data)
train[['Weight_in_gms', 'Cost_of_the_Product']] = scaler.transform(scale_data)
test[['Weight_in_gms', 'Cost_of_the_Product']] = scaler.transform(scale_data2)
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)
print(lr_model.score(X_train,y_train))
print(lr_model.score(X_test,y_test))
cross_val_score(lr_model, X_train, y_train, cv=5).mean()
submission['Reached.on.Time_Y.N'] = lr_model.predict(test)
submission
submission.to_csv('KaggleCompetition/lr_model1.csv', index = False)
What I learned
Because I kept processing the data with the sole goal of pushing the Train prediction score higher, I ended up with overfitted predictions and a much lower Test score. Once the Train score looked good, I assumed I was close to the answer and barely changed my data processing approach; next time I need to look at the data from a wider range of perspectives.
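One habit that would have caught the overfitting earlier is printing the train score next to a held-out score for every model, since a wide gap between the two is the warning sign. A minimal sketch of that check, reusing the models fitted above:
for name, model in [('rf', rf_best_model), ('gb', gb_model), ('xgb', xgb_model)]:
    print(name, model.score(X_train, y_train), model.score(X_test, y_test))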