# Grow the tree by finding the best splitting rule at each node
# Stop when an appropriate stopping rule is satisfied
# For a categorical target variable, the split criterion is the Gini index or entropy
# Pruning step: tree size can be viewed as model complexity, so pruning guards against overfitting
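# A minimal sketch of the two split criteria above, computed for a toy class
# distribution (the class proportions are made up for illustration).
import numpy as np

def gini(p):
    # Gini index: 1 - sum(p_k^2); 0 for a pure node
    p = np.asarray(p)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    # Entropy: -sum(p_k * log2(p_k)); also 0 for a pure node
    p = np.asarray(p)
    p = p[p > 0]  # skip zero proportions to avoid log(0)
    return -np.sum(p * np.log2(p))

print(gini([0.7, 0.3]))     # 0.42
print(entropy([0.7, 0.3]))  # ~0.881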
import pandas as pd
import numpy as np
credit = pd.read_csv('../data/credit_final.csv')
X = credit.drop(['credit.rating'], axis = 1)
y = credit['credit.rating']
X.info()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.3, random_state = 48)
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_clf.score(X_train, y_train)
dt_clf.score(X_test, y_test)
# Train accuracy far exceeds test accuracy: the tree has overfit
dt_prediction = dt_clf.predict(X_test)
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,
                             plot_roc_curve, precision_score, f1_score, recall_score)
confusion_matrix(y_test, dt_prediction)
precision_score(y_test, dt_prediction)
recall_score(y_test, dt_prediction)
f1_score(y_test, dt_prediction)
print(classification_report(y_test, dt_prediction))
plot_roc_curve(dt_clf, X_test, y_test)
roc_auc_score(y_test, dt_clf.predict_proba(X_test)[:, 1])
# Pruning
# Tune the key hyperparameters with a grid search (a cost-complexity pruning sketch follows the grid search below)
dt_clf.get_depth()
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth' : range(2, 16, 1), 'min_samples_leaf' : range(1, 20, 1)}
model_grid_tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
model_grid_tree.fit(X_train, y_train)
model_grid_tree.best_estimator_
model_grid_tree.best_score_ # mean cross-validation score on the training folds
model_grid_tree.score(X_test, y_test)
y_pred = model_grid_tree.predict(X_test)
confusion_matrix(y_test, y_pred)
plot_roc_curve(model_grid_tree, X_test, y_test)
roc_auc_score(y_test, model_grid_tree.predict_proba(X_test)[:,1])
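# Besides constraining depth and leaf size, scikit-learn also supports
# cost-complexity pruning via ccp_alpha; a minimal sketch on the same split
# (the mid-path alpha chosen here is illustrative, not tuned).
path = DecisionTreeClassifier(random_state = 48).cost_complexity_pruning_path(X_train, y_train)
pruned_clf = DecisionTreeClassifier(ccp_alpha = path.ccp_alphas[len(path.ccp_alphas) // 2],
                                    random_state = 48)  # larger alpha -> smaller tree
pruned_clf.fit(X_train, y_train)
pruned_clf.score(X_test, y_test)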
# Decision tree - regression
# Continuous target variable
# Split criterion: the F-statistic from ANOVA, or the reduction in variance
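# A quick illustration of variance reduction: split a toy target vector at a
# hypothetical threshold and compare the weighted child variance to the parent's
# (the numbers below are made up for illustration).
import numpy as np
y_node = np.array([10.0, 12.0, 11.0, 30.0, 32.0, 31.0])  # hypothetical node values
left, right = y_node[:3], y_node[3:]                      # candidate split
weighted_child_var = (len(left) * left.var() + len(right) * right.var()) / len(y_node)
print(y_node.var() - weighted_child_var)  # variance reduction; the tree picks the split that maximizes this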
import pandas as pd
df = pd.read_csv('../data/kc_house_data.csv')
df = df.drop(['id', 'date'], axis = 1)
X = df.drop('price', axis = 1)
y = df['price']
X.info()
X = pd.get_dummies(data = X, columns = ['waterfront'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 48)
# y is continuous, so stratified sampling is not used
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
dt_reg.score(X_train, y_train)
dt_reg.score(X_test, y_test)
# Confirms the model is overfitting
dt_reg.get_depth()
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth' : range(1, 40, 2), 'min_samples_split' : range(2, 40, 2)}
model_grid_tree = GridSearchCV(DecisionTreeRegressor(), param_grid)
model_grid_tree.fit(X_train, y_train)
model_grid_tree.best_estimator_
model_grid_tree.score(X_train, y_train)
model_grid_tree.score(X_test, y_test)
y_pred = model_grid_tree.predict(X_test)
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, y_pred)
import numpy as np
RMSE = np.sqrt(MSE)
# The decision tree's R^2 is 0.69, meaning the model explains about 69% of the variance in price
# RMSE is 202226, so predictions differ from the actual house prices by about 202,226 on average
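# The R^2 quoted above is what .score() returns for regressors; it can also be
# computed explicitly alongside RMSE, reusing y_test and y_pred from above.
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))                     # same value as model_grid_tree.score(X_test, y_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE, in the units of price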
# Visualizing the decision tree
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from IPython.display import Image
import pydot
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import os
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
train_x, test_x, train_y, test_y = train_test_split(X, y, stratify = y, test_size = 0.3, random_state = 48)
iris_clf = DecisionTreeClassifier(max_depth = 5)
iris_clf = iris_clf.fit(train_x, train_y)
iris_prediction = iris_clf.predict(test_x)
feature_columns = iris.feature_names
from sklearn import tree
feature_names = feature_columns
target_name = np.array(['0', '1', '2'])
dt_dot_data = tree.export_graphviz(iris_clf, feature_names = feature_names, class_names = target_name,
                                   filled = True, max_depth = 5)
dt_graph = pydotplus.graph_from_dot_data(dt_dot_data)
Image(dt_graph.create_png())
# Splits are chosen to drive the Gini index toward 0 (pure leaf nodes)
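# If graphviz/pydotplus are unavailable, scikit-learn's built-in plot_tree
# renders the same tree with matplotlib (a minimal sketch).
plt.figure(figsize = (12, 8))
tree.plot_tree(iris_clf, feature_names = feature_names, class_names = target_name,
               filled = True, max_depth = 5)
plt.show()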
# Generate the classification report
class_report_iris = classification_report(test_y, iris_prediction)
print(class_report_iris)
from sklearn.metrics import roc_auc_score
iris_clf.predict_proba(test_x)
roc_auc_score(test_y, iris_clf.predict_proba(test_x), multi_class = 'ovr')
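# multi_class = 'ovr' macro-averages one-vs-rest AUCs across the three classes;
# the per-class values can be inspected by binarizing the labels (a minimal sketch).
from sklearn.preprocessing import label_binarize
proba = iris_clf.predict_proba(test_x)
y_bin = label_binarize(test_y, classes = [0, 1, 2])  # one indicator column per class
for k in range(3):
    print(k, roc_auc_score(y_bin[:, k], proba[:, k]))  # AUC of class k vs the rest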