# A method that generates weak learners with more randomness than bagging or boosting,
# then linearly combines them into a final ensemble
# Because thousands of variables can be modeled without variable elimination, it tends to
# perform well in terms of accuracy
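# The idea can be sketched by hand: fit each tree on a bootstrap sample and a random
# subset of columns, then average the predictions. This is only a simplified illustration
# (scikit-learn's random forest samples features at every split, not once per tree), and
# tiny_forest_predict is a name made up for this sketch.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def tiny_forest_predict(X_train, y_train, X_test, n_trees = 10, max_features = 0.5, seed = 0):
    rng = np.random.default_rng(seed)
    n_cols = max(1, int(max_features * X_train.shape[1]))
    preds = []
    for _ in range(n_trees):
        rows = rng.integers(0, len(X_train), size = len(X_train))          # bootstrap sample (with replacement)
        cols = rng.choice(X_train.columns, size = n_cols, replace = False)  # random feature subset
        tree = DecisionTreeRegressor(random_state = 0)
        tree.fit(X_train.iloc[rows][cols], y_train.iloc[rows])
        preds.append(tree.predict(X_test[cols]))
    return np.mean(preds, axis = 0)                                         # averaging = linear combination of weak learners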

import pandas as pd
df = pd.read_csv('../data/kc_house_data.csv')
df = df.drop(['id', 'date'], axis = 1)

X = df.drop('price', axis = 1)
y = df['price']

# One-hot encode the categorical waterfront indicator
X = pd.get_dummies(data = X, columns = ['waterfront'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 48)
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(n_estimators=50)
reg = reg.fit(X_train, y_train)

# Predictions on the held-out test set
Y_pred = reg.predict(X_test)

# Train R² score (score() on a regressor returns R², not classification accuracy)
reg.score(X_train, y_train)

# Test R² score
reg.score(X_test, y_test)
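
# The held-out predictions in Y_pred can also be summarized with error metrics;
# a short sketch using scikit-learn's standard regression metrics (RMSE and MAE)
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(y_test, Y_pred) ** 0.5  # root mean squared error
mae = mean_absolute_error(y_test, Y_pred)         # mean absolute error
print(rmse, mae)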
# Check feature importances
import numpy as np
# Average importance of each feature across the individual trees
importances = pd.DataFrame(np.mean([tree.feature_importances_ for tree in reg.estimators_], axis = 0))
feature_importances = pd.concat([pd.DataFrame(X.columns), importances], axis = 1)
feature_importances.columns = ['col_name', 'feature_importance']
feature_importances = feature_importances.sort_values(by = 'feature_importance', ascending = True)
feature_importances
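
# A horizontal bar chart makes the importance table easier to scan; a minimal sketch
# with matplotlib, using the col_name / feature_importance columns defined above
import matplotlib.pyplot as plt

feature_importances.plot.barh(x = 'col_name', y = 'feature_importance', figsize = (8, 6), legend = False)
plt.xlabel('feature importance')
plt.tight_layout()
plt.show()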
# Random forest classification
# price is continuous, so it is binarized here (above vs. below the median price) to give
# the classifier a categorical target; the split is redone so y_train / y_test hold those labels
from sklearn.ensemble import RandomForestClassifier
y = (df['price'] > df['price'].median()).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 48)
clf = RandomForestClassifier() # default is n_estimators = 100
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy
clf.score(X_test, y_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision
precision_score(y_test, y_pred)

# Recall
recall_score(y_test, y_pred)

# F1 score
f1_score(y_test, y_pred)

# Confusion matrix
pd.DataFrame(confusion_matrix(y_test, y_pred))
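
# The precision, recall, and F1 values above can also be read off in one table
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))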
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

# ROC curve (RocCurveDisplay.from_estimator replaces the plot_roc_curve helper
# that was removed in scikit-learn 1.2)
RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.show()
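
# Alongside the curve, the area under the ROC curve can be computed from the
# classifier's predicted probability of the positive class
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])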
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth' : range(2,10,2), 'min_samples_leaf' : range(2, 10, 2)}
model_grid_rf = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
model_grid_rf.fit(X_train, y_train)

# Individual trees of the best forest found by the grid search
model_grid_rf.best_estimator_.estimators_
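
# The grid search object also exposes the winning hyperparameter combination
# and its mean cross-validated accuracy
print(model_grid_rf.best_params_)
print(model_grid_rf.best_score_)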

import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(model_grid_rf.best_estimator_, X_test, y_test)
plt.show()
