GridSearchCV

유방울 2023. 6. 2. 02:16

GridSearchCV : 최고의 hyperparameter를 찾기 위해 사용함
- sklearn.model_selection.GridSearchCV
- (estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
- estimator : 학습 모델
- param_grid : 실행해볼 hyperparameter 목록, dict 객체
- cv : Cross Validation에서 데이터를 나눌 fold 개수, 기본값(None)이면 5-fold로 동작
- verbose : 0(default) : 메시지 출력 안함, 1 : 간단한 메시지, 2 : 하이퍼 파라미터별 메시지 출력
- n_jobs : -1로 지정하면 모든 코어를 사용하므로 탐색 속도가 빨라짐
- p : (GridSearchCV의 인자가 아니라 KNeighborsClassifier의 hyperparameter) 거리를 재는 방법 변경, 1 : 맨해튼 거리, 2 : 유클리드 거리(default)

from sklearn.model_selection import GridSearchCV
# Print the built-in documentation for GridSearchCV.
help(GridSearchCV)

# The rows are ordered by class: 50 samples of each class, one class after another.
# Looking only at the tail shows nothing but 2s; looking only at the head shows nothing but 0s.
df['target'].to_numpy()


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# The target column is grouped by class, so the rows are shuffled first.
# NOTE(review): for a classifier with an integer cv, scikit-learn documents that
# StratifiedKFold is used, so the shuffle may not be strictly required — confirm.
# [11] Using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Shuffle the rows (important); random_state makes the order reproducible.
df2 = df.sample(frac=1, random_state=0)

# Split into features (all but the last column) and label (the last column).
X, Y = df2.iloc[:, :-1], df2.iloc[:, -1]

# Standardize X.
# It is easy to scale and then forget to use the result,
# so be sure scaledX (not X) is what gets passed on below.
# NOTE(review): the scaler is fitted on every row before the CV split, so each
# validation fold influences the scaling statistics; consider a Pipeline.
scaledX = StandardScaler().fit_transform(X)

# Hyperparameter candidates to try: n_neighbors from 3 through 9.
params = dict(n_neighbors=range(3, 10))
# The distance metric can be searched too (p=1 Manhattan, p=2 Euclidean):
# params = {'n_neighbors':range(3,10),'p':[1,2]}

# The estimator to tune: a KNeighborsClassifier with default settings.
model = KNeighborsClassifier()

# Build the search from the model and candidates above;
# cv=5 splits the data into 5 folds for cross-validation.
gs = GridSearchCV(estimator=model, param_grid=params, cv=5)

# Run the search on the scaled features.
gs.fit(scaledX, Y)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(3, 10)})

# List the names available on gs to find its main attributes.
print(dir(gs))


['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_is_fitted', '_check_n_features', '_estimator_type', '_format_results', '_get_param_names', '_get_tags', '_more_tags', '_pairwise', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_validate_data', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'get_params', 'iid', 'inverse_transform', 'multimetric_', 'n_features_in_', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispatch', 'predict', 'predict_log_proba', 'predict_proba', 'refit', 'refit_time_', 'return_train_score', 'score', 'scorer_', 'scoring', 'set_params', 'transform', 'verbose']

그 중 중요 속성

# best_estimator_: the estimator that showed the best performance in the search
'best_estimator_'  # attribute of gs
# best_params_: the parameter setting that produced that best result
'best_params_'
# cv_results_: the overall results for every candidate
'cv_results_'

# The raw results dict is hard to read in this form.
gs.cv_results_

{'mean_fit_time': array([0.00084453, 0.00058208, 0.00051627, 0.00050292, 0.0004921 ,
        0.00048261, 0.00047159]),
 'std_fit_time': array([2.49534648e-04, 2.75299033e-05, 1.28105122e-05, 8.54935914e-06,
        1.01656940e-05, 8.30764741e-06, 5.80685079e-06]),
 'mean_score_time': array([0.00184426, 0.00131168, 0.00124898, 0.00123596, 0.00123997,
        0.00121503, 0.00121059]),
 'std_score_time': array([3.97268647e-04, 6.85710403e-05, 1.17930226e-05, 1.16946683e-05,
        3.59408080e-05, 8.04909033e-06, 1.25613787e-05]),
 'param_n_neighbors': masked_array(data=[3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},
  {'n_neighbors': 7},
  {'n_neighbors': 8},
  {'n_neighbors': 9}],
 'split0_test_score': array([0.96666667, 0.96666667, 1.        , 1.        , 1.        ,
        1.        , 1.        ]),
 'split1_test_score': array([0.86666667, 0.83333333, 0.86666667, 0.86666667, 0.86666667,
        0.9       , 0.86666667]),
 'split2_test_score': array([0.96666667, 0.96666667, 1.        , 1.        , 1.        ,
        0.96666667, 0.96666667]),
 'split3_test_score': array([1., 1., 1., 1., 1., 1., 1.]),
 'split4_test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.93333333,
        0.93333333, 0.93333333]),
 'mean_test_score': array([0.94666667, 0.94666667, 0.96      , 0.96      , 0.96      ,
        0.96      , 0.95333333]),
 'std_test_score': array([0.04521553, 0.05811865, 0.05333333, 0.05333333, 0.05333333,
        0.03887301, 0.04988877]),
 'rank_test_score': array([6, 6, 1, 1, 1, 1, 5], dtype=int32)}
 
 # [12] View the results as a DataFrame
# The rank of each candidate (rank_test_score) is visible here as well.
result = pd.DataFrame(gs.cv_results_)
result


mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_n_neighbors	params	split0_test_score	split1_test_score	split2_test_score	split3_test_score	split4_test_score	mean_test_score	std_test_score	rank_test_score
0	0.000845	0.000250	0.001844	0.000397	3	{'n_neighbors': 3}	0.966667	0.866667	0.966667	1.0	0.933333	0.946667	0.045216	6
1	0.000582	0.000028	0.001312	0.000069	4	{'n_neighbors': 4}	0.966667	0.833333	0.966667	1.0	0.966667	0.946667	0.058119	6
2	0.000516	0.000013	0.001249	0.000012	5	{'n_neighbors': 5}	1.000000	0.866667	1.000000	1.0	0.933333	0.960000	0.053333	1
3	0.000503	0.000009	0.001236	0.000012	6	{'n_neighbors': 6}	1.000000	0.866667	1.000000	1.0	0.933333	0.960000	0.053333	1
4	0.000492	0.000010	0.001240	0.000036	7	{'n_neighbors': 7}	1.000000	0.866667	1.000000	1.0	0.933333	0.960000	0.053333	1
5	0.000483	0.000008	0.001215	0.000008	8	{'n_neighbors': 8}	1.000000	0.900000	0.966667	1.0	0.933333	0.960000	0.038873	1
6	0.000472	0.000006	0.001211	0.000013	9	{'n_neighbors': 9}	1.000000	0.866667	0.966667	1.0	0.933333	0.953333	0.049889	5

# [13] Retrieve the best-scoring model and evaluate it
# NOTE(review): gs was fitted on scaledX, which is built from the whole shuffled
# frame, while x_test / y_test are defined outside this snippet. Presumably they
# overlap the rows used for fitting and must be scaled the same way as scaledX —
# confirm before trusting this score.
model = gs.best_estimator_
test_accuracy = model.score(x_test, y_test)
print(test_accuracy, gs.best_params_)

0.9 {'n_neighbors': 5}

# [14] Get the predicted probability of each class (classification only)
# Alternative that returns class labels instead of probabilities:
# model.predict(x_test)
model.predict_proba(x_test)

array([[1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0.2, 0.8],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.8, 0.2],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0.8, 0.2],
       [0. , 0.2, 0.8],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.2, 0.8],
       [0. , 0.6, 0.4],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])