Python/IBA

회귀모델링

유방울 2023. 5. 8. 19:38
# Independent variables: drop the target column 'price' from the training data.
X_train = train.drop(['price'], axis = 1)
# Dependent variable: the 'price' column is what the model will predict.
y_train = train['price']

# The submission set keeps its features but drops the 'id' column, which is
# only an identifier and carries no predictive information.
# NOTE(review): the original code stored this in X_test, which was then
# immediately overwritten by train_test_split below — the preprocessed
# submission set was silently lost. Renamed so it stays available.
X_submission = test.drop('id', axis = 1)

# Hold out 30% of the training data as a validation split (fixed seed for
# reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
# Categorical (object-dtype) columns that need integer encoding.
object_features = ['brand', 'model', 'transmission', 'fuelType']

for feature in object_features:
    # Fit the encoder on the training split only, then derive an explicit
    # value -> code mapping. The original code mutated le.classes_ with
    # np.append to handle unseen labels; that breaks LabelEncoder's
    # sorted-classes invariant and can silently mis-encode values on
    # sklearn versions whose transform() relies on a sorted classes_ array.
    le = LabelEncoder()
    le = le.fit(X_train[feature])
    mapping = {cls: idx for idx, cls in enumerate(le.classes_)}
    X_train[feature] = X_train[feature].map(mapping)

    # Values that appear in the test split but were never seen during
    # training get fresh codes appended after the training codes, so the
    # training encoding is left untouched.
    for label in np.unique(X_test[feature]):
        if label not in mapping:
            print(label)
            mapping[label] = len(mapping)
    X_test[feature] = X_test[feature].map(mapping)

선형회귀

# Linear regression baseline.
from sklearn.linear_model import LinearRegression
# r2_score was used below but never imported in the visible code — added here.
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression() # instantiate the model
model.fit(X_train, y_train) # fit on the training split
y_pred = model.predict(X_test) # predict on the held-out validation split

# RMSE on the original price scale, plus R^2 as a scale-free goodness of fit.
print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 :", r2_score(y_test, y_pred))

교차검증 & 하이퍼 파라미터 튜닝

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold

# Ridge regression estimator to be tuned.
model = Ridge()

# Candidate hyperparameter values to search over.
search_space = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'max_iter': [100, 500, 1000],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

# 5-fold cross-validation splitter with a fixed seed for reproducibility.
splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid search; negated MSE is maximized, so the best (lowest)
# MSE corresponds to the highest score.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=search_space,
    scoring='neg_mean_squared_error',
    cv=splitter,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated RMSE.
best_rmse = np.sqrt(-grid_search.best_score_)
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', best_rmse)

최종 예측 및 평가

# Final model: use the best estimator found by the grid search. GridSearchCV
# with the default refit=True has already refit it on the full X_train, so no
# extra fit call is needed. (The original code refit a fresh, untuned Ridge()
# and discarded the tuning results.)
model = grid_search.best_estimator_

# Evaluate on the held-out validation split. The original code compared the
# predictions for X_test against y_train — a length-mismatched, wrong target;
# the metrics must use y_test.
y_pred = model.predict(X_test)
print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 :", r2_score(y_test, y_pred))