
[Linear Regression] Boston Housing Data

by 에디터 윤슬 2024. 11. 27.

Data EDA
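The snippets below assume the Boston housing data has already been loaded into a pandas DataFrame df containing the 13 feature columns plus the target MEDV, and that pandas, NumPy, Matplotlib, and seaborn are imported. A minimal setup sketch; the CSV file name is a hypothetical placeholder:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Hypothetical file: a CSV with the 13 Boston features plus the target column MEDV
df = pd.read_csv('boston_housing.csv')
df.head()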

 

  • Visualize the distribution of each feature with histograms
plt.figure(figsize = (20, 15))
plotnumber = 1

for column in df:
    if plotnumber <= 14:
        ax = plt.subplot(3, 5, plotnumber)
        sns.histplot(df[column], kde=True)
        plt.xlabel(column, fontsize = 12)
        
    plotnumber += 1
    
plt.tight_layout()
plt.show()

 

  • Visualize each feature against MEDV with scatterplots
plt.figure(figsize=(20, 15))
plotnumber = 1

for column in df:
    if plotnumber <= 14:
        ax = plt.subplot(3, 5, plotnumber)
        sns.scatterplot(x = df['MEDV'], y = df[column])
        
    plotnumber += 1
    
plt.tight_layout()
plt.show()

 

  • Visualize the data with boxplots
plt.figure(figsize=(20, 8))
sns.boxplot(data = df, width=0.8)
plt.show()
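The boxplots flag points beyond the whiskers as potential outliers. A small sketch (not in the original post) that counts such points per column using the usual 1.5 × IQR rule:

# Count values outside Q1 - 1.5*IQR and Q3 + 1.5*IQR for each column
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
outlier_counts = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).sum()
outlier_counts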

Data Preprocessing Before Linear Regression

  • Declare X and y
  • X = every column except the price (MEDV)
  • y = price (MEDV)
X = df.drop(columns = 'MEDV', axis = 1)
y = df['MEDV']

 

  • Standardize the features (StandardScaler)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
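As a quick sanity check (not in the original post), every scaled column should now have mean ≈ 0 and standard deviation ≈ 1:

# X_scaled is a NumPy array; wrap it in a DataFrame to inspect per-column statistics
pd.DataFrame(X_scaled, columns=X.columns).describe().loc[['mean', 'std']].round(2)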

 

  • Check for multicollinearity (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()

vif['VIF'] = [variance_inflation_factor(X_scaled, i) for i in range(X_scaled.shape[1])]
vif['Features'] = X.columns

vif

  • Drop 'TAX', which has a high VIF (a VIF above 10 is a common rule of thumb for severe multicollinearity)
X = X.drop(columns = ['TAX'], axis = 1)
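To verify that dropping TAX actually relieves the multicollinearity, the VIFs can be recomputed on the reduced feature set. A sketch that uses local names only, so the X_scaled used later in the post is left untouched:

# Re-scale the reduced feature set and recompute the VIFs
X_scaled_reduced = StandardScaler().fit_transform(X)

vif_reduced = pd.DataFrame()
vif_reduced['VIF'] = [variance_inflation_factor(X_scaled_reduced, i)
                      for i in range(X_scaled_reduced.shape[1])]
vif_reduced['Features'] = X.columns
vif_reduced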

 

  • Use a heatmap to confirm the strongly correlated feature pairs
# Heatmap
# Check the correlations and drop the less influential column from each highly correlated pair

fig, ax = plt.subplots(figsize = (16, 8))
sns.heatmap(df.corr(), annot=True, fmt = '1.2f', annot_kws={'size': 10}, linewidths=1)
plt.show()

 

  • Use simple OLS (ordinary least squares) regressions to check which of the highly correlated pair, RAD and TAX, actually explains MEDV better
import statsmodels.formula.api as smf

lm = smf.ols(formula = 'MEDV ~ RAD', data = df).fit()
lm.summary()

lm = smf.ols(formula = 'MEDV ~ TAX', data = df).fit()
lm.summary()
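Instead of reading the two full summaries, the comparison can be reduced to the R² values alone; a small convenience sketch, not in the original:

# .rsquared holds the R² of a fitted OLS model
print(smf.ols(formula='MEDV ~ RAD', data=df).fit().rsquared)
print(smf.ols(formula='MEDV ~ TAX', data=df).fit().rsquared)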

 

  • Drop the RAD column, which has the lower R²
# Drop the RAD column

df.drop(columns = 'RAD', axis = 1, inplace = True)
df.head()

Model Training

# Split into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)

 

# Fit the linear regression model

from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

 

# Predict on the test set

y_pred_lr = model_lr.predict(X_test)

# Measure accuracy (R² score)

model_lr.score(X_train, y_train)
0.7927657852108868

model_lr.score(X_test, y_test)
0.6873411644615656
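The predictions stored in y_pred_lr can also be scored with the usual error metrics; a short sketch using sklearn.metrics, not part of the original post:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics on the held-out test set
print('MAE :', mean_absolute_error(y_test, y_pred_lr))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print('R²  :', r2_score(y_test, y_pred_lr))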

# Adjusted R²

def adj_r2(X, y, model):
    # Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1),
    # where n = number of samples and p = number of predictors
    r2 = model.score(X, y)
    n = X.shape[0]
    p = X.shape[1]
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    return adjusted_r2

print(adj_r2(X_train, y_train, model_lr))
print(adj_r2(X_test, y_test, model_lr))
0.7824437745125785
0.6486310229187118

Lasso Regression

from sklearn.linear_model import Lasso, LassoCV

lasso_cv = LassoCV(alphas= None, cv = 10, max_iter=100000)
lasso_cv.fit(X_train, y_train)

# best alpha parameter

alpha = lasso_cv.alpha_
alpha

lasso = Lasso(alpha = lasso_cv.alpha_)
lasso.fit(X_train, y_train)

lasso.score(X_train, y_train)
0.7921168665495498

lasso.score(X_test, y_test)
0.6860187913255504

print(adj_r2(X_train, y_train, lasso))
0.7817625342320944

print(adj_r2(X_test, y_test, lasso))
0.6471449273944281
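One advantage of the L1 penalty is that it can drive some coefficients exactly to zero; a quick look at the fitted coefficients (a sketch, not in the original post):

# Coefficients shrunk exactly to zero are features the L1 penalty has effectively dropped
print(lasso.coef_)
print('Zeroed coefficients:', (lasso.coef_ == 0).sum())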

 

Ridge Regression

from sklearn.linear_model import Ridge, RidgeCV

alphas = np.random.uniform(0, 10, 50)
ridge_cv = RidgeCV(alphas = alphas, cv = 10)
ridge_cv.fit(X_train, y_train)

# best alpha parameter

alpha = ridge_cv.alpha_
alpha
9.698258757421916

ridge = Ridge(alpha = ridge_cv.alpha_)
ridge.fit(X_train, y_train)

print(ridge.score(X_train, y_train))
0.7909631831654819

print(ridge.score(X_test, y_test))
0.6854831727171677

print(adj_r2(X_train, y_train, ridge))
0.7805513876909657

print(adj_r2(X_test, y_test, ridge))
0.646542994101198
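Finally, the three models can be lined up side by side on the test set. A small summary sketch built from the objects fitted above, not part of the original post:

# Test-set R² and adjusted R² for each of the three fitted models
summary = pd.DataFrame({
    'R2 (test)': [model_lr.score(X_test, y_test),
                  lasso.score(X_test, y_test),
                  ridge.score(X_test, y_test)],
    'Adjusted R2 (test)': [adj_r2(X_test, y_test, model_lr),
                           adj_r2(X_test, y_test, lasso),
                           adj_r2(X_test, y_test, ridge)],
}, index=['Linear', 'Lasso', 'Ridge'])

summary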