머신러닝/데이터분석

IQR(사분위수) 기반 이상치 검출 및 해결방안

su0a 2024. 3. 30. 10:49

# Outlier detection helper (IQR rule)

def detect_outliers(dataframe, column, factor=1.5):
    """Return the index labels of rows whose value in ``column`` is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        dataframe: Table holding the data to inspect.
        column: Label of the column to check.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5, the value previously hard-coded here.

    Returns:
        The index labels of the rows flagged as outliers (empty when none).
    """
    # Look the column up once instead of on every use.
    values = dataframe[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Fences of the accepted range; values exactly on a fence are kept.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return values[is_outlier].index

# Run outlier detection on the numeric columns only
numeric_columns = train.select_dtypes(include=[np.number]).columns.tolist()

# Map every numeric column to the index labels detect_outliers flags for it,
# then keep only the columns where at least one row was flagged.
flagged_by_column = {name: detect_outliers(train, name) for name in numeric_columns}
outliers_dict = {
    name: flagged
    for name, flagged in flagged_by_column.items()
    if not flagged.empty
}

 

# Replace outliers with the column median

for column, outlier_indices in outliers_dict.items():
    # Use the median rather than the mean: the extreme values being replaced
    # would otherwise pull the replacement value toward themselves.
    median_value = train[column].median()
    # Write back into `train` itself (the previous target name was a typo and
    # raised NameError).
    # NOTE(review): for an integer-dtype column a fractional median may make
    # pandas warn or refuse the assignment — confirm the column dtypes.
    train.loc[outlier_indices, column] = median_value

 

# Remove outlier rows

# Collect the labels of every row flagged in at least one feature column.
# Each column is checked against the same, not-yet-filtered frame; the rows
# are dropped in a single step afterwards.
rows_to_drop = set().union(
    *(detect_outliers(train_delete_outlier, feature) for feature in features_org)
)
train_delete_outlier = train_delete_outlier.drop(index=rows_to_drop)