데이터 사이언스/데이터 전처리
2. 이상치(outlier)
Merware
2023. 5. 22. 16:07
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load the "titanic" example dataset through seaborn.
titanic = sns.load_dataset("titanic")

# Preview the first five rows.
titanic.head()
# Output:
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# 0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
# 1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
# 2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
# 3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
# 4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True

# Summary statistics of the numeric columns.  In the output below, the
# maximum fare (512.33) is far above the 75th percentile (31.0), which is
# what motivates the outlier handling in the rest of the file.
titanic.describe()
# Output:
# survived pclass age sibsp parch fare
# count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
# mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
# std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
# min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
# 25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
# 50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
# 75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
# max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
# X-axis labels for passenger classes 1, 2 and 3, in that order.
class_names = ['first', 'second', 'third']

# Fare distribution of each passenger class, drawn side by side.
plt.figure(figsize=(12, 5))
for i, class_name in enumerate(class_names):
    ax = plt.subplot(1, 3, i + 1)
    # histplot(kde=True, stat="density") is the replacement for the
    # deprecated sns.distplot; kde_kws cut=3 mirrors distplot's KDE default.
    sns.histplot(
        titanic[titanic.pclass == (i + 1)]['fare'],
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=ax,
    )
    # distplot's axlabel= argument becomes an explicit x-axis label.
    ax.set_xlabel(class_name)
plt.tight_layout()
plt.show()
# Horizontal box plot of fare, one box per passenger class.
_, box_ax = plt.subplots(figsize=(16, 6))
sns.boxplot(data=titanic, x="fare", y="class", orient="h", ax=box_ax)
box_ax.grid()
plt.show()
# Swarm plot of fare by passenger class on the raw data.
# catplot is a figure-level function that creates its own figure, so the
# 8x6 inch size is requested through height/aspect; a preceding
# plt.figure(figsize=...) would only leave an extra empty figure behind.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic, height=6, aspect=8 / 6)
plt.grid()
plt.show()
# Ten highest fares, largest first.
pd.DataFrame(titanic.fare.sort_values(ascending=False).head(10))
# Output:
# fare
# 258 512.3292
# 737 512.3292
# 679 512.3292
# 88 263.0000
# 27 263.0000
# 341 263.0000
# 438 263.0000
# 311 262.3750
# 742 262.3750
# 118 247.5208
# Ten lowest fares, smallest first.
pd.DataFrame(titanic.fare.sort_values(ascending=True).head(10))
# Output:
# fare
# 271 0.0
# 597 0.0
# 302 0.0
# 633 0.0
# 277 0.0
# 413 0.0
# 674 0.0
# 263 0.0
# 466 0.0
# 732 0.0
# Approach 1: overwrite the extreme value by hand on a copy of the data.
titanic2 = titanic.copy()
# Fares above 512 (the 512.3292 entries in the top-10 listing) are replaced
# with 263, the next-highest fare in that listing.
titanic2.loc[titanic2.fare > 512, 'fare'] = 263

# catplot is figure-level and creates its own figure, so the 8x6 inch size is
# set through height/aspect rather than a preceding plt.figure(), which would
# only leave an extra empty figure behind.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic2, height=6, aspect=8 / 6)
plt.grid()
plt.show()
# Approach 2 works on its own copy; plot it first, before any fare is changed.
titanic3 = titanic.copy()

# catplot is figure-level and creates its own figure, so the 8x6 inch size is
# set through height/aspect rather than a preceding plt.figure(), which would
# only leave an extra empty figure behind.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic3, height=6, aspect=8 / 6)
plt.grid()
plt.show()
def get_bound(series, whisker=1.5):
    """Return the lower and upper IQR fences of *series*.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        series: Array-like of numeric values. NaN entries are ignored when
            the percentiles are computed.
        whisker: Multiplier applied to the IQR; defaults to 1.5.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    # nanpercentile skips missing values; plain np.percentile would turn a
    # single NaN into NaN fences.
    quartile_1, quartile_3 = np.nanpercentile(series, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * whisker)
    upper_bound = quartile_3 + (iqr * whisker)
    return lower_bound, upper_bound
# Fares of each passenger class, taken before any value is modified.
class_1, class_2, class_3 = (
    titanic3.loc[titanic3.pclass == pclass, 'fare'] for pclass in (1, 2, 3)
)

# Per-class fences returned by get_bound.
class_1_lower, class_1_upper = get_bound(class_1)
class_2_lower, class_2_upper = get_bound(class_2)
class_3_lower, class_3_upper = get_bound(class_3)

# Replace fares outside a class's fences with the nearest fence value.
class_fences = {
    1: (class_1_lower, class_1_upper),
    2: (class_2_lower, class_2_upper),
    3: (class_3_lower, class_3_upper),
}
for pclass, (fence_low, fence_high) in class_fences.items():
    titanic3.loc[(titanic3.pclass == pclass) & (titanic3.fare < fence_low), 'fare'] = fence_low
    titanic3.loc[(titanic3.pclass == pclass) & (titanic3.fare > fence_high), 'fare'] = fence_high
# Swarm plot of titanic3 fares by passenger class.
# catplot is figure-level and creates its own figure, so the 8x6 inch size is
# set through height/aspect rather than a preceding plt.figure(), which would
# only leave an extra empty figure behind.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic3, height=6, aspect=8 / 6)
plt.grid()
plt.show()
# Approach 3: flag rows whose fare lies more than three standard deviations
# away from the mean fare of their passenger class.
titanic4 = titanic.copy()

# Mean fare of each passenger class.
class_1_mean = titanic4[titanic4.pclass == 1]['fare'].mean()
class_2_mean = titanic4[titanic4.pclass == 2]['fare'].mean()
class_3_mean = titanic4[titanic4.pclass == 3]['fare'].mean()

# NOTE: the spread is the standard deviation of ALL fares, not a per-class
# one; it is computed once so the three queries share the same threshold.
fare_std = titanic4.fare.std()

# Class 1 rows beyond the threshold.
titanic4[(titanic4.pclass == 1) & (np.abs(titanic4.fare - class_1_mean) > 3 * fare_std)]
# Output:
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# 27 0 1 male 19.0 3 2 263.0000 S First man True C Southampton no False
# 88 1 1 female 23.0 3 2 263.0000 S First woman False C Southampton yes False
# 118 0 1 male 24.0 0 1 247.5208 C First man True B Cherbourg no False
# 258 1 1 female 35.0 0 0 512.3292 C First woman False NaN Cherbourg yes True
# 299 1 1 female 50.0 0 1 247.5208 C First woman False B Cherbourg yes False
# 311 1 1 female 18.0 2 2 262.3750 C First woman False B Cherbourg yes False
# 341 1 1 female 24.0 3 2 263.0000 S First woman False C Southampton yes False
# 438 0 1 male 64.0 1 4 263.0000 S First man True C Southampton no False
# 679 1 1 male 36.0 0 1 512.3292 C First man True B Cherbourg yes False
# 737 1 1 male 35.0 0 0 512.3292 C First man True B Cherbourg yes True
# 742 1 1 female 21.0 2 2 262.3750 C First woman False B Cherbourg yes False

# Class 2 rows beyond the threshold.
titanic4[(titanic4.pclass == 2) & (np.abs(titanic4.fare - class_2_mean) > 3 * fare_std)]
# Output (header only, no rows):
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone

# Class 3 rows beyond the threshold.
titanic4[(titanic4.pclass == 3) & (np.abs(titanic4.fare - class_3_mean) > 3 * fare_std)]
# Output (header only, no rows):
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# Drop every row whose fare is more than three (overall) standard deviations
# away from its class mean.
# The threshold is taken once, before any row is removed: recomputing
# titanic4.fare.std() after each reassignment would base the class 2 and
# class 3 cut-offs on data that had already been filtered.
fare_std = titanic4.fare.std()
# Mean fare matching each row's passenger class.
row_class_mean = titanic4.pclass.map({1: class_1_mean, 2: class_2_mean, 3: class_3_mean})
is_fare_outlier = np.abs(titanic4.fare - row_class_mean) > 3 * fare_std
# Negating the mask (rather than testing <=) keeps rows with a missing fare.
titanic4 = titanic4[~is_fare_outlier]
# Swarm plot of titanic4 fares by passenger class.
# catplot is figure-level and creates its own figure, so the 8x6 inch size is
# set through height/aspect rather than a preceding plt.figure(), which would
# only leave an extra empty figure behind.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic4, height=6, aspect=8 / 6)
plt.grid()
plt.show()
# Fare distribution of each passenger class in titanic4, side by side.
plt.figure(figsize=(12, 5))
for i in range(3):
    ax = plt.subplot(1, 3, i + 1)
    # histplot(kde=True, stat="density") is the replacement for the
    # deprecated sns.distplot; kde_kws cut=3 mirrors distplot's KDE default.
    sns.histplot(
        titanic4[titanic4.pclass == (i + 1)]['fare'],
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=ax,
    )
    # distplot's axlabel= argument becomes an explicit x-axis label.
    ax.set_xlabel(class_names[i])
plt.tight_layout()
plt.show()