데이터 사이언스/데이터 전처리
4. 스케일링(scaling)
Merware
2023. 5. 22. 16:09
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load the "titanic" example dataset through seaborn; the rest of this script
# uses the result as a pandas DataFrame (column access, head(), describe()).
titanic = sns.load_dataset("titanic")
# Preview the first rows of the dataset.
titanic.head()
"""
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
"""
titanic.describe()
"""
survived pclass age sibsp parch fare
count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
"""
def draw_distplot(col_name):
    """Plot the distribution of one column of the module-level ``titanic`` frame.

    Opens a new 6x6 figure, draws a density-normalised histogram of the
    column with a KDE curve overlaid, adds a grid, and shows the figure.

    Args:
        col_name: Label of the ``titanic`` column to plot.
    """
    plt.figure(figsize=(6, 6))
    # sns.distplot is deprecated (seaborn >= 0.11) and warns on every call;
    # histplot with kde=True and stat="density" is the documented replacement
    # for distplot's default histogram + KDE output.
    sns.histplot(titanic[col_name], kde=True, stat="density", kde_kws={"cut": 3})
    plt.grid()
    plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
# np.array(titanic.fare)
# np.array(titanic.fare).reshape(-1,1)
# np.array(titanic.fare).reshape(-1,1).reshape(-1,)
# scikit-learn scalers take a 2-D (n_samples, n_features) array, so the fare
# values are reshaped into a single column before fitting.
fare_2d = np.array(titanic.fare).reshape(-1, 1)

# Standard (z-score), min-max, and robust scaling, each stored in its own
# column and plotted right after it is computed.
for scaled_col, scaler in (
    ('fare_z', StandardScaler()),
    ('fare_mm', MinMaxScaler()),
    ('fare_rc', RobustScaler()),
):
    # Flatten the (n, 1) result back to 1-D so it can be assigned as a column.
    titanic[scaled_col] = scaler.fit_transform(fare_2d).reshape(-1,)
    draw_distplot(scaled_col)