import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load seaborn's "titanic" example dataset into a DataFrame.
titanic = sns.load_dataset(name="titanic")
# Preview the first five rows (head()'s default row count).
titanic.head(n=5)
"""
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
결측치 탐색
# Print the per-column non-null counts and dtypes to spot columns with gaps.
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
# Number of missing values in each column.
titanic.isna().sum()
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
deck 688
embark_town 2
alive 0
alone 0
dtype: int64
# missingno draws nullity (missing-value) charts for a DataFrame.
import missingno as msno

# Nullity matrix over all columns, coloured with an RGB tuple.
matrix_rgb = (0.2, 0.6, 0.1)
msno.matrix(titanic, color=matrix_rgb)
plt.show()

# Bar chart of the same nullity information, one bar per column.
bar_rgb = (0.1, 0.6, 0.6)
msno.bar(titanic, color=bar_rgb)
plt.show()
결측치 처리
# Keep only the columns that have at least half as many non-null values
# as there are rows (thresh = minimum non-null count a column must have).
min_non_null = len(titanic) // 2
titanic = titanic.dropna(axis="columns", thresh=min_non_null)
# Re-check the per-column missing counts after dropping.
titanic.isna().sum()
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
embark_town 2
alive 0
alone 0
dtype: int64
# Discard the rows whose `embarked` value is missing.
has_embarked = titanic["embarked"].notna()
titanic = titanic[has_embarked]
titanic
"""
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True Queenstown no True
# Rows with a missing `embarked` value were already removed by the identical
# filter earlier in this script, so repeating it would be a no-op.
# Just confirm the per-column missing counts.
titanic.isnull().sum()
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 0
class 0
who 0
adult_male 0
embark_town 0
alive 0
alone 0
dtype: int64
# Plot the distribution of the `age` column (histogram with a KDE overlay).
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat="density" is the documented replacement
# that draws the same kind of figure.
plt.figure(figsize=(8, 8))
sns.histplot(titanic["age"], kde=True, stat="density")
plt.grid()
plt.show()
# Replace missing ages with the median age.
# The previous form called fillna(inplace=True) on `titanic.age`, i.e. on a
# column object pulled out of a frame that is itself the result of a filter.
# That chained in-place update raises pandas chained-assignment warnings and,
# under Copy-on-Write (the default from pandas 3.0), leaves `titanic`
# unchanged. Build the filled column and rebind the frame instead.
median_age = titanic["age"].median()
titanic = titanic.assign(age=titanic["age"].fillna(median_age))
# Confirm that `age` no longer has missing values.
titanic.isnull().sum()
survived 0
pclass 0
sex 0
age 0
sibsp 0
parch 0
fare 0
embarked 0
class 0
who 0
adult_male 0
embark_town 0
alive 0
alone 0
dtype: int64
'데이터 사이언스 > 데이터 전처리' 카테고리의 다른 글
5. 피쳐링(featureing), 더미 배리어블(dummy variable) (0) | 2023.05.22 |
---|---|
4. 스케일링(scaling) (0) | 2023.05.22 |
3. 데이터 변경(data_transform(skew_kurtosis)) (0) | 2023.05.22 |
2. 이상치(outlier) (0) | 2023.05.22 |