# Merware 2023. 5. 22. 16:07
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load seaborn's "titanic" example dataset into a DataFrame and preview its first rows.
titanic = sns.load_dataset("titanic")
titanic.head()

"""
survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	0	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	0	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
"""

 

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
titanic.describe()

"""
survived	pclass	age	sibsp	parch	fare
count	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200
"""

 

# Fare distribution of each passenger class, one panel per class.
# sns.distplot is deprecated; sns.histplot(kde=True, stat='density') is the
# axes-level replacement and draws a histogram with a KDE overlay on a density scale.
class_names = ['first', 'second', 'third']
_, axes = plt.subplots(1, 3, figsize=(12, 5))
for pclass, (ax, label) in enumerate(zip(axes, class_names), start=1):
    sns.histplot(titanic.loc[titanic.pclass == pclass, 'fare'],
                 kde=True, stat='density', ax=ax)
    ax.set_xlabel(label)  # replaces distplot's axlabel argument
plt.tight_layout()
plt.show()

 

# Horizontal box plot of fare for each passenger class.
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(data=titanic, x='fare', y='class', orient='h', ax=ax)
ax.grid()
plt.show()

 

# Swarm plot of every passenger's fare, grouped by class.
# sns.catplot is figure-level: it builds its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure behind and its size
# is ignored. The 8x6 size is requested through height/aspect instead.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic, height=6, aspect=8 / 6)
plt.grid()
plt.show()

 

# Ten highest fares as a one-column frame, keeping the original row labels.
titanic['fare'].sort_values(ascending=False).head(10).to_frame()

"""
	fare
258	512.3292
737	512.3292
679	512.3292
88	263.0000
27	263.0000
341	263.0000
438	263.0000
311	262.3750
742	262.3750
118	247.5208
"""

 

# Ten lowest fares as a one-column frame, keeping the original row labels.
titanic['fare'].sort_values(ascending=True).head(10).to_frame()

"""
	fare
271	0.0
597	0.0
302	0.0
633	0.0
277	0.0
413	0.0
674	0.0
263	0.0
466	0.0
732	0.0
"""

 

# Work on a copy so the original frame keeps the raw fares.
titanic2 = titanic.copy()

# Replace every fare above 512 with 263.
titanic2.loc[titanic2.fare > 512, 'fare'] = 263

# sns.catplot is figure-level and creates its own figure; a preceding
# plt.figure(figsize=...) would only add an empty figure, so the 8x6 size is
# passed through height/aspect.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic2, height=6, aspect=8 / 6)
plt.grid()
plt.show()

 

# Fresh copy of the raw data, plotted before any fares are adjusted.
titanic3 = titanic.copy()

# sns.catplot is figure-level and creates its own figure; a preceding
# plt.figure(figsize=...) would only add an empty figure, so the 8x6 size is
# passed through height/aspect.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic3, height=6, aspect=8 / 6)
plt.grid()
plt.show()

 

def get_bound(series, whisker=1.5):
    """Return the (lower, upper) outlier bounds of *series* from its IQR.

    The bounds are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        series: Array-like of numeric values. NaN entries are ignored.
        whisker: Multiplier applied to the IQR; the default 1.5 keeps the
            original behaviour.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    # nanpercentile: a single NaN would otherwise turn both bounds into NaN,
    # which makes every later "< lower" / "> upper" comparison False.
    quartile_1, quartile_3 = np.nanpercentile(series, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * whisker)
    upper_bound = quartile_3 + (iqr * whisker)
    return lower_bound, upper_bound
# Fares of each passenger class, taken before any clipping.
class_1 = titanic3[titanic3.pclass == 1]['fare']
class_2 = titanic3[titanic3.pclass == 2]['fare']
class_3 = titanic3[titanic3.pclass == 3]['fare']

# IQR-based outlier bounds per class.
class_1_lower, class_1_upper = get_bound(class_1)
class_2_lower, class_2_upper = get_bound(class_2)
class_3_lower, class_3_upper = get_bound(class_3)

# Clamp each class's fares into its own [lower, upper] range. One loop replaces
# six copy-pasted .loc assignments; clip() applies both limits at once.
class_bounds = {
    1: (class_1_lower, class_1_upper),
    2: (class_2_lower, class_2_upper),
    3: (class_3_lower, class_3_upper),
}
for pclass, (lower, upper) in class_bounds.items():
    in_class = titanic3.pclass == pclass
    titanic3.loc[in_class, 'fare'] = titanic3.loc[in_class, 'fare'].clip(lower, upper)

# sns.catplot is figure-level and creates its own figure; a preceding
# plt.figure(figsize=...) would only add an empty figure, so the 8x6 size is
# passed through height/aspect.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic3, height=6, aspect=8 / 6)
plt.grid()
plt.show()

 

# Fresh copy of the raw data for the mean / standard-deviation approach.
titanic4 = titanic.copy()

# Mean fare of each passenger class.
class_1_mean = titanic4.loc[titanic4['pclass'] == 1, 'fare'].mean()
class_2_mean = titanic4.loc[titanic4['pclass'] == 2, 'fare'].mean()
class_3_mean = titanic4.loc[titanic4['pclass'] == 3, 'fare'].mean()

# First-class rows whose fare lies more than three standard deviations from the
# first-class mean. The standard deviation is taken over all fares, not only
# over this class.
titanic4.loc[
    (titanic4['pclass'] == 1)
    & ((titanic4['fare'] - class_1_mean).abs() > 3 * titanic4['fare'].std())
]

"""
survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
27	0	1	male	19.0	3	2	263.0000	S	First	man	True	C	Southampton	no	False
88	1	1	female	23.0	3	2	263.0000	S	First	woman	False	C	Southampton	yes	False
118	0	1	male	24.0	0	1	247.5208	C	First	man	True	B	Cherbourg	no	False
258	1	1	female	35.0	0	0	512.3292	C	First	woman	False	NaN	Cherbourg	yes	True
299	1	1	female	50.0	0	1	247.5208	C	First	woman	False	B	Cherbourg	yes	False
311	1	1	female	18.0	2	2	262.3750	C	First	woman	False	B	Cherbourg	yes	False
341	1	1	female	24.0	3	2	263.0000	S	First	woman	False	C	Southampton	yes	False
438	0	1	male	64.0	1	4	263.0000	S	First	man	True	C	Southampton	no	False
679	1	1	male	36.0	0	1	512.3292	C	First	man	True	B	Cherbourg	yes	False
737	1	1	male	35.0	0	0	512.3292	C	First	man	True	B	Cherbourg	yes	True
742	1	1	female	21.0	2	2	262.3750	C	First	woman	False	B	Cherbourg	yes	False
"""

 

# Second-class rows whose fare lies more than three standard deviations from the
# second-class mean. The standard deviation is taken over all fares, not only
# over this class.
titanic4.loc[
    (titanic4['pclass'] == 2)
    & ((titanic4['fare'] - class_2_mean).abs() > 3 * titanic4['fare'].std())
]

"""
survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
"""

 

# Third-class rows whose fare lies more than three standard deviations from the
# third-class mean. The standard deviation is taken over all fares, not only
# over this class.
titanic4.loc[
    (titanic4['pclass'] == 3)
    & ((titanic4['fare'] - class_3_mean).abs() > 3 * titanic4['fare'].std())
]

"""
survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
"""

 

# Drop rows whose fare is more than three standard deviations (of all fares)
# away from the mean fare of their own class.
# The threshold is computed once, before any row is removed: recomputing
# fare.std() after each filter would use the already-reduced frame and shift
# the cut-off for the classes handled later.
fare_limit = 3 * titanic4.fare.std()
own_class_mean = titanic4.pclass.map({1: class_1_mean, 2: class_2_mean, 3: class_3_mean})
is_outlier = (titanic4.fare - own_class_mean).abs() > fare_limit
titanic4 = titanic4[~is_outlier]

# sns.catplot is figure-level and creates its own figure; a preceding
# plt.figure(figsize=...) would only add an empty figure, so the 8x6 size is
# passed through height/aspect.
sns.catplot(x='class', y='fare', kind='swarm', data=titanic4, height=6, aspect=8 / 6)
plt.grid()
plt.show()

 

# Fare distribution of each passenger class after outlier removal, one panel per class.
# sns.distplot is deprecated; sns.histplot(kde=True, stat='density') is the
# axes-level replacement and draws a histogram with a KDE overlay on a density scale.
_, axes = plt.subplots(1, 3, figsize=(12, 5))
for pclass, (ax, label) in enumerate(zip(axes, class_names), start=1):
    sns.histplot(titanic4.loc[titanic4.pclass == pclass, 'fare'],
                 kde=True, stat='density', ax=ax)
    ax.set_xlabel(label)  # replaces distplot's axlabel argument
plt.tight_layout()
plt.show()