import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load seaborn's bundled "tips" example dataset and preview the first ten rows.
tips = sns.load_dataset("tips")
tips.head(10)
"""
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
5 25.29 4.71 Male No Sun Dinner 4
6 8.77 2.00 Male No Sun Dinner 2
7 26.88 3.12 Male No Sun Dinner 4
8 15.04 1.96 Male No Sun Dinner 2
9 14.78 3.23 Male No Sun Dinner 2
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 total_bill 244 non-null float64
1 tip 244 non-null float64
2 sex 244 non-null category
3 smoker 244 non-null category
4 day 244 non-null category
5 time 244 non-null category
6 size 244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
# Bill per diner, rounded to two decimals. Bracket access is required for
# 'size': attribute access (tips.size) resolves to DataFrame.size instead.
tips["per_one_bill"] = tips["total_bill"].div(tips["size"]).round(2)
tips.head(10)
"""
total_bill tip sex smoker day time size per_one_bill
0 16.99 1.01 Female No Sun Dinner 2 8.49
1 10.34 1.66 Male No Sun Dinner 3 3.45
2 21.01 3.50 Male No Sun Dinner 3 7.00
3 23.68 3.31 Male No Sun Dinner 2 11.84
4 24.59 3.61 Female No Sun Dinner 4 6.15
5 25.29 4.71 Male No Sun Dinner 4 6.32
6 8.77 2.00 Male No Sun Dinner 2 4.38
7 26.88 3.12 Male No Sun Dinner 4 6.72
8 15.04 1.96 Male No Sun Dinner 2 7.52
9 14.78 3.23 Male No Sun Dinner 2 7.39
# Tip per diner, rounded to two decimals (party size is the 'size' column).
tips["per_one_tip"] = tips["tip"].div(tips["size"]).round(2)
tips.head(5)
"""
total_bill tip sex smoker day time size per_one_bill per_one_tip
0 16.99 1.01 Female No Sun Dinner 2 8.49 0.50
1 10.34 1.66 Male No Sun Dinner 3 3.45 0.55
2 21.01 3.50 Male No Sun Dinner 3 7.00 1.17
3 23.68 3.31 Male No Sun Dinner 2 11.84 1.66
4 24.59 3.61 Female No Sun Dinner 4 6.15 0.90
# Count rows per meal time to see which labels need encoding.
tips['time'].value_counts()
Dinner 176
Lunch 68
Name: time, dtype: int64
# Label-encode meal time: Lunch -> 0, Dinner -> 1.
# 'time' is a category column (see the tips.info() output), and mapping a
# categorical yields another categorical; cast to int so the encoded column
# is genuinely numeric and usable in arithmetic and models.
tips['time'] = tips['time'].map({'Lunch': 0, 'Dinner': 1}).astype(int)
tips.head()
"""
total_bill tip sex smoker day time size per_one_bill per_one_tip
0 16.99 1.01 Female No Sun 1 2 8.49 0.50
1 10.34 1.66 Male No Sun 1 3 3.45 0.55
2 21.01 3.50 Male No Sun 1 3 7.00 1.17
3 23.68 3.31 Male No Sun 1 2 11.84 1.66
4 24.59 3.61 Female No Sun 1 4 6.15 0.90
# Re-count after encoding: the per-label totals should match the counts from before the mapping.
tips['time'].value_counts()
1 176
0 68
Name: time, dtype: int64
# Count rows per weekday before one-hot encoding the column.
tips.day.value_counts()
Sat 87
Sun 76
Thur 62
Fri 19
Name: day, dtype: int64
# One-hot encode the weekday into one indicator column per day value.
# dtype=int pins 0/1 integers: pandas >= 2.0 defaults to bool (True/False),
# which would not match the 0/1 values shown in the output below.
day_dummy = pd.get_dummies(tips.day, dtype=int)
# Append the indicator columns side by side; the original 'day' column is kept.
tips = pd.concat([tips, day_dummy], axis=1)
tips
"""
total_bill tip sex smoker day time size per_one_bill per_one_tip Thur Fri Sat Sun
0 16.99 1.01 Female No Sun 1 2 8.49 0.50 0 0 0 1
1 10.34 1.66 Male No Sun 1 3 3.45 0.55 0 0 0 1
2 21.01 3.50 Male No Sun 1 3 7.00 1.17 0 0 0 1
3 23.68 3.31 Male No Sun 1 2 11.84 1.66 0 0 0 1
4 24.59 3.61 Female No Sun 1 4 6.15 0.90 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat 1 3 9.68 1.97 0 0 1 0
240 27.18 2.00 Female Yes Sat 1 2 13.59 1.00 0 0 1 0
241 22.67 2.00 Male Yes Sat 1 2 11.34 1.00 0 0 1 0
242 17.82 1.75 Male No Sat 1 2 8.91 0.88 0 0 1 0
243 18.78 3.00 Female No Thur 1 2 9.39 1.50 1 0 0 0
244 rows × 13 columns
'데이터 사이언스 > 데이터 전처리' 카테고리의 다른 글
4. 스케일링(scaling) (0) | 2023.05.22 |
---|---|
3. 데이터 변경(data_transform(skew_kurtosis)) (0) | 2023.05.22 |
2. 이상치(outlier) (0) | 2023.05.22 |
1. 결측치(missing_value) (0) | 2023.05.22 |