Kaggle_Titanic | Machine Learning | 2021. 4. 9. 00:44
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
test_df = pd.read_csv("./test.csv")
train_df = pd.read_csv("./train.csv")
train_df.head()
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S

train_df.set_index('PassengerId', inplace=True)
test_df.set_index('PassengerId', inplace=True)
train_index = train_df.index
test_index = test_df.index
train_index
Int64Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... 882, 883, 884, 885, 886, 887, 888, 889, 890, 891], dtype='int64', name='PassengerId', length=891)
y_train_df = train_df.pop("Survived")
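Before doing anything else, it helps to glance at how the target is distributed. A minimal check, my addition rather than part of the original post:

y_train_df.value_counts(normalize=True)  # roughly 62% did not survive, 38% survived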
Data Processing
pd.set_option('display.float_format', lambda x: '%.2f'%x)
test_df.isnull().sum()
Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64
train_df.isnull().sum()
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64
Decision 1: drop Cabin, since most of its values are missing in both sets.
del train_df["Cabin"]
del test_df["Cabin"]
all_df = train_df.append(test_df)
all_df
             Pclass                                               Name     Sex    Age  SibSp  Parch              Ticket    Fare Embarked
PassengerId
1                 3                            Braund, Mr. Owen Harris    male  22.00      1      0           A/5 21171    7.25        S
2                 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.00      1      0            PC 17599   71.28        C
3                 3                             Heikkinen, Miss. Laina  female  26.00      0      0    STON/O2. 3101282    7.92        S
4                 1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.00      1      0              113803   53.10        S
5                 3                           Allen, Mr. William Henry    male  35.00      0      0              373450    8.05        S
...             ...                                                ...     ...    ...    ...    ...                 ...     ...      ...
1305              3                                 Spector, Mr. Woolf    male    nan      0      0           A.5. 3236    8.05        S
1306              1                       Oliva y Ocana, Dona. Fermina  female  39.00      0      0            PC 17758  108.90        C
1307              3                       Saether, Mr. Simon Sivertsen    male  38.50      0      0  SOTON/O.Q. 3101262    7.25        S
1308              3                                Ware, Mr. Frederick    male    nan      0      0              359309    8.05        S
1309              3                           Peter, Master. Michael J    male    nan      1      1                2668   22.36        C

1309 rows × 9 columns
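Note that DataFrame.append was deprecated and later removed in pandas 2.0. On a recent pandas, the same concatenation can be written with pd.concat; a minimal equivalent sketch:

all_df = pd.concat([train_df, test_df])  # stack train and test rows, keeping PassengerId as the index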
del all_df['Name']
del all_df["Ticket"]
all_df.head()
             Pclass     Sex    Age  SibSp  Parch   Fare Embarked
PassengerId
1                 3    male  22.00      1      0   7.25        S
2                 1  female  38.00      1      0  71.28        C
3                 3  female  26.00      0      0   7.92        S
4                 1  female  35.00      1      0  53.10        S
5                 3    male  35.00      0      0   8.05        S

all_df["Sex"] = all_df["Sex"].replace({"male": 0, "female": 1})
all_df["Embarked"] = all_df["Embarked"].replace({"S": 0, "C": 1, "Q": 2, np.nan: 99})
all_df.head()
             Pclass  Sex    Age  SibSp  Parch   Fare  Embarked
PassengerId
1                 3    0  22.00      1      0   7.25         0
2                 1    1  38.00      1      0  71.28         1
3                 3    1  26.00      0      0   7.92         0
4                 1    1  35.00      1      0  53.10         0
5                 3    0  35.00      0      0   8.05         0

pd.get_dummies(all_df["Embarked"], prefix="embarked")
             embarked_0  embarked_1  embarked_2  embarked_99
PassengerId
1                     1           0           0            0
2                     0           1           0            0
3                     1           0           0            0
4                     1           0           0            0
5                     1           0           0            0
...                 ...         ...         ...          ...
1305                  1           0           0            0
1306                  0           1           0            0
1307                  1           0           0            0
1308                  1           0           0            0
1309                  0           1           0            0

1309 rows × 4 columns
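As an aside, the merge below could be skipped by letting get_dummies encode the column in place. A small sketch of that alternative (it drops the original Embarked column, unlike the merge used here, so it is not a drop-in replacement):

matrix_df = pd.get_dummies(all_df, columns=["Embarked"], prefix="embarked")  # one-hot encode Embarked, keep all other columns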
matrix_df = pd.merge(
    all_df,
    pd.get_dummies(all_df["Embarked"], prefix="embarked"),
    left_index=True, right_index=True)
matrix_df.head()
             Pclass  Sex    Age  SibSp  Parch   Fare  Embarked  embarked_0  embarked_1  embarked_2  embarked_99
PassengerId
1                 3    0  22.00      1      0   7.25         0           1           0           0            0
2                 1    1  38.00      1      0  71.28         1           0           1           0            0
3                 3    1  26.00      0      0   7.92         0           1           0           0            0
4                 1    1  35.00      1      0  53.10         0           1           0           0            0
5                 3    0  35.00      0      0   8.05         0           1           0           0            0

matrix_df.corr()
             Pclass    Sex    Age  SibSp  Parch   Fare  Embarked  embarked_0  embarked_1  embarked_2  embarked_99
Pclass         1.00  -0.12  -0.41   0.06   0.02  -0.56     -0.05        0.10       -0.27        0.23        -0.06
Sex           -0.12   1.00  -0.06   0.11   0.21   0.19      0.07       -0.12        0.07        0.09         0.05
Age           -0.41  -0.06   1.00  -0.24  -0.15   0.18      0.07       -0.08        0.09       -0.02         0.06
SibSp          0.06   0.11  -0.24   1.00   0.37   0.16     -0.03        0.08       -0.05       -0.05        -0.02
Parch          0.02   0.21  -0.15   0.37   1.00   0.22     -0.03        0.07       -0.01       -0.10        -0.02
Fare          -0.56   0.19   0.18   0.16   0.22   1.00      0.05       -0.17        0.29       -0.13         0.04
Embarked      -0.05   0.07   0.07  -0.03  -0.03   0.05      1.00       -0.21        0.06        0.12         0.99
embarked_0     0.10  -0.12  -0.08   0.08   0.07  -0.17     -0.21        1.00       -0.78       -0.49        -0.06
embarked_1    -0.27   0.07   0.09  -0.05  -0.01   0.29      0.06       -0.78        1.00       -0.16        -0.02
embarked_2     0.23   0.09  -0.02  -0.05  -0.10  -0.13      0.12       -0.49       -0.16        1.00        -0.01
embarked_99   -0.06   0.05   0.06  -0.02  -0.02   0.04      0.99       -0.06       -0.02       -0.01         1.00

all_df.groupby("Pclass")["Age"].mean()
Pclass
1   39.16
2   29.51
3   24.82
Name: Age, dtype: float64
all_df.loc[(all_df["Pclass"] == 1) & (all_df["Age"].isnull()), "Age"]
PassengerId
32     nan
56     nan
65     nan
167    nan
169    nan
186    nan
257    nan
271    nan
285    nan
296    nan
299    nan
307    nan
335    nan
352    nan
376    nan
458    nan
476    nan
508    nan
528    nan
558    nan
603    nan
634    nan
670    nan
712    nan
741    nan
767    nan
794    nan
816    nan
840    nan
850    nan
914    nan
933    nan
1038   nan
1040   nan
1060   nan
1083   nan
1097   nan
1158   nan
1182   nan
Name: Age, dtype: float64
all_df.loc[(all_df["Pclass"] == 1) & (all_df["Age"].isnull()), "Age"] = 39.16
all_df.loc[(all_df["Pclass"] == 2) & (all_df["Age"].isnull()), "Age"] = 29.51
all_df.loc[(all_df["Pclass"] == 3) & (all_df["Age"].isnull()), "Age"] = 24.82
all_df.isnull().sum()
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64
all_df.groupby("Pclass")["Fare"].mean()
Pclass
1   87.51
2   21.18
3   13.30
Name: Fare, dtype: float64
all_df[all_df["Fare"].isnull()]
             Pclass  Sex    Age  SibSp  Parch  Fare  Embarked
PassengerId
1044              3    0  60.50      0      0   nan         0

all_df.loc[all_df["Fare"].isnull(), "Fare"] = 13.30
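The class means used above are typed in by hand. An alternative, not used in this post, is to compute and apply them per Pclass group in one step, so the fill values always match the current data:

# fill missing Age and Fare with the mean of each passenger's own Pclass group
for col in ["Age", "Fare"]:
    all_df[col] = all_df[col].fillna(all_df.groupby("Pclass")[col].transform("mean"))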
del all_df["Embarked"]
all_df["Pclass"] = all_df["Pclass"].replace({1:"A",2:"B",3:"C"})
all_df = pd.get_dummies(all_df)
all_df.head()
             Sex    Age  SibSp  Parch   Fare  Pclass_A  Pclass_B  Pclass_C
PassengerId
1              0  22.00      1      0   7.25         0         0         1
2              1  38.00      1      0  71.28         1         0         0
3              1  26.00      0      0   7.92         0         0         1
4              1  35.00      1      0  53.10         1         0         0
5              0  35.00      0      0   8.05         0         0         1

all_df = pd.merge(
    all_df,
    matrix_df[["embarked_0", "embarked_1", "embarked_2", "embarked_99"]],
    left_index=True, right_index=True)
train_df = all_df[all_df.index.isin(train_index)]
test_df = all_df[all_df.index.isin(test_index)]
train_df.head()
             Sex    Age  SibSp  Parch   Fare  Pclass_A  Pclass_B  Pclass_C  embarked_0  embarked_1  embarked_2  embarked_99
PassengerId
1              0  22.00      1      0   7.25         0         0         1           1           0           0            0
2              1  38.00      1      0  71.28         1         0         0           0           1           0            0
3              1  26.00      0      0   7.92         0         0         1           1           0           0            0
4              1  35.00      1      0  53.10         1         0         0           1           0           0            0
5              0  35.00      0      0   8.05         0         0         1           1           0           0            0

Build Model
x_data = train_df.values
y_data = y_train_df.values
y_data
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
from sklearn.linear_model import LogisticRegression

cls = LogisticRegression()
cls.fit(x_data, y_data)
C:\Users\sangi\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

LogisticRegression()
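The warning means the lbfgs solver stopped before converging. Two common remedies are raising max_iter or standardizing the features; here is a minimal sketch with a scikit-learn Pipeline, shown as a hypothetical variant rather than the code the rest of this post uses:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# scale the features so Age/Fare do not dominate the optimizer, and give lbfgs more iterations
scaled_cls = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scaled_cls.fit(x_data, y_data)

With a pipeline, the fitted coefficients live on the inner LogisticRegression step; the code below keeps using the plain classifier fitted above.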
cls.intercept_
array([0.1770517])
cls.coef_
array([[ 2.60380921e+00, -4.11086178e-02, -3.25554029e-01, -8.54824009e-02, 2.34901265e-03, 1.11040984e+00, 1.42993750e-01, -1.09047200e+00, -2.08728159e-01, 2.07424688e-01, 9.74431422e-02, 6.67919187e-02]])
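The raw coefficient array is hard to interpret on its own. Pairing each weight with its column name shows which features push predictions toward survival; a small sketch, my addition:

# one coefficient per feature, in the same order as the columns of the processed train_df
pd.Series(cls.coef_[0], index=train_df.columns).sort_values()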
cls.predict(test_df.values)
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=int64)
x_test = test_df.values
y_test = cls.predict(x_test)  # predicted labels for the test passengers
y_test
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=int64)
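Before submitting, it is worth estimating how well the model generalizes, since the public leaderboard only scores a limited number of uploads per day. A minimal sketch using cross_val_score (my addition):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the training data
scores = cross_val_score(LogisticRegression(max_iter=1000), x_data, y_data, cv=5)
print(scores.mean())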
result = np.concatenate(
    (test_index.values.reshape(-1, 1), cls.predict(x_test).reshape(-1, 1)), axis=1)
result[:5]
array([[892,   0],
       [893,   0],
       [894,   0],
       [895,   0],
       [896,   1]], dtype=int64)
df_submission = pd.DataFrame(result, columns = ["PassengerId", "Survived"])
df_submission.to_csv("submission_result.csv", index=False)
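A quick sanity check before uploading, my addition: the Titanic submission file should have 418 rows and exactly the PassengerId and Survived columns.

check = pd.read_csv("submission_result.csv")
print(check.shape)             # expected: (418, 2)
print(check.columns.tolist())  # expected: ['PassengerId', 'Survived']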