데이터마이닝의 이론과 실제 14주차

초기 모델

#install.packages(“tidyverse”) #install.packages(“tidymodels”) #install.packages(“tidytext”) #install.packages(“skimr”) #install.packages(“vip”)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Search for functions across packages at https://www.tidymodels.org/find/
library(skimr)
library(vip)
## 
## 다음의 패키지를 부착합니다: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

01. 데이터 불러오기

bank_tb <- read_csv("UniversalBank.csv",
                    col_names = TRUE,
                    locale = locale("ko", encoding = "euc-kr"),
                    na = ".") %>%
  mutate_if(is.character, as.factor)
## Rows: 5000 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (14): ID, Age, Experience, Income, ZIP Code, Family, CCAvg, Education, M...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: `Personal Loan` <dbl>,
## #   `Securities Account` <dbl>, `CD Account` <dbl>, Online <dbl>,
## #   CreditCard <dbl>

전처리 작업

bank_tb <- bank_tb %>%
  rename(c('Personal_Loan' = 'Personal Loan',
           'CD_Account' = 'CD Account',
           'Securities_Account' = 'Securities Account'))

str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities_Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD_Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: Personal_Loan <dbl>,
## #   Securities_Account <dbl>, CD_Account <dbl>, Online <dbl>, CreditCard <dbl>

factor 처리

bank_tb <- bank_tb %>%
  mutate(Personal_Loan = factor(Personal_Loan,
                                levels = c(1,0),
                                labels = c("Yes","No"))) %>%
  mutate(Securities_Account = factor(Securities_Account,
                                     levels = c(0,1),
                                     labels = c("No","Yes"))) %>%
  mutate(CD_Account = factor(CD_Account,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Online = factor(Online,
                         levels = c(0,1),
                         labels = c("No","Yes"))) %>%
  mutate(CreditCard = factor(CreditCard,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Education = factor(Education,
                             levels = c(1:3),
                             labels = c("Undergrad","Graduate",
                                        "Professonal"))) 
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...

필요없는 변수 제거

bank_tb <- bank_tb %>%
  select(-c(ID, 'ZIP Code'))
str(bank_tb)
## tibble [5,000 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...

03. 데이터 탐색

skim(bank_tb)
Data summary
Name bank_tb
Number of rows 5000
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Education 0 1 FALSE 3 Und: 2096, Pro: 1501, Gra: 1403
Personal_Loan 0 1 FALSE 2 No: 4520, Yes: 480
Securities_Account 0 1 FALSE 2 No: 4478, Yes: 522
CD_Account 0 1 FALSE 2 No: 4698, Yes: 302
Online 0 1 FALSE 2 Yes: 2984, No: 2016
CreditCard 0 1 FALSE 2 No: 3530, Yes: 1470

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 45.34 11.46 23 35.0 45.0 55.0 67 ▅▇▇▇▆
Experience 0 1 20.10 11.47 -3 10.0 20.0 30.0 43 ▅▇▇▇▅
Income 0 1 73.77 46.03 8 39.0 64.0 98.0 224 ▇▇▃▂▁
Family 0 1 2.40 1.15 1 1.0 2.0 3.0 4 ▇▇▁▆▆
CCAvg 0 1 1.94 1.75 0 0.7 1.5 2.5 10 ▇▃▁▁▁
Mortgage 0 1 56.50 101.71 0 0.0 0.0 101.0 635 ▇▂▁▁▁
bank_tb %>%
  group_by(Personal_Loan) %>%
  skim()
Data summary
Name Piped data
Number of rows 5000
Number of columns 12
_______________________
Column type frequency:
factor 5
numeric 6
________________________
Group variables Personal_Loan

Variable type: factor

skim_variable Personal_Loan n_missing complete_rate ordered n_unique top_counts
Education Yes 0 1 FALSE 3 Pro: 205, Gra: 182, Und: 93
Education No 0 1 FALSE 3 Und: 2003, Pro: 1296, Gra: 1221
Securities_Account Yes 0 1 FALSE 2 No: 420, Yes: 60
Securities_Account No 0 1 FALSE 2 No: 4058, Yes: 462
CD_Account Yes 0 1 FALSE 2 No: 340, Yes: 140
CD_Account No 0 1 FALSE 2 No: 4358, Yes: 162
Online Yes 0 1 FALSE 2 Yes: 291, No: 189
Online No 0 1 FALSE 2 Yes: 2693, No: 1827
CreditCard Yes 0 1 FALSE 2 No: 337, Yes: 143
CreditCard No 0 1 FALSE 2 No: 3193, Yes: 1327

Variable type: numeric

skim_variable Personal_Loan n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age Yes 0 1 45.07 11.59 26 35.0 45.0 55.00 65.0 ▇▇▇▇▇
Age No 0 1 45.37 11.45 23 35.0 45.0 55.00 67.0 ▅▇▇▇▅
Experience Yes 0 1 19.84 11.58 0 9.0 20.0 30.00 41.0 ▇▇▇▇▆
Experience No 0 1 20.13 11.46 -3 10.0 20.0 30.00 43.0 ▅▇▇▇▅
Income Yes 0 1 144.75 31.58 60 122.0 142.5 172.00 203.0 ▁▅▇▆▆
Income No 0 1 66.24 40.58 8 35.0 59.0 84.00 224.0 ▇▇▂▁▁
Family Yes 0 1 2.61 1.12 1 2.0 3.0 4.00 4.0 ▆▆▁▇▇
Family No 0 1 2.37 1.15 1 1.0 2.0 3.00 4.0 ▇▇▁▅▆
CCAvg Yes 0 1 3.91 2.10 0 2.6 3.8 5.35 10.0 ▅▇▇▃▁
CCAvg No 0 1 1.73 1.57 0 0.6 1.4 2.30 8.8 ▇▅▁▁▁
Mortgage Yes 0 1 100.85 160.85 0 0.0 0.0 192.50 617.0 ▇▁▁▁▁
Mortgage No 0 1 51.79 92.04 0 0.0 0.0 98.00 635.0 ▇▂▁▁▁

base accuracy

bank_tb %>%
  count(Personal_Loan) %>%
  mutate(prop = n / sum(n))
## # A tibble: 2 x 3
##   Personal_Loan     n  prop
##   <fct>         <int> <dbl>
## 1 Yes             480 0.096
## 2 No             4520 0.904

04. 훈련용, 테스트용 데이터 분할

set.seed(123)
bank_split <- bank_tb %>%
  initial_split(starata=Personal_Loan)
bank_split # 테스트 1250
## <Analysis/Assess/Total>
## <3750/1250/5000>
train_data <- training(bank_split)
test_data <- testing(bank_split)
str(train_data)
## tibble [3,750 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:3750] 52 62 25 64 66 48 42 32 39 37 ...
##  $ Experience        : num [1:3750] 28 38 1 38 42 23 17 7 13 13 ...
##  $ Income            : num [1:3750] 23 52 98 79 95 63 91 143 59 38 ...
##  $ Family            : num [1:3750] 3 4 1 2 2 4 1 3 3 1 ...
##  $ CCAvg             : num [1:3750] 0.4 1.3 5.4 2.8 0 3.6 0.1 2.9 0.9 1.5 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 2 1 1 3 3 2 3 3 2 ...
##  $ Mortgage          : num [1:3750] 0 0 0 179 0 0 199 0 199 116 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...
str(test_data)
## tibble [1,250 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:1250] 35 37 50 35 29 44 51 39 37 40 ...
##  $ Experience        : num [1:1250] 9 13 24 10 5 18 25 14 12 16 ...
##  $ Income            : num [1:1250] 100 29 22 81 62 43 71 43 194 49 ...
##  $ Family            : num [1:1250] 1 4 1 3 1 2 1 3 4 1 ...
##  $ CCAvg             : num [1:1250] 2.7 0.4 0.3 0.6 1.2 0.7 1.4 0.7 0.2 1.8 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 2 2 3 2 1 1 3 2 3 1 ...
##  $ Mortgage          : num [1:1250] 0 155 0 104 260 163 198 153 211 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 1 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 2 ...

05. 모델만들기

args(decision_tree)
## function (mode = "unknown", engine = "rpart", cost_complexity = NULL, 
##     tree_depth = NULL, min_n = NULL) 
## NULL

모델설정

tree_model <- 
  decision_tree()%>%
  set_engine("rpart") %>%
  set_mode("classification")

recipe

tree_recipe <-
  train_data %>% 
  recipe(Personal_Loan ~ .) %>% # 모든변수 . 
  step_dummy(all_nominal(), -all_outcomes()) # personal_Loan 변수만 결과 변수 ,나머지 설명변수  
summary(tree_recipe)
## # A tibble: 12 x 4
##    variable           type    role      source  
##    <chr>              <chr>   <chr>     <chr>   
##  1 Age                numeric predictor original
##  2 Experience         numeric predictor original
##  3 Income             numeric predictor original
##  4 Family             numeric predictor original
##  5 CCAvg              numeric predictor original
##  6 Education          nominal predictor original
##  7 Mortgage           numeric predictor original
##  8 Securities_Account nominal predictor original
##  9 CD_Account         nominal predictor original
## 10 Online             nominal predictor original
## 11 CreditCard         nominal predictor original
## 12 Personal_Loan      nominal outcome   original

06 workflow 만들기

tree_workflow <- 
  workflow() %>%
  add_model(tree_model) %>%
  add_recipe(tree_recipe)
tree_workflow 
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## Decision Tree Model Specification (classification)
## 
## Computational engine: rpart

07 모델 훈련하기

tree_train_fit <-
  tree_workflow %>%
  fit(data = train_data)

tree_train_fit  
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## n= 3750 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3750 354 No (0.094400000 0.905600000)  
##    2) Income>=100.5 917 327 No (0.356597601 0.643402399)  
##      4) Family>=2.5 242  54 Yes (0.776859504 0.223140496)  
##        8) Income>=116.5 162   0 Yes (1.000000000 0.000000000) *
##        9) Income< 116.5 80  26 No (0.325000000 0.675000000)  
##         18) CCAvg>=2.7 27   8 Yes (0.703703704 0.296296296)  
##           36) Age< 57.5 17   1 Yes (0.941176471 0.058823529) *
##           37) Age>=57.5 10   3 No (0.300000000 0.700000000) *
##         19) CCAvg< 2.7 53   7 No (0.132075472 0.867924528) *
##      5) Family< 2.5 675 139 No (0.205925926 0.794074074)  
##       10) Education_Graduate>=0.5 78  12 Yes (0.846153846 0.153846154)  
##         20) Income>=116.5 58   0 Yes (1.000000000 0.000000000) *
##         21) Income< 116.5 20   8 No (0.400000000 0.600000000)  
##           42) Age>=43.5 8   2 Yes (0.750000000 0.250000000) *
##           43) Age< 43.5 12   2 No (0.166666667 0.833333333) *
##       11) Education_Graduate< 0.5 597  73 No (0.122278057 0.877721943)  
##         22) Education_Professonal>=0.5 89  19 Yes (0.786516854 0.213483146)  
##           44) Income>=114.5 62   0 Yes (1.000000000 0.000000000) *
##           45) Income< 114.5 27   8 No (0.296296296 0.703703704) *
##         23) Education_Professonal< 0.5 508   3 No (0.005905512 0.994094488) *
##    3) Income< 100.5 2833  27 No (0.009530533 0.990469467)  
##      6) CCAvg>=2.95 168  27 No (0.160714286 0.839285714)  
##       12) CD_Account_Yes>=0.5 9   0 Yes (1.000000000 0.000000000) *
##       13) CD_Account_Yes< 0.5 159  18 No (0.113207547 0.886792453) *
##      7) CCAvg< 2.95 2665   0 No (0.000000000 1.000000000) *

08 . 훈련모델 검증 해보기

예측 결과 표

predict(예측치, 실제값)

tree_train_pred <- 
  predict(tree_train_fit,
          train_data,
          type="prob") %>%
  bind_cols(predict(tree_train_fit,
                    train_data)) %>%
  bind_cols(train_data %>%
              select(Personal_Loan))    #Personal_Loan은 실제값  
tree_train_pred
## # A tibble: 3,750 x 4
##    .pred_Yes .pred_No .pred_class Personal_Loan
##        <dbl>    <dbl> <fct>       <fct>        
##  1     0        1     No          No           
##  2     0        1     No          No           
##  3     0.113    0.887 No          No           
##  4     0        1     No          No           
##  5     0        1     No          No           
##  6     0.113    0.887 No          No           
##  7     0        1     No          No           
##  8     1        0     Yes         Yes          
##  9     0        1     No          No           
## 10     0        1     No          No           
## # ... with 3,740 more rows

정오분류표

tree_train_conf <- 
  tree_train_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)

autoplot(tree_train_conf, type = "heatmap")

autoplot(tree_train_conf, type = "mosaic")

summary(tree_train_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934

AUC

tree_train_pred %>%
  roc_auc(truth = Personal_Loan,
          .pred_Yes)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.996

ROC

train_auc <- 
  tree_train_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "train_auc")

autoplot(train_auc)

lift

tree_train_pred %>% 
  lift_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

gain

tree_train_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

중요변수 확인

tree_train_fit %>%
  pull_workflow_fit() %>% 
  vip()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

09. 테스트 데이터 검증

tree_test_fit <-
  tree_workflow %>%
  last_fit(bank_split)
tree_test_fit$.predictions
## [[1]]
## # A tibble: 1,250 x 6
##    .pred_Yes .pred_No  .row .pred_class Personal_Loan .config             
##        <dbl>    <dbl> <int> <fct>       <fct>         <chr>               
##  1         0        1     4 No          No            Preprocessor1_Model1
##  2         0        1     6 No          No            Preprocessor1_Model1
##  3         0        1     8 No          No            Preprocessor1_Model1
##  4         0        1     9 No          No            Preprocessor1_Model1
##  5         0        1    23 No          No            Preprocessor1_Model1
##  6         0        1    24 No          No            Preprocessor1_Model1
##  7         0        1    38 No          No            Preprocessor1_Model1
##  8         0        1    47 No          No            Preprocessor1_Model1
##  9         1        0    48 Yes         Yes           Preprocessor1_Model1
## 10         0        1    50 No          No            Preprocessor1_Model1
## # ... with 1,240 more rows
tree_test_pred <-
tree_test_fit %>% 
  collect_predictions
tree_test_pred
## # A tibble: 1,250 x 7
##    id               .pred_Yes .pred_No  .row .pred_class Personal_Loan .config  
##    <chr>                <dbl>    <dbl> <int> <fct>       <fct>         <chr>    
##  1 train/test split         0        1     4 No          No            Preproce~
##  2 train/test split         0        1     6 No          No            Preproce~
##  3 train/test split         0        1     8 No          No            Preproce~
##  4 train/test split         0        1     9 No          No            Preproce~
##  5 train/test split         0        1    23 No          No            Preproce~
##  6 train/test split         0        1    24 No          No            Preproce~
##  7 train/test split         0        1    38 No          No            Preproce~
##  8 train/test split         0        1    47 No          No            Preproce~
##  9 train/test split         1        0    48 Yes         Yes           Preproce~
## 10 train/test split         0        1    50 No          No            Preproce~
## # ... with 1,240 more rows

정오분류표

tree_test_conf <-
  tree_test_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)
tree_test_conf
##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117
autoplot(tree_test_conf, type="heatmap")

autoplot(tree_test_conf, type="mosaic")

summary(tree_test_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910

AUC

tree_test_pred %>%
  roc_auc(truth = Personal_Loan,
          estimate = .pred_Yes)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.993
test_auc <-
  tree_test_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "test_auc")

autoplot(test_auc)

tree_test_pred %>%
  lift_curve(truth = Personal_Loan,
             testimate = .pred_Yes) %>%
  autoplot()

tree_test_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot()

tree_test_fit %>%
  pluck(".workflow",1) %>%
  pull_workflow_fit() %>%
  vip(num_features =20)
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

10. 비교

tree_train_conf
##           Truth
## Prediction  Yes   No
##        Yes  313    3
##        No    41 3393
tree_test_conf
##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117
summary(tree_train_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934
summary(tree_test_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910
bind_rows(train_auc, test_auc) %>% 
  ggplot(mapping = aes(x=1-specificity,
                       y=sensitivity,
                       color = model)) +
  geom_path(lwd = 1.5) +
  geom_abline(lty = 3) +
  coord_equal()

11. decision tree

#install.packages("rpart.plot")
library(rpart.plot)  
rpart_fit <-
  tree_train_fit %>%
  pull_workflow_fit()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

수정삽입 방법 :fn+insert

rpart.plot(x =rpart_fit$fit,
          yesno = 1,
          type = 2,
          extra = 1,
          split.font =1,
          varlen = -10) 
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

prp(x = rpart_fit$fit,
    type = 1,
    extra = 1,
    under = TRUE,
    split.font = 1,
    varlen = -10,
    box.col = ifelse(rpart_fit$fit$frame$var == "<leaf>", "gray", "white"))
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call prp with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

Leave a comment