초기 모델
#install.packages(“tidyverse”) #install.packages(“tidymodels”) #install.packages(“tidytext”) #install.packages(“skimr”) #install.packages(“vip”)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Search for functions across packages at https://www.tidymodels.org/find/
library(skimr)
library(vip)
## 
## 다음의 패키지를 부착합니다: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
 
01. 데이터 불러오기
bank_tb <- read_csv("UniversalBank.csv",
                    col_names = TRUE,
                    locale = locale("ko", encoding = "euc-kr"),
                    na = ".") %>%
  mutate_if(is.character, as.factor)
## Rows: 5000 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (14): ID, Age, Experience, Income, ZIP Code, Family, CCAvg, Education, M...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: `Personal Loan` <dbl>,
## #   `Securities Account` <dbl>, `CD Account` <dbl>, Online <dbl>,
## #   CreditCard <dbl>
 
전처리 작업
bank_tb <- bank_tb %>%
  rename(c('Personal_Loan' = 'Personal Loan',
           'CD_Account' = 'CD Account',
           'Securities_Account' = 'Securities Account'))
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities_Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD_Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: Personal_Loan <dbl>,
## #   Securities_Account <dbl>, CD_Account <dbl>, Online <dbl>, CreditCard <dbl>
 
factor 처리
bank_tb <- bank_tb %>%
  mutate(Personal_Loan = factor(Personal_Loan,
                                levels = c(1,0),
                                labels = c("Yes","No"))) %>%
  mutate(Securities_Account = factor(Securities_Account,
                                     levels = c(0,1),
                                     labels = c("No","Yes"))) %>%
  mutate(CD_Account = factor(CD_Account,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Online = factor(Online,
                         levels = c(0,1),
                         labels = c("No","Yes"))) %>%
  mutate(CreditCard = factor(CreditCard,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Education = factor(Education,
                             levels = c(1:3),
                             labels = c("Undergrad","Graduate",
                                        "Professonal"))) 
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
 
필요없는 변수 제거
bank_tb <- bank_tb %>%
  select(-c(ID, 'ZIP Code'))
str(bank_tb)
## tibble [5,000 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
 
03. 데이터 탐색
skim(bank_tb)
Data summary
| Name | 
bank_tb | 
| Number of rows | 
5000 | 
| Number of columns | 
12 | 
| _______________________ | 
 | 
| Column type frequency: | 
 | 
| factor | 
6 | 
| numeric | 
6 | 
| ________________________ | 
 | 
| Group variables | 
None | 
Variable type: factor
| Education | 
0 | 
1 | 
FALSE | 
3 | 
Und: 2096, Pro: 1501, Gra: 1403 | 
| Personal_Loan | 
0 | 
1 | 
FALSE | 
2 | 
No: 4520, Yes: 480 | 
| Securities_Account | 
0 | 
1 | 
FALSE | 
2 | 
No: 4478, Yes: 522 | 
| CD_Account | 
0 | 
1 | 
FALSE | 
2 | 
No: 4698, Yes: 302 | 
| Online | 
0 | 
1 | 
FALSE | 
2 | 
Yes: 2984, No: 2016 | 
| CreditCard | 
0 | 
1 | 
FALSE | 
2 | 
No: 3530, Yes: 1470 | 
Variable type: numeric
| Age | 
0 | 
1 | 
45.34 | 
11.46 | 
23 | 
35.0 | 
45.0 | 
55.0 | 
67 | 
▅▇▇▇▆ | 
| Experience | 
0 | 
1 | 
20.10 | 
11.47 | 
-3 | 
10.0 | 
20.0 | 
30.0 | 
43 | 
▅▇▇▇▅ | 
| Income | 
0 | 
1 | 
73.77 | 
46.03 | 
8 | 
39.0 | 
64.0 | 
98.0 | 
224 | 
▇▇▃▂▁ | 
| Family | 
0 | 
1 | 
2.40 | 
1.15 | 
1 | 
1.0 | 
2.0 | 
3.0 | 
4 | 
▇▇▁▆▆ | 
| CCAvg | 
0 | 
1 | 
1.94 | 
1.75 | 
0 | 
0.7 | 
1.5 | 
2.5 | 
10 | 
▇▃▁▁▁ | 
| Mortgage | 
0 | 
1 | 
56.50 | 
101.71 | 
0 | 
0.0 | 
0.0 | 
101.0 | 
635 | 
▇▂▁▁▁ | 
bank_tb %>%
  group_by(Personal_Loan) %>%
  skim()
Data summary
| Name | 
Piped data | 
| Number of rows | 
5000 | 
| Number of columns | 
12 | 
| _______________________ | 
 | 
| Column type frequency: | 
 | 
| factor | 
5 | 
| numeric | 
6 | 
| ________________________ | 
 | 
| Group variables | 
Personal_Loan | 
Variable type: factor
| Education | 
Yes | 
0 | 
1 | 
FALSE | 
3 | 
Pro: 205, Gra: 182, Und: 93 | 
| Education | 
No | 
0 | 
1 | 
FALSE | 
3 | 
Und: 2003, Pro: 1296, Gra: 1221 | 
| Securities_Account | 
Yes | 
0 | 
1 | 
FALSE | 
2 | 
No: 420, Yes: 60 | 
| Securities_Account | 
No | 
0 | 
1 | 
FALSE | 
2 | 
No: 4058, Yes: 462 | 
| CD_Account | 
Yes | 
0 | 
1 | 
FALSE | 
2 | 
No: 340, Yes: 140 | 
| CD_Account | 
No | 
0 | 
1 | 
FALSE | 
2 | 
No: 4358, Yes: 162 | 
| Online | 
Yes | 
0 | 
1 | 
FALSE | 
2 | 
Yes: 291, No: 189 | 
| Online | 
No | 
0 | 
1 | 
FALSE | 
2 | 
Yes: 2693, No: 1827 | 
| CreditCard | 
Yes | 
0 | 
1 | 
FALSE | 
2 | 
No: 337, Yes: 143 | 
| CreditCard | 
No | 
0 | 
1 | 
FALSE | 
2 | 
No: 3193, Yes: 1327 | 
Variable type: numeric
| Age | 
Yes | 
0 | 
1 | 
45.07 | 
11.59 | 
26 | 
35.0 | 
45.0 | 
55.00 | 
65.0 | 
▇▇▇▇▇ | 
| Age | 
No | 
0 | 
1 | 
45.37 | 
11.45 | 
23 | 
35.0 | 
45.0 | 
55.00 | 
67.0 | 
▅▇▇▇▅ | 
| Experience | 
Yes | 
0 | 
1 | 
19.84 | 
11.58 | 
0 | 
9.0 | 
20.0 | 
30.00 | 
41.0 | 
▇▇▇▇▆ | 
| Experience | 
No | 
0 | 
1 | 
20.13 | 
11.46 | 
-3 | 
10.0 | 
20.0 | 
30.00 | 
43.0 | 
▅▇▇▇▅ | 
| Income | 
Yes | 
0 | 
1 | 
144.75 | 
31.58 | 
60 | 
122.0 | 
142.5 | 
172.00 | 
203.0 | 
▁▅▇▆▆ | 
| Income | 
No | 
0 | 
1 | 
66.24 | 
40.58 | 
8 | 
35.0 | 
59.0 | 
84.00 | 
224.0 | 
▇▇▂▁▁ | 
| Family | 
Yes | 
0 | 
1 | 
2.61 | 
1.12 | 
1 | 
2.0 | 
3.0 | 
4.00 | 
4.0 | 
▆▆▁▇▇ | 
| Family | 
No | 
0 | 
1 | 
2.37 | 
1.15 | 
1 | 
1.0 | 
2.0 | 
3.00 | 
4.0 | 
▇▇▁▅▆ | 
| CCAvg | 
Yes | 
0 | 
1 | 
3.91 | 
2.10 | 
0 | 
2.6 | 
3.8 | 
5.35 | 
10.0 | 
▅▇▇▃▁ | 
| CCAvg | 
No | 
0 | 
1 | 
1.73 | 
1.57 | 
0 | 
0.6 | 
1.4 | 
2.30 | 
8.8 | 
▇▅▁▁▁ | 
| Mortgage | 
Yes | 
0 | 
1 | 
100.85 | 
160.85 | 
0 | 
0.0 | 
0.0 | 
192.50 | 
617.0 | 
▇▁▁▁▁ | 
| Mortgage | 
No | 
0 | 
1 | 
51.79 | 
92.04 | 
0 | 
0.0 | 
0.0 | 
98.00 | 
635.0 | 
▇▂▁▁▁ | 
 
base accuracy
bank_tb %>%
  count(Personal_Loan) %>%
  mutate(prop = n / sum(n))
## # A tibble: 2 x 3
##   Personal_Loan     n  prop
##   <fct>         <int> <dbl>
## 1 Yes             480 0.096
## 2 No             4520 0.904
 
04. 훈련용, 테스트용 데이터 분할
set.seed(123)
bank_split <- bank_tb %>%
  initial_split(starata=Personal_Loan)
bank_split # 테스트 1250
## <Analysis/Assess/Total>
## <3750/1250/5000>
train_data <- training(bank_split)
test_data <- testing(bank_split)
str(train_data)
## tibble [3,750 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:3750] 52 62 25 64 66 48 42 32 39 37 ...
##  $ Experience        : num [1:3750] 28 38 1 38 42 23 17 7 13 13 ...
##  $ Income            : num [1:3750] 23 52 98 79 95 63 91 143 59 38 ...
##  $ Family            : num [1:3750] 3 4 1 2 2 4 1 3 3 1 ...
##  $ CCAvg             : num [1:3750] 0.4 1.3 5.4 2.8 0 3.6 0.1 2.9 0.9 1.5 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 2 1 1 3 3 2 3 3 2 ...
##  $ Mortgage          : num [1:3750] 0 0 0 179 0 0 199 0 199 116 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...
str(test_data)
## tibble [1,250 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:1250] 35 37 50 35 29 44 51 39 37 40 ...
##  $ Experience        : num [1:1250] 9 13 24 10 5 18 25 14 12 16 ...
##  $ Income            : num [1:1250] 100 29 22 81 62 43 71 43 194 49 ...
##  $ Family            : num [1:1250] 1 4 1 3 1 2 1 3 4 1 ...
##  $ CCAvg             : num [1:1250] 2.7 0.4 0.3 0.6 1.2 0.7 1.4 0.7 0.2 1.8 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 2 2 3 2 1 1 3 2 3 1 ...
##  $ Mortgage          : num [1:1250] 0 155 0 104 260 163 198 153 211 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 1 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 2 ...
 
05. 모델만들기
args(decision_tree)
## function (mode = "unknown", engine = "rpart", cost_complexity = NULL, 
##     tree_depth = NULL, min_n = NULL) 
## NULL
 
모델설정
tree_model <- 
  decision_tree()%>%
  set_engine("rpart") %>%
  set_mode("classification")
 
recipe
tree_recipe <-
  train_data %>% 
  recipe(Personal_Loan ~ .) %>% # 모든변수 . 
  step_dummy(all_nominal(), -all_outcomes()) # personal_Loan 변수만 결과 변수 ,나머지 설명변수  
summary(tree_recipe)
## # A tibble: 12 x 4
##    variable           type    role      source  
##    <chr>              <chr>   <chr>     <chr>   
##  1 Age                numeric predictor original
##  2 Experience         numeric predictor original
##  3 Income             numeric predictor original
##  4 Family             numeric predictor original
##  5 CCAvg              numeric predictor original
##  6 Education          nominal predictor original
##  7 Mortgage           numeric predictor original
##  8 Securities_Account nominal predictor original
##  9 CD_Account         nominal predictor original
## 10 Online             nominal predictor original
## 11 CreditCard         nominal predictor original
## 12 Personal_Loan      nominal outcome   original
 
06 workflow 만들기
tree_workflow <- 
  workflow() %>%
  add_model(tree_model) %>%
  add_recipe(tree_recipe)
tree_workflow 
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## Decision Tree Model Specification (classification)
## 
## Computational engine: rpart
 
07 모델 훈련하기
tree_train_fit <-
  tree_workflow %>%
  fit(data = train_data)
tree_train_fit  
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## n= 3750 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3750 354 No (0.094400000 0.905600000)  
##    2) Income>=100.5 917 327 No (0.356597601 0.643402399)  
##      4) Family>=2.5 242  54 Yes (0.776859504 0.223140496)  
##        8) Income>=116.5 162   0 Yes (1.000000000 0.000000000) *
##        9) Income< 116.5 80  26 No (0.325000000 0.675000000)  
##         18) CCAvg>=2.7 27   8 Yes (0.703703704 0.296296296)  
##           36) Age< 57.5 17   1 Yes (0.941176471 0.058823529) *
##           37) Age>=57.5 10   3 No (0.300000000 0.700000000) *
##         19) CCAvg< 2.7 53   7 No (0.132075472 0.867924528) *
##      5) Family< 2.5 675 139 No (0.205925926 0.794074074)  
##       10) Education_Graduate>=0.5 78  12 Yes (0.846153846 0.153846154)  
##         20) Income>=116.5 58   0 Yes (1.000000000 0.000000000) *
##         21) Income< 116.5 20   8 No (0.400000000 0.600000000)  
##           42) Age>=43.5 8   2 Yes (0.750000000 0.250000000) *
##           43) Age< 43.5 12   2 No (0.166666667 0.833333333) *
##       11) Education_Graduate< 0.5 597  73 No (0.122278057 0.877721943)  
##         22) Education_Professonal>=0.5 89  19 Yes (0.786516854 0.213483146)  
##           44) Income>=114.5 62   0 Yes (1.000000000 0.000000000) *
##           45) Income< 114.5 27   8 No (0.296296296 0.703703704) *
##         23) Education_Professonal< 0.5 508   3 No (0.005905512 0.994094488) *
##    3) Income< 100.5 2833  27 No (0.009530533 0.990469467)  
##      6) CCAvg>=2.95 168  27 No (0.160714286 0.839285714)  
##       12) CD_Account_Yes>=0.5 9   0 Yes (1.000000000 0.000000000) *
##       13) CD_Account_Yes< 0.5 159  18 No (0.113207547 0.886792453) *
##      7) CCAvg< 2.95 2665   0 No (0.000000000 1.000000000) *
 
08 . 훈련모델 검증 해보기
예측 결과 표
predict(예측치, 실제값)
tree_train_pred <- 
  predict(tree_train_fit,
          train_data,
          type="prob") %>%
  bind_cols(predict(tree_train_fit,
                    train_data)) %>%
  bind_cols(train_data %>%
              select(Personal_Loan))    #Personal_Loan은 실제값  
tree_train_pred
## # A tibble: 3,750 x 4
##    .pred_Yes .pred_No .pred_class Personal_Loan
##        <dbl>    <dbl> <fct>       <fct>        
##  1     0        1     No          No           
##  2     0        1     No          No           
##  3     0.113    0.887 No          No           
##  4     0        1     No          No           
##  5     0        1     No          No           
##  6     0.113    0.887 No          No           
##  7     0        1     No          No           
##  8     1        0     Yes         Yes          
##  9     0        1     No          No           
## 10     0        1     No          No           
## # ... with 3,740 more rows
 
정오분류표
tree_train_conf <- 
  tree_train_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)
autoplot(tree_train_conf, type = "heatmap")

autoplot(tree_train_conf, type = "mosaic")

summary(tree_train_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934
 
AUC
tree_train_pred %>%
  roc_auc(truth = Personal_Loan,
          .pred_Yes)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.996
 
ROC
train_auc <- 
  tree_train_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "train_auc")
autoplot(train_auc)

 
lift
tree_train_pred %>% 
  lift_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

 
gain
tree_train_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

 
중요변수 확인
tree_train_fit %>%
  pull_workflow_fit() %>% 
  vip()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

 
09. 테스트 데이터 검증
tree_test_fit <-
  tree_workflow %>%
  last_fit(bank_split)
tree_test_fit$.predictions
## [[1]]
## # A tibble: 1,250 x 6
##    .pred_Yes .pred_No  .row .pred_class Personal_Loan .config             
##        <dbl>    <dbl> <int> <fct>       <fct>         <chr>               
##  1         0        1     4 No          No            Preprocessor1_Model1
##  2         0        1     6 No          No            Preprocessor1_Model1
##  3         0        1     8 No          No            Preprocessor1_Model1
##  4         0        1     9 No          No            Preprocessor1_Model1
##  5         0        1    23 No          No            Preprocessor1_Model1
##  6         0        1    24 No          No            Preprocessor1_Model1
##  7         0        1    38 No          No            Preprocessor1_Model1
##  8         0        1    47 No          No            Preprocessor1_Model1
##  9         1        0    48 Yes         Yes           Preprocessor1_Model1
## 10         0        1    50 No          No            Preprocessor1_Model1
## # ... with 1,240 more rows
tree_test_pred <-
tree_test_fit %>% 
  collect_predictions
tree_test_pred
## # A tibble: 1,250 x 7
##    id               .pred_Yes .pred_No  .row .pred_class Personal_Loan .config  
##    <chr>                <dbl>    <dbl> <int> <fct>       <fct>         <chr>    
##  1 train/test split         0        1     4 No          No            Preproce~
##  2 train/test split         0        1     6 No          No            Preproce~
##  3 train/test split         0        1     8 No          No            Preproce~
##  4 train/test split         0        1     9 No          No            Preproce~
##  5 train/test split         0        1    23 No          No            Preproce~
##  6 train/test split         0        1    24 No          No            Preproce~
##  7 train/test split         0        1    38 No          No            Preproce~
##  8 train/test split         0        1    47 No          No            Preproce~
##  9 train/test split         1        0    48 Yes         Yes           Preproce~
## 10 train/test split         0        1    50 No          No            Preproce~
## # ... with 1,240 more rows
 
정오분류표
tree_test_conf <-
  tree_test_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)
tree_test_conf
##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117
autoplot(tree_test_conf, type="heatmap")

autoplot(tree_test_conf, type="mosaic")

summary(tree_test_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910
 
AUC
tree_test_pred %>%
  roc_auc(truth = Personal_Loan,
          estimate = .pred_Yes)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.993
test_auc <-
  tree_test_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "test_auc")
autoplot(test_auc)

tree_test_pred %>%
  lift_curve(truth = Personal_Loan,
             testimate = .pred_Yes) %>%
  autoplot()

tree_test_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot()

tree_test_fit %>%
  pluck(".workflow",1) %>%
  pull_workflow_fit() %>%
  vip(num_features =20)
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

 
10. 비교
tree_train_conf
##           Truth
## Prediction  Yes   No
##        Yes  313    3
##        No    41 3393
tree_test_conf
##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117
summary(tree_train_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934
summary(tree_test_conf)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910
bind_rows(train_auc, test_auc) %>% 
  ggplot(mapping = aes(x=1-specificity,
                       y=sensitivity,
                       color = model)) +
  geom_path(lwd = 1.5) +
  geom_abline(lty = 3) +
  coord_equal()

 
11. decision tree
#install.packages("rpart.plot")
library(rpart.plot)  
rpart_fit <-
  tree_train_fit %>%
  pull_workflow_fit()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
 
수정삽입 방법 :fn+insert
rpart.plot(x =rpart_fit$fit,
          yesno = 1,
          type = 2,
          extra = 1,
          split.font =1,
          varlen = -10) 
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

prp(x = rpart_fit$fit,
    type = 1,
    extra = 1,
    under = TRUE,
    split.font = 1,
    varlen = -10,
    box.col = ifelse(rpart_fit$fit$frame$var == "<leaf>", "gray", "white"))
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call prp with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

 
 
        
      
Leave a comment