초기 모델
#install.packages(“tidyverse”) #install.packages(“tidymodels”) #install.packages(“tidytext”) #install.packages(“skimr”) #install.packages(“vip”)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Search for functions across packages at https://www.tidymodels.org/find/
library(skimr)
library(vip)
##
## 다음의 패키지를 부착합니다: 'vip'
## The following object is masked from 'package:utils':
##
## vi
01. 데이터 불러오기
bank_tb <- read_csv("UniversalBank.csv",
col_names = TRUE,
locale = locale("ko", encoding = "euc-kr"),
na = ".") %>%
mutate_if(is.character, as.factor)
## Rows: 5000 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (14): ID, Age, Experience, Income, ZIP Code, Family, CCAvg, Education, M...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP Code : num [1:5000] 91107 90089 94720 94112 91330 ...
## $ Family : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal Loan : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
## $ CD Account : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
## ID Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25 1 49 91107 4 1.6 1 0
## 2 2 45 19 34 90089 3 1.5 1 0
## 3 3 39 15 11 94720 1 1 1 0
## 4 4 35 9 100 94112 1 2.7 2 0
## 5 5 35 8 45 91330 4 1 2 0
## 6 6 37 13 29 92121 4 0.4 2 155
## 7 7 53 27 72 91711 2 1.5 2 0
## 8 8 50 24 22 93943 1 0.3 3 0
## 9 9 35 10 81 90089 3 0.6 2 104
## 10 10 34 9 180 93023 1 8.9 3 0
## # ... with 4,990 more rows, and 5 more variables: `Personal Loan` <dbl>,
## # `Securities Account` <dbl>, `CD Account` <dbl>, Online <dbl>,
## # CreditCard <dbl>
전처리 작업
bank_tb <- bank_tb %>%
rename(c('Personal_Loan' = 'Personal Loan',
'CD_Account' = 'CD Account',
'Securities_Account' = 'Securities Account'))
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP Code : num [1:5000] 91107 90089 94720 94112 91330 ...
## $ Family : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal_Loan : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities_Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
## $ CD_Account : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...
bank_tb
## # A tibble: 5,000 x 14
## ID Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25 1 49 91107 4 1.6 1 0
## 2 2 45 19 34 90089 3 1.5 1 0
## 3 3 39 15 11 94720 1 1 1 0
## 4 4 35 9 100 94112 1 2.7 2 0
## 5 5 35 8 45 91330 4 1 2 0
## 6 6 37 13 29 92121 4 0.4 2 155
## 7 7 53 27 72 91711 2 1.5 2 0
## 8 8 50 24 22 93943 1 0.3 3 0
## 9 9 35 10 81 90089 3 0.6 2 104
## 10 10 34 9 180 93023 1 8.9 3 0
## # ... with 4,990 more rows, and 5 more variables: Personal_Loan <dbl>,
## # Securities_Account <dbl>, CD_Account <dbl>, Online <dbl>, CreditCard <dbl>
factor 처리
bank_tb <- bank_tb %>%
mutate(Personal_Loan = factor(Personal_Loan,
levels = c(1,0),
labels = c("Yes","No"))) %>%
mutate(Securities_Account = factor(Securities_Account,
levels = c(0,1),
labels = c("No","Yes"))) %>%
mutate(CD_Account = factor(CD_Account,
levels = c(0,1),
labels = c("No","Yes"))) %>%
mutate(Online = factor(Online,
levels = c(0,1),
labels = c("No","Yes"))) %>%
mutate(CreditCard = factor(CreditCard,
levels = c(0,1),
labels = c("No","Yes"))) %>%
mutate(Education = factor(Education,
levels = c(1:3),
labels = c("Undergrad","Graduate",
"Professonal")))
str(bank_tb)
## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP Code : num [1:5000] 91107 90089 94720 94112 91330 ...
## $ Family : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal_Loan : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
## $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## $ CD_Account : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
필요없는 변수 제거
bank_tb <- bank_tb %>%
select(-c(ID, 'ZIP Code'))
str(bank_tb)
## tibble [5,000 x 12] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
## $ Family : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal_Loan : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
## $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
## $ CD_Account : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
03. 데이터 탐색
skim(bank_tb)
Data summary
Name |
bank_tb |
Number of rows |
5000 |
Number of columns |
12 |
_______________________ |
|
Column type frequency: |
|
factor |
6 |
numeric |
6 |
________________________ |
|
Group variables |
None |
Variable type: factor
Education |
0 |
1 |
FALSE |
3 |
Und: 2096, Pro: 1501, Gra: 1403 |
Personal_Loan |
0 |
1 |
FALSE |
2 |
No: 4520, Yes: 480 |
Securities_Account |
0 |
1 |
FALSE |
2 |
No: 4478, Yes: 522 |
CD_Account |
0 |
1 |
FALSE |
2 |
No: 4698, Yes: 302 |
Online |
0 |
1 |
FALSE |
2 |
Yes: 2984, No: 2016 |
CreditCard |
0 |
1 |
FALSE |
2 |
No: 3530, Yes: 1470 |
Variable type: numeric
Age |
0 |
1 |
45.34 |
11.46 |
23 |
35.0 |
45.0 |
55.0 |
67 |
▅▇▇▇▆ |
Experience |
0 |
1 |
20.10 |
11.47 |
-3 |
10.0 |
20.0 |
30.0 |
43 |
▅▇▇▇▅ |
Income |
0 |
1 |
73.77 |
46.03 |
8 |
39.0 |
64.0 |
98.0 |
224 |
▇▇▃▂▁ |
Family |
0 |
1 |
2.40 |
1.15 |
1 |
1.0 |
2.0 |
3.0 |
4 |
▇▇▁▆▆ |
CCAvg |
0 |
1 |
1.94 |
1.75 |
0 |
0.7 |
1.5 |
2.5 |
10 |
▇▃▁▁▁ |
Mortgage |
0 |
1 |
56.50 |
101.71 |
0 |
0.0 |
0.0 |
101.0 |
635 |
▇▂▁▁▁ |
bank_tb %>%
group_by(Personal_Loan) %>%
skim()
Data summary
Name |
Piped data |
Number of rows |
5000 |
Number of columns |
12 |
_______________________ |
|
Column type frequency: |
|
factor |
5 |
numeric |
6 |
________________________ |
|
Group variables |
Personal_Loan |
Variable type: factor
Education |
Yes |
0 |
1 |
FALSE |
3 |
Pro: 205, Gra: 182, Und: 93 |
Education |
No |
0 |
1 |
FALSE |
3 |
Und: 2003, Pro: 1296, Gra: 1221 |
Securities_Account |
Yes |
0 |
1 |
FALSE |
2 |
No: 420, Yes: 60 |
Securities_Account |
No |
0 |
1 |
FALSE |
2 |
No: 4058, Yes: 462 |
CD_Account |
Yes |
0 |
1 |
FALSE |
2 |
No: 340, Yes: 140 |
CD_Account |
No |
0 |
1 |
FALSE |
2 |
No: 4358, Yes: 162 |
Online |
Yes |
0 |
1 |
FALSE |
2 |
Yes: 291, No: 189 |
Online |
No |
0 |
1 |
FALSE |
2 |
Yes: 2693, No: 1827 |
CreditCard |
Yes |
0 |
1 |
FALSE |
2 |
No: 337, Yes: 143 |
CreditCard |
No |
0 |
1 |
FALSE |
2 |
No: 3193, Yes: 1327 |
Variable type: numeric
Age |
Yes |
0 |
1 |
45.07 |
11.59 |
26 |
35.0 |
45.0 |
55.00 |
65.0 |
▇▇▇▇▇ |
Age |
No |
0 |
1 |
45.37 |
11.45 |
23 |
35.0 |
45.0 |
55.00 |
67.0 |
▅▇▇▇▅ |
Experience |
Yes |
0 |
1 |
19.84 |
11.58 |
0 |
9.0 |
20.0 |
30.00 |
41.0 |
▇▇▇▇▆ |
Experience |
No |
0 |
1 |
20.13 |
11.46 |
-3 |
10.0 |
20.0 |
30.00 |
43.0 |
▅▇▇▇▅ |
Income |
Yes |
0 |
1 |
144.75 |
31.58 |
60 |
122.0 |
142.5 |
172.00 |
203.0 |
▁▅▇▆▆ |
Income |
No |
0 |
1 |
66.24 |
40.58 |
8 |
35.0 |
59.0 |
84.00 |
224.0 |
▇▇▂▁▁ |
Family |
Yes |
0 |
1 |
2.61 |
1.12 |
1 |
2.0 |
3.0 |
4.00 |
4.0 |
▆▆▁▇▇ |
Family |
No |
0 |
1 |
2.37 |
1.15 |
1 |
1.0 |
2.0 |
3.00 |
4.0 |
▇▇▁▅▆ |
CCAvg |
Yes |
0 |
1 |
3.91 |
2.10 |
0 |
2.6 |
3.8 |
5.35 |
10.0 |
▅▇▇▃▁ |
CCAvg |
No |
0 |
1 |
1.73 |
1.57 |
0 |
0.6 |
1.4 |
2.30 |
8.8 |
▇▅▁▁▁ |
Mortgage |
Yes |
0 |
1 |
100.85 |
160.85 |
0 |
0.0 |
0.0 |
192.50 |
617.0 |
▇▁▁▁▁ |
Mortgage |
No |
0 |
1 |
51.79 |
92.04 |
0 |
0.0 |
0.0 |
98.00 |
635.0 |
▇▂▁▁▁ |
base accuracy
bank_tb %>%
count(Personal_Loan) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 x 3
## Personal_Loan n prop
## <fct> <int> <dbl>
## 1 Yes 480 0.096
## 2 No 4520 0.904
04. 훈련용, 테스트용 데이터 분할
set.seed(123)
bank_split <- bank_tb %>%
initial_split(starata=Personal_Loan)
bank_split # 테스트 1250
## <Analysis/Assess/Total>
## <3750/1250/5000>
train_data <- training(bank_split)
test_data <- testing(bank_split)
str(train_data)
## tibble [3,750 x 12] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:3750] 52 62 25 64 66 48 42 32 39 37 ...
## $ Experience : num [1:3750] 28 38 1 38 42 23 17 7 13 13 ...
## $ Income : num [1:3750] 23 52 98 79 95 63 91 143 59 38 ...
## $ Family : num [1:3750] 3 4 1 2 2 4 1 3 3 1 ...
## $ CCAvg : num [1:3750] 0.4 1.3 5.4 2.8 0 3.6 0.1 2.9 0.9 1.5 ...
## $ Education : Factor w/ 3 levels "Undergrad","Graduate",..: 1 2 1 1 3 3 2 3 3 2 ...
## $ Mortgage : num [1:3750] 0 0 0 179 0 0 199 0 199 116 ...
## $ Personal_Loan : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 1 2 2 ...
## $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ CD_Account : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 1 ...
## $ CreditCard : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...
str(test_data)
## tibble [1,250 x 12] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:1250] 35 37 50 35 29 44 51 39 37 40 ...
## $ Experience : num [1:1250] 9 13 24 10 5 18 25 14 12 16 ...
## $ Income : num [1:1250] 100 29 22 81 62 43 71 43 194 49 ...
## $ Family : num [1:1250] 1 4 1 3 1 2 1 3 4 1 ...
## $ CCAvg : num [1:1250] 2.7 0.4 0.3 0.6 1.2 0.7 1.4 0.7 0.2 1.8 ...
## $ Education : Factor w/ 3 levels "Undergrad","Graduate",..: 2 2 3 2 1 1 3 2 3 1 ...
## $ Mortgage : num [1:1250] 0 155 0 104 260 163 198 153 211 0 ...
## $ Personal_Loan : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 1 2 ...
## $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
## $ CD_Account : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ Online : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 2 1 ...
## $ CreditCard : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 2 ...
05. 모델만들기
args(decision_tree)
## function (mode = "unknown", engine = "rpart", cost_complexity = NULL,
## tree_depth = NULL, min_n = NULL)
## NULL
모델설정
tree_model <-
decision_tree()%>%
set_engine("rpart") %>%
set_mode("classification")
recipe
tree_recipe <-
train_data %>%
recipe(Personal_Loan ~ .) %>% # 모든변수 .
step_dummy(all_nominal(), -all_outcomes()) # personal_Loan 변수만 결과 변수 ,나머지 설명변수
summary(tree_recipe)
## # A tibble: 12 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 Age numeric predictor original
## 2 Experience numeric predictor original
## 3 Income numeric predictor original
## 4 Family numeric predictor original
## 5 CCAvg numeric predictor original
## 6 Education nominal predictor original
## 7 Mortgage numeric predictor original
## 8 Securities_Account nominal predictor original
## 9 CD_Account nominal predictor original
## 10 Online nominal predictor original
## 11 CreditCard nominal predictor original
## 12 Personal_Loan nominal outcome original
06 workflow 만들기
tree_workflow <-
workflow() %>%
add_model(tree_model) %>%
add_recipe(tree_recipe)
tree_workflow
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: decision_tree()
##
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
##
## * step_dummy()
##
## -- Model -----------------------------------------------------------------------
## Decision Tree Model Specification (classification)
##
## Computational engine: rpart
07 모델 훈련하기
tree_train_fit <-
tree_workflow %>%
fit(data = train_data)
tree_train_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: decision_tree()
##
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
##
## * step_dummy()
##
## -- Model -----------------------------------------------------------------------
## n= 3750
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3750 354 No (0.094400000 0.905600000)
## 2) Income>=100.5 917 327 No (0.356597601 0.643402399)
## 4) Family>=2.5 242 54 Yes (0.776859504 0.223140496)
## 8) Income>=116.5 162 0 Yes (1.000000000 0.000000000) *
## 9) Income< 116.5 80 26 No (0.325000000 0.675000000)
## 18) CCAvg>=2.7 27 8 Yes (0.703703704 0.296296296)
## 36) Age< 57.5 17 1 Yes (0.941176471 0.058823529) *
## 37) Age>=57.5 10 3 No (0.300000000 0.700000000) *
## 19) CCAvg< 2.7 53 7 No (0.132075472 0.867924528) *
## 5) Family< 2.5 675 139 No (0.205925926 0.794074074)
## 10) Education_Graduate>=0.5 78 12 Yes (0.846153846 0.153846154)
## 20) Income>=116.5 58 0 Yes (1.000000000 0.000000000) *
## 21) Income< 116.5 20 8 No (0.400000000 0.600000000)
## 42) Age>=43.5 8 2 Yes (0.750000000 0.250000000) *
## 43) Age< 43.5 12 2 No (0.166666667 0.833333333) *
## 11) Education_Graduate< 0.5 597 73 No (0.122278057 0.877721943)
## 22) Education_Professonal>=0.5 89 19 Yes (0.786516854 0.213483146)
## 44) Income>=114.5 62 0 Yes (1.000000000 0.000000000) *
## 45) Income< 114.5 27 8 No (0.296296296 0.703703704) *
## 23) Education_Professonal< 0.5 508 3 No (0.005905512 0.994094488) *
## 3) Income< 100.5 2833 27 No (0.009530533 0.990469467)
## 6) CCAvg>=2.95 168 27 No (0.160714286 0.839285714)
## 12) CD_Account_Yes>=0.5 9 0 Yes (1.000000000 0.000000000) *
## 13) CD_Account_Yes< 0.5 159 18 No (0.113207547 0.886792453) *
## 7) CCAvg< 2.95 2665 0 No (0.000000000 1.000000000) *
08 . 훈련모델 검증 해보기
예측 결과 표
predict(예측치, 실제값)
tree_train_pred <-
predict(tree_train_fit,
train_data,
type="prob") %>%
bind_cols(predict(tree_train_fit,
train_data)) %>%
bind_cols(train_data %>%
select(Personal_Loan)) #Personal_Loan은 실제값
tree_train_pred
## # A tibble: 3,750 x 4
## .pred_Yes .pred_No .pred_class Personal_Loan
## <dbl> <dbl> <fct> <fct>
## 1 0 1 No No
## 2 0 1 No No
## 3 0.113 0.887 No No
## 4 0 1 No No
## 5 0 1 No No
## 6 0.113 0.887 No No
## 7 0 1 No No
## 8 1 0 Yes Yes
## 9 0 1 No No
## 10 0 1 No No
## # ... with 3,740 more rows
정오분류표
tree_train_conf <-
tree_train_pred %>%
conf_mat(truth = Personal_Loan,
estimate = .pred_class)
autoplot(tree_train_conf, type = "heatmap")
autoplot(tree_train_conf, type = "mosaic")
summary(tree_train_conf)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.988
## 2 kap binary 0.928
## 3 sens binary 0.884
## 4 spec binary 0.999
## 5 ppv binary 0.991
## 6 npv binary 0.988
## 7 mcc binary 0.930
## 8 j_index binary 0.883
## 9 bal_accuracy binary 0.942
## 10 detection_prevalence binary 0.0843
## 11 precision binary 0.991
## 12 recall binary 0.884
## 13 f_meas binary 0.934
AUC
tree_train_pred %>%
roc_auc(truth = Personal_Loan,
.pred_Yes)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.996
ROC
train_auc <-
tree_train_pred %>%
roc_curve(truth = Personal_Loan,
estimate = .pred_Yes) %>%
mutate(model = "train_auc")
autoplot(train_auc)
lift
tree_train_pred %>%
lift_curve(truth = Personal_Loan,
estimate = .pred_Yes) %>%
autoplot
gain
tree_train_pred %>%
gain_curve(truth = Personal_Loan,
estimate = .pred_Yes) %>%
autoplot
중요변수 확인
tree_train_fit %>%
pull_workflow_fit() %>%
vip()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
09. 테스트 데이터 검증
tree_test_fit <-
tree_workflow %>%
last_fit(bank_split)
tree_test_fit$.predictions
## [[1]]
## # A tibble: 1,250 x 6
## .pred_Yes .pred_No .row .pred_class Personal_Loan .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0 1 4 No No Preprocessor1_Model1
## 2 0 1 6 No No Preprocessor1_Model1
## 3 0 1 8 No No Preprocessor1_Model1
## 4 0 1 9 No No Preprocessor1_Model1
## 5 0 1 23 No No Preprocessor1_Model1
## 6 0 1 24 No No Preprocessor1_Model1
## 7 0 1 38 No No Preprocessor1_Model1
## 8 0 1 47 No No Preprocessor1_Model1
## 9 1 0 48 Yes Yes Preprocessor1_Model1
## 10 0 1 50 No No Preprocessor1_Model1
## # ... with 1,240 more rows
tree_test_pred <-
tree_test_fit %>%
collect_predictions
tree_test_pred
## # A tibble: 1,250 x 7
## id .pred_Yes .pred_No .row .pred_class Personal_Loan .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0 1 4 No No Preproce~
## 2 train/test split 0 1 6 No No Preproce~
## 3 train/test split 0 1 8 No No Preproce~
## 4 train/test split 0 1 9 No No Preproce~
## 5 train/test split 0 1 23 No No Preproce~
## 6 train/test split 0 1 24 No No Preproce~
## 7 train/test split 0 1 38 No No Preproce~
## 8 train/test split 0 1 47 No No Preproce~
## 9 train/test split 1 0 48 Yes Yes Preproce~
## 10 train/test split 0 1 50 No No Preproce~
## # ... with 1,240 more rows
정오분류표
tree_test_conf <-
tree_test_pred %>%
conf_mat(truth = Personal_Loan,
estimate = .pred_class)
tree_test_conf
## Truth
## Prediction Yes No
## Yes 111 7
## No 15 1117
autoplot(tree_test_conf, type="heatmap")
autoplot(tree_test_conf, type="mosaic")
summary(tree_test_conf)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.982
## 2 kap binary 0.900
## 3 sens binary 0.881
## 4 spec binary 0.994
## 5 ppv binary 0.941
## 6 npv binary 0.987
## 7 mcc binary 0.901
## 8 j_index binary 0.875
## 9 bal_accuracy binary 0.937
## 10 detection_prevalence binary 0.0944
## 11 precision binary 0.941
## 12 recall binary 0.881
## 13 f_meas binary 0.910
AUC
tree_test_pred %>%
roc_auc(truth = Personal_Loan,
estimate = .pred_Yes)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.993
test_auc <-
tree_test_pred %>%
roc_curve(truth = Personal_Loan,
estimate = .pred_Yes) %>%
mutate(model = "test_auc")
autoplot(test_auc)
tree_test_pred %>%
lift_curve(truth = Personal_Loan,
testimate = .pred_Yes) %>%
autoplot()
tree_test_pred %>%
gain_curve(truth = Personal_Loan,
estimate = .pred_Yes) %>%
autoplot()
tree_test_fit %>%
pluck(".workflow",1) %>%
pull_workflow_fit() %>%
vip(num_features =20)
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
10. 비교
tree_train_conf
## Truth
## Prediction Yes No
## Yes 313 3
## No 41 3393
tree_test_conf
## Truth
## Prediction Yes No
## Yes 111 7
## No 15 1117
summary(tree_train_conf)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.988
## 2 kap binary 0.928
## 3 sens binary 0.884
## 4 spec binary 0.999
## 5 ppv binary 0.991
## 6 npv binary 0.988
## 7 mcc binary 0.930
## 8 j_index binary 0.883
## 9 bal_accuracy binary 0.942
## 10 detection_prevalence binary 0.0843
## 11 precision binary 0.991
## 12 recall binary 0.884
## 13 f_meas binary 0.934
summary(tree_test_conf)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.982
## 2 kap binary 0.900
## 3 sens binary 0.881
## 4 spec binary 0.994
## 5 ppv binary 0.941
## 6 npv binary 0.987
## 7 mcc binary 0.901
## 8 j_index binary 0.875
## 9 bal_accuracy binary 0.937
## 10 detection_prevalence binary 0.0944
## 11 precision binary 0.941
## 12 recall binary 0.881
## 13 f_meas binary 0.910
bind_rows(train_auc, test_auc) %>%
ggplot(mapping = aes(x=1-specificity,
y=sensitivity,
color = model)) +
geom_path(lwd = 1.5) +
geom_abline(lty = 3) +
coord_equal()
11. decision tree
#install.packages("rpart.plot")
library(rpart.plot)
rpart_fit <-
tree_train_fit %>%
pull_workflow_fit()
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
수정삽입 방법 :fn+insert
rpart.plot(x =rpart_fit$fit,
yesno = 1,
type = 2,
extra = 1,
split.font =1,
varlen = -10)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
prp(x = rpart_fit$fit,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10,
box.col = ifelse(rpart_fit$fit$frame$var == "<leaf>", "gray", "white"))
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call prp with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
Leave a comment