Share on

데이터마이닝의 이론과 실제 14주차

초기 모델

#install.packages(“tidyverse”) #install.packages(“tidymodels”) #install.packages(“tidytext”) #install.packages(“skimr”) #install.packages(“vip”)

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Search for functions across packages at https://www.tidymodels.org/find/

library(skimr)
library(vip)

## 
## 다음의 패키지를 부착합니다: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

01. 데이터 불러오기

bank_tb <- read_csv("UniversalBank.csv",
                    col_names = TRUE,
                    locale = locale("ko", encoding = "euc-kr"),
                    na = ".") %>%
  mutate_if(is.character, as.factor)

## Rows: 5000 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (14): ID, Age, Experience, Income, ZIP Code, Family, CCAvg, Education, M...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(bank_tb)

## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...

bank_tb

## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: `Personal Loan` <dbl>,
## #   `Securities Account` <dbl>, `CD Account` <dbl>, Online <dbl>,
## #   CreditCard <dbl>

전처리 작업

bank_tb <- bank_tb %>%
  rename(c('Personal_Loan' = 'Personal Loan',
           'CD_Account' = 'CD Account',
           'Securities_Account' = 'Securities Account'))

str(bank_tb)

## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : num [1:5000] 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : num [1:5000] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities_Account: num [1:5000] 1 1 0 0 0 0 0 0 0 0 ...
##  $ CD_Account        : num [1:5000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : num [1:5000] 0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : num [1:5000] 0 0 0 0 1 0 0 1 0 0 ...

bank_tb

## # A tibble: 5,000 x 14
##       ID   Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
##    <dbl> <dbl>      <dbl>  <dbl>      <dbl>  <dbl> <dbl>     <dbl>    <dbl>
##  1     1    25          1     49      91107      4   1.6         1        0
##  2     2    45         19     34      90089      3   1.5         1        0
##  3     3    39         15     11      94720      1   1           1        0
##  4     4    35          9    100      94112      1   2.7         2        0
##  5     5    35          8     45      91330      4   1           2        0
##  6     6    37         13     29      92121      4   0.4         2      155
##  7     7    53         27     72      91711      2   1.5         2        0
##  8     8    50         24     22      93943      1   0.3         3        0
##  9     9    35         10     81      90089      3   0.6         2      104
## 10    10    34          9    180      93023      1   8.9         3        0
## # ... with 4,990 more rows, and 5 more variables: Personal_Loan <dbl>,
## #   Securities_Account <dbl>, CD_Account <dbl>, Online <dbl>, CreditCard <dbl>

factor 처리

bank_tb <- bank_tb %>%
  mutate(Personal_Loan = factor(Personal_Loan,
                                levels = c(1,0),
                                labels = c("Yes","No"))) %>%
  mutate(Securities_Account = factor(Securities_Account,
                                     levels = c(0,1),
                                     labels = c("No","Yes"))) %>%
  mutate(CD_Account = factor(CD_Account,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Online = factor(Online,
                         levels = c(0,1),
                         labels = c("No","Yes"))) %>%
  mutate(CreditCard = factor(CreditCard,
                             levels = c(0,1),
                             labels = c("No","Yes"))) %>%
  mutate(Education = factor(Education,
                             levels = c(1:3),
                             labels = c("Undergrad","Graduate",
                                        "Professonal")))

str(bank_tb)

## tibble [5,000 x 14] (S3: tbl_df/tbl/data.frame)
##  $ ID                : num [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP Code          : num [1:5000] 91107 90089 94720 94112 91330 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...

필요없는 변수 제거

bank_tb <- bank_tb %>%
  select(-c(ID, 'ZIP Code'))
str(bank_tb)

## tibble [5,000 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:5000] 25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : num [1:5000] 1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : num [1:5000] 49 34 11 100 45 29 72 22 81 180 ...
##  $ Family            : num [1:5000] 4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num [1:5000] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : num [1:5000] 0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...

03. 데이터 탐색

skim(bank_tb)

Data summary
Name	bank_tb
Number of rows	5000
Number of columns	12
_______________________
Column type frequency:
factor	6
numeric	6
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Education	1	FALSE	3	Und: 2096, Pro: 1501, Gra: 1403
Personal_Loan	1	FALSE	2	No: 4520, Yes: 480
Securities_Account	1	FALSE	2	No: 4478, Yes: 522
CD_Account	1	FALSE	2	No: 4698, Yes: 302
Online	1	FALSE	2	Yes: 2984, No: 2016
CreditCard	1	FALSE	2	No: 3530, Yes: 1470

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	45.34	11.46	23	35.0	45.0	55.0	67	▅▇▇▇▆
Experience	1	20.10	11.47	-3	10.0	20.0	30.0	43	▅▇▇▇▅
Income	1	73.77	46.03	8	39.0	64.0	98.0	224	▇▇▃▂▁
Family	1	2.40	1.15	1	1.0	2.0	3.0	4	▇▇▁▆▆
CCAvg	1	1.94	1.75	0	0.7	1.5	2.5	10	▇▃▁▁▁
Mortgage	1	56.50	101.71	0	0.0	0.0	101.0	635	▇▂▁▁▁

bank_tb %>%
  group_by(Personal_Loan) %>%
  skim()

Data summary
Name	Piped data
Number of rows	5000
Number of columns	12
_______________________
Column type frequency:
factor	5
numeric	6
________________________
Group variables	Personal_Loan

Variable type: factor

skim_variable	Personal_Loan	complete_rate	ordered	n_unique	top_counts
Education	Yes	1	FALSE	3	Pro: 205, Gra: 182, Und: 93
Education	No	1	FALSE	3	Und: 2003, Pro: 1296, Gra: 1221
Securities_Account	Yes	1	FALSE	2	No: 420, Yes: 60
Securities_Account	No	1	FALSE	2	No: 4058, Yes: 462
CD_Account	Yes	1	FALSE	2	No: 340, Yes: 140
CD_Account	No	1	FALSE	2	No: 4358, Yes: 162
Online	Yes	1	FALSE	2	Yes: 291, No: 189
Online	No	1	FALSE	2	Yes: 2693, No: 1827
CreditCard	Yes	1	FALSE	2	No: 337, Yes: 143
CreditCard	No	1	FALSE	2	No: 3193, Yes: 1327

Variable type: numeric

skim_variable	Personal_Loan	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	Yes	1	45.07	11.59	26	35.0	45.0	55.00	65.0	▇▇▇▇▇
Age	No	1	45.37	11.45	23	35.0	45.0	55.00	67.0	▅▇▇▇▅
Experience	Yes	1	19.84	11.58	0	9.0	20.0	30.00	41.0	▇▇▇▇▆
Experience	No	1	20.13	11.46	-3	10.0	20.0	30.00	43.0	▅▇▇▇▅
Income	Yes	1	144.75	31.58	60	122.0	142.5	172.00	203.0	▁▅▇▆▆
Income	No	1	66.24	40.58	8	35.0	59.0	84.00	224.0	▇▇▂▁▁
Family	Yes	1	2.61	1.12	1	2.0	3.0	4.00	4.0	▆▆▁▇▇
Family	No	1	2.37	1.15	1	1.0	2.0	3.00	4.0	▇▇▁▅▆
CCAvg	Yes	1	3.91	2.10	0	2.6	3.8	5.35	10.0	▅▇▇▃▁
CCAvg	No	1	1.73	1.57	0	0.6	1.4	2.30	8.8	▇▅▁▁▁
Mortgage	Yes	1	100.85	160.85	0	0.0	0.0	192.50	617.0	▇▁▁▁▁
Mortgage	No	1	51.79	92.04	0	0.0	0.0	98.00	635.0	▇▂▁▁▁

base accuracy

bank_tb %>%
  count(Personal_Loan) %>%
  mutate(prop = n / sum(n))

## # A tibble: 2 x 3
##   Personal_Loan     n  prop
##   <fct>         <int> <dbl>
## 1 Yes             480 0.096
## 2 No             4520 0.904

04. 훈련용, 테스트용 데이터 분할

set.seed(123)

bank_split <- bank_tb %>%
  initial_split(starata=Personal_Loan)
bank_split # 테스트 1250

## <Analysis/Assess/Total>
## <3750/1250/5000>

train_data <- training(bank_split)
test_data <- testing(bank_split)

str(train_data)

## tibble [3,750 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:3750] 52 62 25 64 66 48 42 32 39 37 ...
##  $ Experience        : num [1:3750] 28 38 1 38 42 23 17 7 13 13 ...
##  $ Income            : num [1:3750] 23 52 98 79 95 63 91 143 59 38 ...
##  $ Family            : num [1:3750] 3 4 1 2 2 4 1 3 3 1 ...
##  $ CCAvg             : num [1:3750] 0.4 1.3 5.4 2.8 0 3.6 0.1 2.9 0.9 1.5 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 1 2 1 1 3 3 2 3 3 2 ...
##  $ Mortgage          : num [1:3750] 0 0 0 179 0 0 199 0 199 116 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...

str(test_data)

## tibble [1,250 x 12] (S3: tbl_df/tbl/data.frame)
##  $ Age               : num [1:1250] 35 37 50 35 29 44 51 39 37 40 ...
##  $ Experience        : num [1:1250] 9 13 24 10 5 18 25 14 12 16 ...
##  $ Income            : num [1:1250] 100 29 22 81 62 43 71 43 194 49 ...
##  $ Family            : num [1:1250] 1 4 1 3 1 2 1 3 4 1 ...
##  $ CCAvg             : num [1:1250] 2.7 0.4 0.3 0.6 1.2 0.7 1.4 0.7 0.2 1.8 ...
##  $ Education         : Factor w/ 3 levels "Undergrad","Graduate",..: 2 2 3 2 1 1 3 2 3 1 ...
##  $ Mortgage          : num [1:1250] 0 155 0 104 260 163 198 153 211 0 ...
##  $ Personal_Loan     : Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 1 2 ...
##  $ Securities_Account: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ CD_Account        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ Online            : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 2 1 ...
##  $ CreditCard        : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 2 ...

05. 모델만들기

args(decision_tree)

## function (mode = "unknown", engine = "rpart", cost_complexity = NULL, 
##     tree_depth = NULL, min_n = NULL) 
## NULL

모델설정

tree_model <- 
  decision_tree()%>%
  set_engine("rpart") %>%
  set_mode("classification")

recipe

tree_recipe <-
  train_data %>% 
  recipe(Personal_Loan ~ .) %>% # 모든변수 . 
  step_dummy(all_nominal(), -all_outcomes()) # personal_Loan 변수만 결과 변수 ,나머지 설명변수  
summary(tree_recipe)

## # A tibble: 12 x 4
##    variable           type    role      source  
##    <chr>              <chr>   <chr>     <chr>   
##  1 Age                numeric predictor original
##  2 Experience         numeric predictor original
##  3 Income             numeric predictor original
##  4 Family             numeric predictor original
##  5 CCAvg              numeric predictor original
##  6 Education          nominal predictor original
##  7 Mortgage           numeric predictor original
##  8 Securities_Account nominal predictor original
##  9 CD_Account         nominal predictor original
## 10 Online             nominal predictor original
## 11 CreditCard         nominal predictor original
## 12 Personal_Loan      nominal outcome   original

06 workflow 만들기

tree_workflow <- 
  workflow() %>%
  add_model(tree_model) %>%
  add_recipe(tree_recipe)
tree_workflow

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## Decision Tree Model Specification (classification)
## 
## Computational engine: rpart

07 모델 훈련하기

tree_train_fit <-
  tree_workflow %>%
  fit(data = train_data)

tree_train_fit

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
## 
## * step_dummy()
## 
## -- Model -----------------------------------------------------------------------
## n= 3750 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3750 354 No (0.094400000 0.905600000)  
##    2) Income>=100.5 917 327 No (0.356597601 0.643402399)  
##      4) Family>=2.5 242  54 Yes (0.776859504 0.223140496)  
##        8) Income>=116.5 162   0 Yes (1.000000000 0.000000000) *
##        9) Income< 116.5 80  26 No (0.325000000 0.675000000)  
##         18) CCAvg>=2.7 27   8 Yes (0.703703704 0.296296296)  
##           36) Age< 57.5 17   1 Yes (0.941176471 0.058823529) *
##           37) Age>=57.5 10   3 No (0.300000000 0.700000000) *
##         19) CCAvg< 2.7 53   7 No (0.132075472 0.867924528) *
##      5) Family< 2.5 675 139 No (0.205925926 0.794074074)  
##       10) Education_Graduate>=0.5 78  12 Yes (0.846153846 0.153846154)  
##         20) Income>=116.5 58   0 Yes (1.000000000 0.000000000) *
##         21) Income< 116.5 20   8 No (0.400000000 0.600000000)  
##           42) Age>=43.5 8   2 Yes (0.750000000 0.250000000) *
##           43) Age< 43.5 12   2 No (0.166666667 0.833333333) *
##       11) Education_Graduate< 0.5 597  73 No (0.122278057 0.877721943)  
##         22) Education_Professonal>=0.5 89  19 Yes (0.786516854 0.213483146)  
##           44) Income>=114.5 62   0 Yes (1.000000000 0.000000000) *
##           45) Income< 114.5 27   8 No (0.296296296 0.703703704) *
##         23) Education_Professonal< 0.5 508   3 No (0.005905512 0.994094488) *
##    3) Income< 100.5 2833  27 No (0.009530533 0.990469467)  
##      6) CCAvg>=2.95 168  27 No (0.160714286 0.839285714)  
##       12) CD_Account_Yes>=0.5 9   0 Yes (1.000000000 0.000000000) *
##       13) CD_Account_Yes< 0.5 159  18 No (0.113207547 0.886792453) *
##      7) CCAvg< 2.95 2665   0 No (0.000000000 1.000000000) *

08 . 훈련모델 검증 해보기

예측 결과 표

predict(예측치, 실제값)

tree_train_pred <- 
  predict(tree_train_fit,
          train_data,
          type="prob") %>%
  bind_cols(predict(tree_train_fit,
                    train_data)) %>%
  bind_cols(train_data %>%
              select(Personal_Loan))    #Personal_Loan은 실제값

tree_train_pred

## # A tibble: 3,750 x 4
##    .pred_Yes .pred_No .pred_class Personal_Loan
##        <dbl>    <dbl> <fct>       <fct>        
##  1     0        1     No          No           
##  2     0        1     No          No           
##  3     0.113    0.887 No          No           
##  4     0        1     No          No           
##  5     0        1     No          No           
##  6     0.113    0.887 No          No           
##  7     0        1     No          No           
##  8     1        0     Yes         Yes          
##  9     0        1     No          No           
## 10     0        1     No          No           
## # ... with 3,740 more rows

정오분류표

tree_train_conf <- 
  tree_train_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)

autoplot(tree_train_conf, type = "heatmap")

autoplot(tree_train_conf, type = "mosaic")

summary(tree_train_conf)

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934

AUC

tree_train_pred %>%
  roc_auc(truth = Personal_Loan,
          .pred_Yes)

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.996

ROC

train_auc <- 
  tree_train_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "train_auc")

autoplot(train_auc)

lift

tree_train_pred %>% 
  lift_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

gain

tree_train_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot

중요변수 확인

tree_train_fit %>%
  pull_workflow_fit() %>% 
  vip()

## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

09. 테스트 데이터 검증

tree_test_fit <-
  tree_workflow %>%
  last_fit(bank_split)

tree_test_fit$.predictions

## [[1]]
## # A tibble: 1,250 x 6
##    .pred_Yes .pred_No  .row .pred_class Personal_Loan .config             
##        <dbl>    <dbl> <int> <fct>       <fct>         <chr>               
##  1         0        1     4 No          No            Preprocessor1_Model1
##  2         0        1     6 No          No            Preprocessor1_Model1
##  3         0        1     8 No          No            Preprocessor1_Model1
##  4         0        1     9 No          No            Preprocessor1_Model1
##  5         0        1    23 No          No            Preprocessor1_Model1
##  6         0        1    24 No          No            Preprocessor1_Model1
##  7         0        1    38 No          No            Preprocessor1_Model1
##  8         0        1    47 No          No            Preprocessor1_Model1
##  9         1        0    48 Yes         Yes           Preprocessor1_Model1
## 10         0        1    50 No          No            Preprocessor1_Model1
## # ... with 1,240 more rows

tree_test_pred <-
tree_test_fit %>% 
  collect_predictions

tree_test_pred

## # A tibble: 1,250 x 7
##    id               .pred_Yes .pred_No  .row .pred_class Personal_Loan .config  
##    <chr>                <dbl>    <dbl> <int> <fct>       <fct>         <chr>    
##  1 train/test split         0        1     4 No          No            Preproce~
##  2 train/test split         0        1     6 No          No            Preproce~
##  3 train/test split         0        1     8 No          No            Preproce~
##  4 train/test split         0        1     9 No          No            Preproce~
##  5 train/test split         0        1    23 No          No            Preproce~
##  6 train/test split         0        1    24 No          No            Preproce~
##  7 train/test split         0        1    38 No          No            Preproce~
##  8 train/test split         0        1    47 No          No            Preproce~
##  9 train/test split         1        0    48 Yes         Yes           Preproce~
## 10 train/test split         0        1    50 No          No            Preproce~
## # ... with 1,240 more rows

정오분류표

tree_test_conf <-
  tree_test_pred %>%
  conf_mat(truth = Personal_Loan,
           estimate = .pred_class)

tree_test_conf

##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117

autoplot(tree_test_conf, type="heatmap")

autoplot(tree_test_conf, type="mosaic")

summary(tree_test_conf)

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910

AUC

tree_test_pred %>%
  roc_auc(truth = Personal_Loan,
          estimate = .pred_Yes)

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.993

test_auc <-
  tree_test_pred %>%
  roc_curve(truth = Personal_Loan,
            estimate = .pred_Yes) %>%
  mutate(model = "test_auc")

autoplot(test_auc)

tree_test_pred %>%
  lift_curve(truth = Personal_Loan,
             testimate = .pred_Yes) %>%
  autoplot()

tree_test_pred %>%
  gain_curve(truth = Personal_Loan,
             estimate = .pred_Yes) %>%
  autoplot()

tree_test_fit %>%
  pluck(".workflow",1) %>%
  pull_workflow_fit() %>%
  vip(num_features =20)

## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

10. 비교

tree_train_conf

##           Truth
## Prediction  Yes   No
##        Yes  313    3
##        No    41 3393

tree_test_conf

##           Truth
## Prediction  Yes   No
##        Yes  111    7
##        No    15 1117

summary(tree_train_conf)

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.988 
##  2 kap                  binary        0.928 
##  3 sens                 binary        0.884 
##  4 spec                 binary        0.999 
##  5 ppv                  binary        0.991 
##  6 npv                  binary        0.988 
##  7 mcc                  binary        0.930 
##  8 j_index              binary        0.883 
##  9 bal_accuracy         binary        0.942 
## 10 detection_prevalence binary        0.0843
## 11 precision            binary        0.991 
## 12 recall               binary        0.884 
## 13 f_meas               binary        0.934

summary(tree_test_conf)

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.982 
##  2 kap                  binary        0.900 
##  3 sens                 binary        0.881 
##  4 spec                 binary        0.994 
##  5 ppv                  binary        0.941 
##  6 npv                  binary        0.987 
##  7 mcc                  binary        0.901 
##  8 j_index              binary        0.875 
##  9 bal_accuracy         binary        0.937 
## 10 detection_prevalence binary        0.0944
## 11 precision            binary        0.941 
## 12 recall               binary        0.881 
## 13 f_meas               binary        0.910

bind_rows(train_auc, test_auc) %>% 
  ggplot(mapping = aes(x=1-specificity,
                       y=sensitivity,
                       color = model)) +
  geom_path(lwd = 1.5) +
  geom_abline(lty = 3) +
  coord_equal()

11. decision tree

#install.packages("rpart.plot")
library(rpart.plot)

rpart_fit <-
  tree_train_fit %>%
  pull_workflow_fit()

## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.

수정삽입 방법 :fn+insert

rpart.plot(x =rpart_fit$fit,
          yesno = 1,
          type = 2,
          extra = 1,
          split.font =1,
          varlen = -10)

## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

prp(x = rpart_fit$fit,
    type = 1,
    extra = 1,
    under = TRUE,
    split.font = 1,
    varlen = -10,
    box.col = ifelse(rpart_fit$fit$frame$var == "<leaf>", "gray", "white"))

## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call prp with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

Share on

Copy URL

Kakao Facebook Twitter LinkedIn

SeongJae Yoo

Data Mining Theory And Reality14

Share on

데이터마이닝의 이론과 실제 14주차

초기 모델

01. 데이터 불러오기

전처리 작업

factor 처리

필요없는 변수 제거

03. 데이터 탐색

base accuracy

04. 훈련용, 테스트용 데이터 분할

05. 모델만들기

모델설정

recipe

06 workflow 만들기

07 모델 훈련하기

08 . 훈련모델 검증 해보기

예측 결과 표

predict(예측치, 실제값)

정오분류표

AUC

ROC

lift

gain

중요변수 확인

09. 테스트 데이터 검증

정오분류표

AUC

10. 비교

11. decision tree

수정삽입 방법 :fn+insert

Share on

Leave a comment

You may also enjoy

자동주식투자프로그램 세팅방법

Data Mining Theory And Reality Midtestexample04

Data Mining Theory And Reality Midtestexample03

Data Mining Theory And Reality Midtestexample02