연습문제1
유니버셜 은행에서는 target marketing을 활용한 캠페인을 진행하려고 한다.
다음과 같은 변수들 중에서 개인대출 제안에 대한 수락(1)에 영향을 미치는 변수는 무엇인가?
대출유무: 0=대출없음, 1=대출있음
카드보유유무: 0=카드없음, 1=카드있음
로지스틱 회귀분석(Logistic Regression) :반응변수가 범주형인 경우( 0 or 1 ) 적용하는 회귀분석
1.기본 package 설정
1.1 library 로드
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
library(lm.beta)
2.데이터 불러오기
bank_tb <- read_csv('data1.csv',
col_names = TRUE,
locale=locale('ko', encoding='euc-kr'), # 한글
na=".") %>%
mutate_if(is.character, as.factor) %>%
mutate(카드보유유무 = factor(카드보유유무,
levels=c(0,1),
labels=c("No","Yes"))) %>%
mutate(대출유무 = factor(대출유무,
levels=c(0:1),
labels=c("No","Yes")))
## Rows: 200 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (6): 나이, 경력, 수입, 카드사용액, 카드보유유무, 대출유무
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bank_tb)
## tibble [200 x 6] (S3: tbl_df/tbl/data.frame)
## $ 나이 : num [1:200] 25 45 39 35 35 37 53 50 35 34 ...
## $ 경력 : num [1:200] 1 19 15 9 8 13 27 24 10 9 ...
## $ 수입 : num [1:200] 49 34 11 100 45 29 72 22 81 180 ...
## $ 카드사용액 : num [1:200] 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ 카드보유유무: Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
## $ 대출유무 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
bank_tb
## # A tibble: 200 x 6
## 나이 경력 수입 카드사용액 카드보유유무 대출유무
## <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 25 1 49 1.6 No No
## 2 45 19 34 1.5 No No
## 3 39 15 11 1 No No
## 4 35 9 100 2.7 No No
## 5 35 8 45 1 Yes No
## 6 37 13 29 0.4 No No
## 7 53 27 72 1.5 No No
## 8 50 24 22 0.3 Yes No
## 9 35 10 81 0.6 No No
## 10 34 9 180 8.9 No Yes
## # ... with 190 more rows
3.기본통계치 확인
skim(bank_tb)
Data summary
Name |
bank_tb |
Number of rows |
200 |
Number of columns |
6 |
_______________________ |
|
Column type frequency: |
|
factor |
2 |
numeric |
4 |
________________________ |
|
Group variables |
None |
Variable type: factor
카드보유유무 |
0 |
1 |
FALSE |
2 |
No: 147, Yes: 53 |
대출유무 |
0 |
1 |
FALSE |
2 |
No: 181, Yes: 19 |
Variable type: numeric
나이 |
0 |
1 |
45.02 |
11.32 |
24 |
36.0 |
46.0 |
54.00 |
67.0 |
▆▇▆▇▅ |
경력 |
0 |
1 |
19.89 |
11.26 |
-1 |
11.0 |
20.0 |
29.00 |
41.0 |
▆▅▇▇▅ |
수입 |
0 |
1 |
73.83 |
46.15 |
8 |
39.0 |
64.5 |
109.75 |
194.0 |
▇▆▃▂▂ |
카드사용액 |
0 |
1 |
2.06 |
1.85 |
0 |
0.7 |
1.5 |
2.70 |
8.9 |
▇▅▂▁▁ |
bank_tb %>%
get_summary_stats()
## # A tibble: 4 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 경력 200 -1 41 20 11 29 18 13.3 19.9 11.3 0.796
## 2 나이 200 24 67 46 36 54 18 13.3 45.0 11.3 0.8
## 3 수입 200 8 194 64.5 39 110. 70.8 44.5 73.8 46.1 3.26
## 4 카드사용액 200 0 8.9 1.5 0.7 2.7 2 1.33 2.06 1.85 0.131
## # ... with 1 more variable: ci <dbl>
4.그래프 그리기
pairs( ~ 나이+경력+수입+카드사용액, data=bank_tb)
5.로지스틱 회귀분석
glm(종속변수 ~ 독립변수1+독립변수2,family= , data=)
binomial :주어진 분위수에 대한 확률을 계산한다거나, 아니면 특정 확률에 해당하는 분위수를 계산
lr_fit <- glm(대출유무 ~ .,
family = binomial,
data=bank_tb)
summary(lr_fit)
##
## Call:
## glm(formula = 대출유무 ~ ., family = binomial, data = bank_tb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.12039 -0.21391 -0.09492 -0.04688 2.11174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.24439 8.57578 -0.728 0.467
## 나이 -0.04803 0.32527 -0.148 0.883
## 경력 0.01079 0.31403 0.034 0.973
## 수입 0.04781 0.01088 4.395 1.11e-05 ***
## 카드사용액 0.18384 0.14679 1.252 0.210
## 카드보유유무Yes -0.20457 0.68377 -0.299 0.765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 125.582 on 199 degrees of freedom
## Residual deviance: 64.392 on 194 degrees of freedom
## AIC: 76.392
##
## Number of Fisher Scoring iterations: 7
ANOVA 분석 : 세 집단 이상간의 평균차이 분석
위와 동일한 결과를 반환
anova(lr_fit, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: 대출유무
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 199 125.582
## 나이 1 2.664 198 122.918 0.1026
## 경력 1 0.098 197 122.820 0.7548
## 수입 1 56.831 196 65.989 4.75e-14 ***
## 카드사용액 1 1.507 195 64.483 0.2196
## 카드보유유무 1 0.090 194 64.392 0.7638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
회귀계수
tidy(lr_fit, conf.int = TRUE)
## # A tibble: 6 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -6.24 8.58 -0.728 0.467 -22.9 11.3
## 2 나이 -0.0480 0.325 -0.148 0.883 -0.733 0.560
## 3 경력 0.0108 0.314 0.0344 0.973 -0.582 0.672
## 4 수입 0.0478 0.0109 4.40 0.0000111 0.0289 0.0724
## 5 카드사용액 0.184 0.147 1.25 0.210 -0.104 0.479
## 6 카드보유유무Yes -0.205 0.684 -0.299 0.765 -1.60 1.11
설명력R2
glance(lr_fit)
## # A tibble: 1 x 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 126. 199 -32.2 76.4 96.2 64.4 194 200
Odds 계산 ,오즈비
tidy(lr_fit, conf.int = TRUE) %>%
mutate(odds = exp(coef(lr_fit)))
## # A tibble: 6 x 8
## term estimate std.error statistic p.value conf.low conf.high odds
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -6.24 8.58 -0.728 4.67e-1 -22.9 11.3 0.00194
## 2 나이 -0.0480 0.325 -0.148 8.83e-1 -0.733 0.560 0.953
## 3 경력 0.0108 0.314 0.0344 9.73e-1 -0.582 0.672 1.01
## 4 수입 0.0478 0.0109 4.40 1.11e-5 0.0289 0.0724 1.05
## 5 카드사용액 0.184 0.147 1.25 2.10e-1 -0.104 0.479 1.20
## 6 카드보유유무Y~ -0.205 0.684 -0.299 7.65e-1 -1.60 1.11 0.815
추가 자료
base accuracy
bank_tb %>%
count(대출유무) %>%
mutate(prop=n/sum(n))
## # A tibble: 2 x 3
## 대출유무 n prop
## <fct> <int> <dbl>
## 1 No 181 0.905
## 2 Yes 19 0.095
데이터 셋을 훈련용,테스트용 데이터 분할
set.seed(123)
bank_split <- bank_tb %>%
initial_split(starata=대출유뮤)
bank_split
## <Analysis/Assess/Total>
## <150/50/200>
train_data <- training(bank_split)
test_data <- testing(bank_split)
str(train_data)
## tibble [150 x 6] (S3: tbl_df/tbl/data.frame)
## $ 나이 : num [1:150] 32 53 59 53 27 40 58 32 55 48 ...
## $ 경력 : num [1:150] 6 29 32 29 1 16 33 7 31 24 ...
## $ 수입 : num [1:150] 79 24 40 144 112 49 61 132 9 21 ...
## $ 카드사용액 : num [1:150] 1.5 0.2 2.5 6.8 2.1 1.8 2.3 1.1 0.7 0.6 ...
## $ 카드보유유무: Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 1 1 1 2 ...
## $ 대출유무 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
str(test_data)
## tibble [50 x 6] (S3: tbl_df/tbl/data.frame)
## $ 나이 : num [1:50] 45 34 67 42 46 46 53 39 46 39 ...
## $ 경력 : num [1:50] 19 9 41 18 21 20 28 15 20 14 ...
## $ 수입 : num [1:50] 34 180 112 81 193 158 41 45 104 43 ...
## $ 카드사용액 : num [1:50] 1.5 8.9 2 2.4 8.1 2.4 0.6 0.7 5.7 0.7 ...
## $ 카드보유유무: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
## $ 대출유무 : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 1 1 1 ...
AIC
library(dplyr)
AIC(lr_fit)
## [1] 76.3925
glm (family= binomial(link=‘logit’)) 다른 방식
#base1 <- glm(대출유무 ~ ., family=binomial, data =train_data)
lrfit <- glm(formula = 대출유무 ~ .,family = binomial, data=train_data)
lrfit
##
## Call: glm(formula = 대출유무 ~ ., family = binomial, data = train_data)
##
## Coefficients:
## (Intercept) 나이 경력 수입
## 9.77609 -0.66954 0.63526 0.04341
## 카드사용액 카드보유유무Yes
## 0.08244 0.51712
##
## Degrees of Freedom: 149 Total (i.e. Null); 144 Residual
## Null Deviance: 88.43
## Residual Deviance: 48.17 AIC: 60.17
분산차이
anova(lrfit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: 대출유무
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 149 88.427
## 나이 1 1.003 148 87.424
## 경력 1 2.366 147 85.057
## 수입 1 36.209 146 48.849
## 카드사용액 1 0.265 145 48.584
## 카드보유유무 1 0.412 144 48.171
summary(lrfit)
##
## Call:
## glm(formula = 대출유무 ~ ., family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.61012 -0.22210 -0.09981 -0.05763 2.23284
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.77609 13.08984 0.747 0.455157
## 나이 -0.66954 0.52205 -1.283 0.199665
## 경력 0.63526 0.50830 1.250 0.211384
## 수입 0.04341 0.01134 3.829 0.000128 ***
## 카드사용액 0.08244 0.18588 0.444 0.657399
## 카드보유유무Yes 0.51712 0.80064 0.646 0.518354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 88.427 on 149 degrees of freedom
## Residual deviance: 48.171 on 144 degrees of freedom
## AIC: 60.171
##
## Number of Fisher Scoring iterations: 7
모델 성능 평가
response는 확률값을 출력
pred <- predict(lrfit, newdata = test_data, type = "response" )
head(pred, 10)
## 1 2 3 4 5 6
## 0.001249827 0.781904641 0.017805080 0.039329644 0.796390129 0.322231065
## 7 8 9 10
## 0.002254070 0.008196985 0.056483791 0.003998450
cat('예측값 :', head(ifelse(pred > 0.5,1,0), 10))
## 예측값 : 0 1 0 0 1 0 0 0 0 0
cat('실제값 :', head(test_data$대출유무, 10))
## 실제값 : 1 2 1 1 2 1 1 1 1 1
result_pred <- ifelse(pred > 0.5,1, 0)
혼동 행렬 (confusion Matrix)
t <- table(result_pred, test_data$대출유무)
t
##
## result_pred No Yes
## 0 44 3
## 1 0 3
분류 정확도
testac <- ((45+1)/ nrow(test_data))
testac2 <- ((t[1,1] + t[2,2]) / nrow(test_data))
testac
## [1] 0.92
sum(diag(t) / nrow(test_data))
## [1] 0.94
ROC curve
#install.packages("ROCR")
library(ROCR)
pr <- ROCR::prediction(pred, test_data$대출유무)
pr
## A prediction instance
## with 50 data points
prf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
AUC : 좋은 모델인지 아닌지 판단 , 정량화 작업
auc <- performance(pr, measure = 'auc')
auc
## A performance instance
## 'Area under the ROC curve'
auc <- auc@y.values
0.92 이므로 excellent 모델임
auc[[1]]
## [1] 0.9090909
Leave a comment