데이터마이닝의 이론과 실제 11주차

로지스틱 회귀분석

다중회귀분석

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
## 
##     get_n
## The following object is masked from 'package:stats':
## 
##     filter
library(skimr)

데이터 가져오기

lr_tb<- read_csv("15.LR.csv",
                 col_names = TRUE,
                 na =".",
                 locale = locale("ko", encoding = "euc-kr")) %>%
    mutate_if(is.character,as.factor) %>%
    mutate(exp=factor(exp,
                      levels = c(0,1),
                      labels = c("NO","YES"))) %>%
    mutate(chun = factor(chun,
                         levels = c(0,1),
                         labels = c("NO","YES")))
## Rows: 100 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (5): phy, psy, cmmt, exp, chun
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(lr_tb)
## tibble [100 x 5] (S3: tbl_df/tbl/data.frame)
##  $ phy : num [1:100] 43 54 60 57 60 42 48 46 57 59 ...
##  $ psy : num [1:100] 18 27 30 17 30 27 21 18 30 24 ...
##  $ cmmt: num [1:100] 28 28 26 23 29 26 23 28 23 26 ...
##  $ exp : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 2 1 1 1 1 ...
##  $ chun: Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
lr_tb
## # A tibble: 100 x 5
##      phy   psy  cmmt exp   chun 
##    <dbl> <dbl> <dbl> <fct> <fct>
##  1    43    18    28 YES   NO   
##  2    54    27    28 YES   NO   
##  3    60    30    26 NO    NO   
##  4    57    17    23 NO    NO   
##  5    60    30    29 YES   NO   
##  6    42    27    26 YES   YES  
##  7    48    21    23 NO    NO   
##  8    46    18    28 NO    NO   
##  9    57    30    23 NO    NO   
## 10    59    24    26 NO    NO   
## # ... with 90 more rows

기술 통계분석

skim(lr_tb)
Data summary
Name lr_tb
Number of rows 100
Number of columns 5
_______________________
Column type frequency:
factor 2
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
exp 0 1 FALSE 2 YES: 52, NO: 48
chun 0 1 FALSE 2 NO: 72, YES: 28

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
phy 0 1 44.86 7.45 26 41 45 48 60 ▁▂▇▃▃
psy 0 1 21.78 4.88 9 18 22 24 30 ▁▅▇▇▆
cmmt 0 1 22.95 4.21 12 21 24 26 32 ▂▃▇▇▁
lr_tb %>%
  get_summary_stats()
## # A tibble: 3 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 cmmt       100    12    32     24    21    26     5  4.45  23.0  4.21 0.421
## 2 phy        100    26    60     45    41    48     7  4.45  44.9  7.45 0.745
## 3 psy        100     9    30     22    18    24     6  4.45  21.8  4.88 0.488
## # ... with 1 more variable: ci <dbl>

그래프 그리기

pairs(~ phy+psy+cmmt, data=lr_tb)

회귀분석

lr_fit <- glm(chun ~ phy+psy+cmmt+exp,
              family = binomial,
              data = lr_tb)
lr_fit
## 
## Call:  glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial, 
##     data = lr_tb)
## 
## Coefficients:
## (Intercept)          phy          psy         cmmt       expYES  
##    14.80336     -0.05610     -0.06552     -0.60321      2.02826  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  95 Residual
## Null Deviance:       118.6 
## Residual Deviance: 55.45     AIC: 65.45
summary(lr_fit)
## 
## Call:
## glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial, 
##     data = lr_tb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0764  -0.4233  -0.1697   0.2132   2.5153  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 14.80336    3.60443   4.107 4.01e-05 ***
## phy         -0.05610    0.06693  -0.838   0.4019    
## psy         -0.06552    0.09872  -0.664   0.5069    
## cmmt        -0.60321    0.12860  -4.691 2.72e-06 ***
## expYES       2.02826    0.85186   2.381   0.0173 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 118.591  on 99  degrees of freedom
## Residual deviance:  55.449  on 95  degrees of freedom
## AIC: 65.449
## 
## Number of Fisher Scoring iterations: 6
tidy(lr_fit)
## # A tibble: 5 x 5
##   term        estimate std.error statistic    p.value
##   <chr>          <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)  14.8       3.60       4.11  0.0000401 
## 2 phy          -0.0561    0.0669    -0.838 0.402     
## 3 psy          -0.0655    0.0987    -0.664 0.507     
## 4 cmmt         -0.603     0.129     -4.69  0.00000272
## 5 expYES        2.03      0.852      2.38  0.0173
glance(lr_fit)
## # A tibble: 1 x 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1          119.      99  -27.7  65.4  78.5     55.4          95   100

odds

tidy(lr_fit) %>%
  mutate(odds = exp(coef(lr_fit)))
## # A tibble: 5 x 6
##   term        estimate std.error statistic    p.value        odds
##   <chr>          <dbl>     <dbl>     <dbl>      <dbl>       <dbl>
## 1 (Intercept)  14.8       3.60       4.11  0.0000401  2685443.   
## 2 phy          -0.0561    0.0669    -0.838 0.402            0.945
## 3 psy          -0.0655    0.0987    -0.664 0.507            0.937
## 4 cmmt         -0.603     0.129     -4.69  0.00000272       0.547
## 5 expYES        2.03      0.852      2.38  0.0173           7.60

동질성 검정:사전실험설계

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 불러오기

prech_tb <- read_csv("16.PRECH.csv",
                     col_names = TRUE,
                     na = ".",
                     locale = locale("ko", encoding = "euc-kr")) %>%
      mutate(group= factor(group,
                           levels = c(1,2),
                           labels = c("비타민","Placebo"))) %>%
      mutate(cold=factor(cold,
                         levels = c(1,2),
                         labels = c("noCold","Cold")))
## Rows: 100 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): group, cold
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(prech_tb)
## tibble [100 x 2] (S3: tbl_df/tbl/data.frame)
##  $ group: Factor w/ 2 levels "비타민","Placebo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cold : Factor w/ 2 levels "noCold","Cold": 1 1 1 1 1 1 1 1 1 2 ...
prech_tb
## # A tibble: 100 x 2
##    group  cold  
##    <fct>  <fct> 
##  1 비타민 noCold
##  2 비타민 noCold
##  3 비타민 noCold
##  4 비타민 noCold
##  5 비타민 noCold
##  6 비타민 noCold
##  7 비타민 noCold
##  8 비타민 noCold
##  9 비타민 noCold
## 10 비타민 Cold  
## # ... with 90 more rows

기본 통계량

skim(prech_tb)
Data summary
Name prech_tb
Number of rows 100
Number of columns 2
_______________________
Column type frequency:
factor 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
group 0 1 FALSE 2 비타민: 50, Pla: 50
cold 0 1 FALSE 2 Col: 55, noC: 45

그래프 그리기

mosaicplot(~ group+cold,
           data=prech_tb,
           color = TRUE,
           cex =1,.2)

카이스퀘어 분석

#install.packages("gmodels")
library(gmodels)
prech_fit <- CrossTable(prech_tb$group,
                        prech_tb$cold,
                        expected = TRUE,
                        chisq = TRUE,
                        asresid = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | prech_tb$cold 
## prech_tb$group |    noCold |      Cold | Row Total | 
## ---------------|-----------|-----------|-----------|
##         비타민 |        33 |        17 |        50 | 
##                |    22.500 |    27.500 |           | 
##                |     4.900 |     4.009 |           | 
##                |     0.660 |     0.340 |     0.500 | 
##                |     0.733 |     0.309 |           | 
##                |     0.330 |     0.170 |           | 
## ---------------|-----------|-----------|-----------|
##        Placebo |        12 |        38 |        50 | 
##                |    22.500 |    27.500 |           | 
##                |     4.900 |     4.009 |           | 
##                |     0.240 |     0.760 |     0.500 | 
##                |     0.267 |     0.691 |           | 
##                |     0.120 |     0.380 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        45 |        55 |       100 | 
##                |     0.450 |     0.550 |           | 
## ---------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  17.81818     d.f. =  1     p =  2.430496e-05 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  16.16162     d.f. =  1     p =  5.816079e-05 
## 
## 

수정된 표준잔차

prech_fit$chisq$stdres
##          y
## x            noCold      Cold
##   비타민   4.221159 -4.221159
##   Placebo -4.221159  4.221159

상대위험율

prech_fit$prop.row[1,1]/prech_fit$prop.row[2,1]
## [1] 2.75

독립성 검정 :사후사례대조

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 불러오기

postch_tb <- read_csv("17.PostCH.csv",
                     col_names = TRUE,
                     na = ".",
                     locale = locale("ko", encoding = "euc-kr")) %>%
  mutate(cancer= factor(cancer,
                       levels = c(1,2),
                       labels = c("NO","YES"))) %>%
  mutate(smoking=factor(smoking,
                     levels = c(1:5),
                     labels = c("비흡연군","장기금연군","단기금연군","재흡연군","흡연군")))
## Rows: 10 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): cancer, smoking, count
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(postch_tb)
## tibble [10 x 3] (S3: tbl_df/tbl/data.frame)
##  $ cancer : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 2 2 2 2
##  $ smoking: Factor w/ 5 levels "비흡연군","장기금연군",..: 1 2 3 4 5 1 2 3 4 5
##  $ count  : num [1:10] 170867 51690 46598 29178 27784 ...
postch_tb
## # A tibble: 10 x 3
##    cancer smoking     count
##    <fct>  <fct>       <dbl>
##  1 NO     비흡연군   170867
##  2 NO     장기금연군  51690
##  3 NO     단기금연군  46598
##  4 NO     재흡연군    29178
##  5 NO     흡연군      27784
##  6 YES    비흡연군      723
##  7 YES    장기금연군    370
##  8 YES    단기금연군    497
##  9 YES    재흡연군      319
## 10 YES    흡연군        504
skim(postch_tb)
Data summary
Name postch_tb
Number of rows 10
Number of columns 3
_______________________
Column type frequency:
factor 2
numeric 1
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
cancer 0 1 FALSE 2 NO: 5, YES: 5
smoking 0 1 FALSE 5 비흡연: 2, 장기금: 2, 단기금: 2, 재흡연: 2

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
count 0 1 32853 52567.56 319 498.75 14253.5 42243 170867 ▇▂▁▁▁

2차 데이터 처리

postch_table <- xtabs(count ~ cancer + smoking,
                      data = postch_tb)
postch_table
##       smoking
## cancer 비흡연군 장기금연군 단기금연군 재흡연군 흡연군
##    NO    170867      51690      46598    29178  27784
##    YES      723        370        497      319    504

그래프 그리기

mosaicplot(~ smoking + cancer,
           data = postch_table,
           color = TRUE,
           cex =1, .2)

카이스퀘어 분석

library(gmodels)

postch_fit <- CrossTable(postch_table,
                         expected = TRUE,
                         chisq = TRUE,
                         asresid = F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  328530 
## 
##  
##              | smoking 
##       cancer |   비흡연군 | 장기금연군 | 단기금연군 |   재흡연군 |     흡연군 | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##           NO |    170867 |     51690 |     46598 |     29178 |     27784 |    326117 | 
##              | 170329.699 | 51677.628 | 46749.095 | 29280.349 | 28080.229 |           | 
##              |     1.695 |     0.003 |     0.488 |     0.358 |     3.125 |           | 
##              |     0.524 |     0.159 |     0.143 |     0.089 |     0.085 |     0.993 | 
##              |     0.996 |     0.993 |     0.989 |     0.989 |     0.982 |           | 
##              |     0.520 |     0.157 |     0.142 |     0.089 |     0.085 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##          YES |       723 |       370 |       497 |       319 |       504 |      2413 | 
##              |  1260.301 |   382.372 |   345.905 |   216.651 |   207.771 |           | 
##              |   229.066 |     0.400 |    66.000 |    48.351 |   422.349 |           | 
##              |     0.300 |     0.153 |     0.206 |     0.132 |     0.209 |     0.007 | 
##              |     0.004 |     0.007 |     0.011 |     0.011 |     0.018 |           | 
##              |     0.002 |     0.001 |     0.002 |     0.001 |     0.002 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total |    171590 |     52060 |     47095 |     29497 |     28288 |    328530 | 
##              |     0.522 |     0.158 |     0.143 |     0.090 |     0.086 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  771.8354     d.f. =  4     p =  9.67611e-166 
## 
## 
## 

그룹간 차이검정

postch_fit$chisq$stdres
##       smoking
## cancer    비흡연군  장기금연군  단기금연군    재흡연군      흡연군
##    NO   21.9786982   0.6922651  -8.8098850  -7.3153225 -21.5768568
##    YES -21.9786982  -0.6922651   8.8098850   7.3153225  21.5768568

odds 오즈비

norm_odds <- postch_fit$t[2,1]/postch_fit$t[1,1]
norm_odds
## [1] 0.004231361
smok_odds <- postch_fit$t[2,5]/postch_fit$t[1,5]
smok_odds
## [1] 0.01813994
smok_odds/norm_odds
## [1] 4.287022

Leave a comment