Share on

데이터마이닝의 이론과 실제 11주차

로지스틱 회귀분석

다중회귀분석

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/

library(rstatix)

## 
## 다음의 패키지를 부착합니다: 'rstatix'

## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test

## The following object is masked from 'package:dials':
## 
##     get_n

## The following object is masked from 'package:stats':
## 
##     filter

library(skimr)

데이터 가져오기

lr_tb<- read_csv("15.LR.csv",
                 col_names = TRUE,
                 na =".",
                 locale = locale("ko", encoding = "euc-kr")) %>%
    mutate_if(is.character,as.factor) %>%
    mutate(exp=factor(exp,
                      levels = c(0,1),
                      labels = c("NO","YES"))) %>%
    mutate(chun = factor(chun,
                         levels = c(0,1),
                         labels = c("NO","YES")))

## Rows: 100 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (5): phy, psy, cmmt, exp, chun
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(lr_tb)

## tibble [100 x 5] (S3: tbl_df/tbl/data.frame)
##  $ phy : num [1:100] 43 54 60 57 60 42 48 46 57 59 ...
##  $ psy : num [1:100] 18 27 30 17 30 27 21 18 30 24 ...
##  $ cmmt: num [1:100] 28 28 26 23 29 26 23 28 23 26 ...
##  $ exp : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 2 1 1 1 1 ...
##  $ chun: Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...

lr_tb

## # A tibble: 100 x 5
##      phy   psy  cmmt exp   chun 
##    <dbl> <dbl> <dbl> <fct> <fct>
##  1    43    18    28 YES   NO   
##  2    54    27    28 YES   NO   
##  3    60    30    26 NO    NO   
##  4    57    17    23 NO    NO   
##  5    60    30    29 YES   NO   
##  6    42    27    26 YES   YES  
##  7    48    21    23 NO    NO   
##  8    46    18    28 NO    NO   
##  9    57    30    23 NO    NO   
## 10    59    24    26 NO    NO   
## # ... with 90 more rows

기술 통계분석

skim(lr_tb)

Data summary
Name	lr_tb
Number of rows	100
Number of columns	5
_______________________
Column type frequency:
factor	2
numeric	3
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
exp	0	1	FALSE	2	YES: 52, NO: 48
chun	0	1	FALSE	2	NO: 72, YES: 28

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
phy	1	44.86	7.45	26	41	45	48	60	▁▂▇▃▃
psy	1	21.78	4.88	9	18	22	24	30	▁▅▇▇▆
cmmt	1	22.95	4.21	12	21	24	26	32	▂▃▇▇▁

lr_tb %>%
  get_summary_stats()

## # A tibble: 3 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 cmmt       100    12    32     24    21    26     5  4.45  23.0  4.21 0.421
## 2 phy        100    26    60     45    41    48     7  4.45  44.9  7.45 0.745
## 3 psy        100     9    30     22    18    24     6  4.45  21.8  4.88 0.488
## # ... with 1 more variable: ci <dbl>

그래프 그리기

pairs(~ phy+psy+cmmt, data=lr_tb)

회귀분석

lr_fit <- glm(chun ~ phy+psy+cmmt+exp,
              family = binomial,
              data = lr_tb)
lr_fit

## 
## Call:  glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial, 
##     data = lr_tb)
## 
## Coefficients:
## (Intercept)          phy          psy         cmmt       expYES  
##    14.80336     -0.05610     -0.06552     -0.60321      2.02826  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  95 Residual
## Null Deviance:       118.6 
## Residual Deviance: 55.45     AIC: 65.45

summary(lr_fit)

## 
## Call:
## glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial, 
##     data = lr_tb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0764  -0.4233  -0.1697   0.2132   2.5153  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 14.80336    3.60443   4.107 4.01e-05 ***
## phy         -0.05610    0.06693  -0.838   0.4019    
## psy         -0.06552    0.09872  -0.664   0.5069    
## cmmt        -0.60321    0.12860  -4.691 2.72e-06 ***
## expYES       2.02826    0.85186   2.381   0.0173 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 118.591  on 99  degrees of freedom
## Residual deviance:  55.449  on 95  degrees of freedom
## AIC: 65.449
## 
## Number of Fisher Scoring iterations: 6

tidy(lr_fit)

## # A tibble: 5 x 5
##   term        estimate std.error statistic    p.value
##   <chr>          <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)  14.8       3.60       4.11  0.0000401 
## 2 phy          -0.0561    0.0669    -0.838 0.402     
## 3 psy          -0.0655    0.0987    -0.664 0.507     
## 4 cmmt         -0.603     0.129     -4.69  0.00000272
## 5 expYES        2.03      0.852      2.38  0.0173

glance(lr_fit)

## # A tibble: 1 x 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1          119.      99  -27.7  65.4  78.5     55.4          95   100

odds

tidy(lr_fit) %>%
  mutate(odds = exp(coef(lr_fit)))

## # A tibble: 5 x 6
##   term        estimate std.error statistic    p.value        odds
##   <chr>          <dbl>     <dbl>     <dbl>      <dbl>       <dbl>
## 1 (Intercept)  14.8       3.60       4.11  0.0000401  2685443.   
## 2 phy          -0.0561    0.0669    -0.838 0.402            0.945
## 3 psy          -0.0655    0.0987    -0.664 0.507            0.937
## 4 cmmt         -0.603     0.129     -4.69  0.00000272       0.547
## 5 expYES        2.03      0.852      2.38  0.0173           7.60

동질성 검정:사전실험설계

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 불러오기

prech_tb <- read_csv("16.PRECH.csv",
                     col_names = TRUE,
                     na = ".",
                     locale = locale("ko", encoding = "euc-kr")) %>%
      mutate(group= factor(group,
                           levels = c(1,2),
                           labels = c("비타민","Placebo"))) %>%
      mutate(cold=factor(cold,
                         levels = c(1,2),
                         labels = c("noCold","Cold")))

## Rows: 100 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): group, cold
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(prech_tb)

## tibble [100 x 2] (S3: tbl_df/tbl/data.frame)
##  $ group: Factor w/ 2 levels "비타민","Placebo": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cold : Factor w/ 2 levels "noCold","Cold": 1 1 1 1 1 1 1 1 1 2 ...

prech_tb

## # A tibble: 100 x 2
##    group  cold  
##    <fct>  <fct> 
##  1 비타민 noCold
##  2 비타민 noCold
##  3 비타민 noCold
##  4 비타민 noCold
##  5 비타민 noCold
##  6 비타민 noCold
##  7 비타민 noCold
##  8 비타민 noCold
##  9 비타민 noCold
## 10 비타민 Cold  
## # ... with 90 more rows

기본 통계량

skim(prech_tb)

Data summary
Name	prech_tb
Number of rows	100
Number of columns	2
_______________________
Column type frequency:
factor	2
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
group	0	1	FALSE	2	비타민: 50, Pla: 50
cold	0	1	FALSE	2	Col: 55, noC: 45

그래프 그리기

mosaicplot(~ group+cold,
           data=prech_tb,
           color = TRUE,
           cex =1,.2)

카이스퀘어 분석

#install.packages("gmodels")
library(gmodels)

prech_fit <- CrossTable(prech_tb$group,
                        prech_tb$cold,
                        expected = TRUE,
                        chisq = TRUE,
                        asresid = FALSE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | prech_tb$cold 
## prech_tb$group |    noCold |      Cold | Row Total | 
## ---------------|-----------|-----------|-----------|
##         비타민 |        33 |        17 |        50 | 
##                |    22.500 |    27.500 |           | 
##                |     4.900 |     4.009 |           | 
##                |     0.660 |     0.340 |     0.500 | 
##                |     0.733 |     0.309 |           | 
##                |     0.330 |     0.170 |           | 
## ---------------|-----------|-----------|-----------|
##        Placebo |        12 |        38 |        50 | 
##                |    22.500 |    27.500 |           | 
##                |     4.900 |     4.009 |           | 
##                |     0.240 |     0.760 |     0.500 | 
##                |     0.267 |     0.691 |           | 
##                |     0.120 |     0.380 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        45 |        55 |       100 | 
##                |     0.450 |     0.550 |           | 
## ---------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  17.81818     d.f. =  1     p =  2.430496e-05 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  16.16162     d.f. =  1     p =  5.816079e-05 
## 
##

수정된 표준잔차

prech_fit$chisq$stdres

##          y
## x            noCold      Cold
##   비타민   4.221159 -4.221159
##   Placebo -4.221159  4.221159

상대위험율

prech_fit$prop.row[1,1]/prech_fit$prop.row[2,1]

## [1] 2.75

독립성 검정 :사후사례대조

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 불러오기

postch_tb <- read_csv("17.PostCH.csv",
                     col_names = TRUE,
                     na = ".",
                     locale = locale("ko", encoding = "euc-kr")) %>%
  mutate(cancer= factor(cancer,
                       levels = c(1,2),
                       labels = c("NO","YES"))) %>%
  mutate(smoking=factor(smoking,
                     levels = c(1:5),
                     labels = c("비흡연군","장기금연군","단기금연군","재흡연군","흡연군")))

## Rows: 10 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): cancer, smoking, count
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(postch_tb)

## tibble [10 x 3] (S3: tbl_df/tbl/data.frame)
##  $ cancer : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 2 2 2 2
##  $ smoking: Factor w/ 5 levels "비흡연군","장기금연군",..: 1 2 3 4 5 1 2 3 4 5
##  $ count  : num [1:10] 170867 51690 46598 29178 27784 ...

postch_tb

## # A tibble: 10 x 3
##    cancer smoking     count
##    <fct>  <fct>       <dbl>
##  1 NO     비흡연군   170867
##  2 NO     장기금연군  51690
##  3 NO     단기금연군  46598
##  4 NO     재흡연군    29178
##  5 NO     흡연군      27784
##  6 YES    비흡연군      723
##  7 YES    장기금연군    370
##  8 YES    단기금연군    497
##  9 YES    재흡연군      319
## 10 YES    흡연군        504

skim(postch_tb)

Data summary
Name	postch_tb
Number of rows	10
Number of columns	3
_______________________
Column type frequency:
factor	2
numeric	1
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
cancer	0	1	FALSE	2	NO: 5, YES: 5
smoking	0	1	FALSE	5	비흡연: 2, 장기금: 2, 단기금: 2, 재흡연: 2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
count	0	1	32853	52567.56	319	498.75	14253.5	42243	170867	▇▂▁▁▁

2차 데이터 처리

postch_table <- xtabs(count ~ cancer + smoking,
                      data = postch_tb)

postch_table

##       smoking
## cancer 비흡연군 장기금연군 단기금연군 재흡연군 흡연군
##    NO    170867      51690      46598    29178  27784
##    YES      723        370        497      319    504

그래프 그리기

mosaicplot(~ smoking + cancer,
           data = postch_table,
           color = TRUE,
           cex =1, .2)

카이스퀘어 분석

library(gmodels)

postch_fit <- CrossTable(postch_table,
                         expected = TRUE,
                         chisq = TRUE,
                         asresid = F)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  328530 
## 
##  
##              | smoking 
##       cancer |   비흡연군 | 장기금연군 | 단기금연군 |   재흡연군 |     흡연군 | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##           NO |    170867 |     51690 |     46598 |     29178 |     27784 |    326117 | 
##              | 170329.699 | 51677.628 | 46749.095 | 29280.349 | 28080.229 |           | 
##              |     1.695 |     0.003 |     0.488 |     0.358 |     3.125 |           | 
##              |     0.524 |     0.159 |     0.143 |     0.089 |     0.085 |     0.993 | 
##              |     0.996 |     0.993 |     0.989 |     0.989 |     0.982 |           | 
##              |     0.520 |     0.157 |     0.142 |     0.089 |     0.085 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##          YES |       723 |       370 |       497 |       319 |       504 |      2413 | 
##              |  1260.301 |   382.372 |   345.905 |   216.651 |   207.771 |           | 
##              |   229.066 |     0.400 |    66.000 |    48.351 |   422.349 |           | 
##              |     0.300 |     0.153 |     0.206 |     0.132 |     0.209 |     0.007 | 
##              |     0.004 |     0.007 |     0.011 |     0.011 |     0.018 |           | 
##              |     0.002 |     0.001 |     0.002 |     0.001 |     0.002 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total |    171590 |     52060 |     47095 |     29497 |     28288 |    328530 | 
##              |     0.522 |     0.158 |     0.143 |     0.090 |     0.086 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  771.8354     d.f. =  4     p =  9.67611e-166 
## 
## 
##

그룹간 차이검정

postch_fit$chisq$stdres

##       smoking
## cancer    비흡연군  장기금연군  단기금연군    재흡연군      흡연군
##    NO   21.9786982   0.6922651  -8.8098850  -7.3153225 -21.5768568
##    YES -21.9786982  -0.6922651   8.8098850   7.3153225  21.5768568

odds 오즈비

norm_odds <- postch_fit$t[2,1]/postch_fit$t[1,1]
norm_odds

## [1] 0.004231361

smok_odds <- postch_fit$t[2,5]/postch_fit$t[1,5]
smok_odds

## [1] 0.01813994

smok_odds/norm_odds

## [1] 4.287022

Share on

Copy URL

Kakao Facebook Twitter LinkedIn

SeongJae Yoo

Data Mining Theory And Reality11

Share on

11주차

로지스틱 회귀분석

다중회귀분석

데이터 가져오기

기술 통계분석

그래프 그리기

회귀분석

odds

동질성 검정:사전실험설계

데이터 불러오기

기본 통계량

그래프 그리기

카이스퀘어 분석

수정된 표준잔차

상대위험율

독립성 검정 :사후사례대조

데이터 불러오기

2차 데이터 처리

그래프 그리기

카이스퀘어 분석

그룹간 차이검정

odds 오즈비

Share on

Leave a comment

You may also enjoy

자동주식투자프로그램 세팅방법

Data Mining Theory And Reality Midtestexample04

Data Mining Theory And Reality Midtestexample03

Data Mining Theory And Reality Midtestexample02