로지스틱 회귀분석
다중회귀분석
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
데이터 가져오기
lr_tb<- read_csv("15.LR.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr")) %>%
mutate_if(is.character,as.factor) %>%
mutate(exp=factor(exp,
levels = c(0,1),
labels = c("NO","YES"))) %>%
mutate(chun = factor(chun,
levels = c(0,1),
labels = c("NO","YES")))
## Rows: 100 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (5): phy, psy, cmmt, exp, chun
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(lr_tb)
## tibble [100 x 5] (S3: tbl_df/tbl/data.frame)
## $ phy : num [1:100] 43 54 60 57 60 42 48 46 57 59 ...
## $ psy : num [1:100] 18 27 30 17 30 27 21 18 30 24 ...
## $ cmmt: num [1:100] 28 28 26 23 29 26 23 28 23 26 ...
## $ exp : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 2 1 1 1 1 ...
## $ chun: Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
lr_tb
## # A tibble: 100 x 5
## phy psy cmmt exp chun
## <dbl> <dbl> <dbl> <fct> <fct>
## 1 43 18 28 YES NO
## 2 54 27 28 YES NO
## 3 60 30 26 NO NO
## 4 57 17 23 NO NO
## 5 60 30 29 YES NO
## 6 42 27 26 YES YES
## 7 48 21 23 NO NO
## 8 46 18 28 NO NO
## 9 57 30 23 NO NO
## 10 59 24 26 NO NO
## # ... with 90 more rows
기술 통계분석
skim(lr_tb)
Data summary
Name |
lr_tb |
Number of rows |
100 |
Number of columns |
5 |
_______________________ |
|
Column type frequency: |
|
factor |
2 |
numeric |
3 |
________________________ |
|
Group variables |
None |
Variable type: factor
exp |
0 |
1 |
FALSE |
2 |
YES: 52, NO: 48 |
chun |
0 |
1 |
FALSE |
2 |
NO: 72, YES: 28 |
Variable type: numeric
phy |
0 |
1 |
44.86 |
7.45 |
26 |
41 |
45 |
48 |
60 |
▁▂▇▃▃ |
psy |
0 |
1 |
21.78 |
4.88 |
9 |
18 |
22 |
24 |
30 |
▁▅▇▇▆ |
cmmt |
0 |
1 |
22.95 |
4.21 |
12 |
21 |
24 |
26 |
32 |
▂▃▇▇▁ |
lr_tb %>%
get_summary_stats()
## # A tibble: 3 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 cmmt 100 12 32 24 21 26 5 4.45 23.0 4.21 0.421
## 2 phy 100 26 60 45 41 48 7 4.45 44.9 7.45 0.745
## 3 psy 100 9 30 22 18 24 6 4.45 21.8 4.88 0.488
## # ... with 1 more variable: ci <dbl>
그래프 그리기
pairs(~ phy+psy+cmmt, data=lr_tb)
회귀분석
lr_fit <- glm(chun ~ phy+psy+cmmt+exp,
family = binomial,
data = lr_tb)
lr_fit
##
## Call: glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial,
## data = lr_tb)
##
## Coefficients:
## (Intercept) phy psy cmmt expYES
## 14.80336 -0.05610 -0.06552 -0.60321 2.02826
##
## Degrees of Freedom: 99 Total (i.e. Null); 95 Residual
## Null Deviance: 118.6
## Residual Deviance: 55.45 AIC: 65.45
summary(lr_fit)
##
## Call:
## glm(formula = chun ~ phy + psy + cmmt + exp, family = binomial,
## data = lr_tb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0764 -0.4233 -0.1697 0.2132 2.5153
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.80336 3.60443 4.107 4.01e-05 ***
## phy -0.05610 0.06693 -0.838 0.4019
## psy -0.06552 0.09872 -0.664 0.5069
## cmmt -0.60321 0.12860 -4.691 2.72e-06 ***
## expYES 2.02826 0.85186 2.381 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 118.591 on 99 degrees of freedom
## Residual deviance: 55.449 on 95 degrees of freedom
## AIC: 65.449
##
## Number of Fisher Scoring iterations: 6
tidy(lr_fit)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 14.8 3.60 4.11 0.0000401
## 2 phy -0.0561 0.0669 -0.838 0.402
## 3 psy -0.0655 0.0987 -0.664 0.507
## 4 cmmt -0.603 0.129 -4.69 0.00000272
## 5 expYES 2.03 0.852 2.38 0.0173
glance(lr_fit)
## # A tibble: 1 x 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 119. 99 -27.7 65.4 78.5 55.4 95 100
odds
tidy(lr_fit) %>%
mutate(odds = exp(coef(lr_fit)))
## # A tibble: 5 x 6
## term estimate std.error statistic p.value odds
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 14.8 3.60 4.11 0.0000401 2685443.
## 2 phy -0.0561 0.0669 -0.838 0.402 0.945
## 3 psy -0.0655 0.0987 -0.664 0.507 0.937
## 4 cmmt -0.603 0.129 -4.69 0.00000272 0.547
## 5 expYES 2.03 0.852 2.38 0.0173 7.60
동질성 검정:사전실험설계
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
데이터 불러오기
prech_tb <- read_csv("16.PRECH.csv",
col_names = TRUE,
na = ".",
locale = locale("ko", encoding = "euc-kr")) %>%
mutate(group= factor(group,
levels = c(1,2),
labels = c("비타민","Placebo"))) %>%
mutate(cold=factor(cold,
levels = c(1,2),
labels = c("noCold","Cold")))
## Rows: 100 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): group, cold
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(prech_tb)
## tibble [100 x 2] (S3: tbl_df/tbl/data.frame)
## $ group: Factor w/ 2 levels "비타민","Placebo": 1 1 1 1 1 1 1 1 1 1 ...
## $ cold : Factor w/ 2 levels "noCold","Cold": 1 1 1 1 1 1 1 1 1 2 ...
prech_tb
## # A tibble: 100 x 2
## group cold
## <fct> <fct>
## 1 비타민 noCold
## 2 비타민 noCold
## 3 비타민 noCold
## 4 비타민 noCold
## 5 비타민 noCold
## 6 비타민 noCold
## 7 비타민 noCold
## 8 비타민 noCold
## 9 비타민 noCold
## 10 비타민 Cold
## # ... with 90 more rows
기본 통계량
skim(prech_tb)
Data summary
Name |
prech_tb |
Number of rows |
100 |
Number of columns |
2 |
_______________________ |
|
Column type frequency: |
|
factor |
2 |
________________________ |
|
Group variables |
None |
Variable type: factor
group |
0 |
1 |
FALSE |
2 |
비타민: 50, Pla: 50 |
cold |
0 |
1 |
FALSE |
2 |
Col: 55, noC: 45 |
그래프 그리기
mosaicplot(~ group+cold,
data=prech_tb,
color = TRUE,
cex =1,.2)
카이스퀘어 분석
#install.packages("gmodels")
library(gmodels)
prech_fit <- CrossTable(prech_tb$group,
prech_tb$cold,
expected = TRUE,
chisq = TRUE,
asresid = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | prech_tb$cold
## prech_tb$group | noCold | Cold | Row Total |
## ---------------|-----------|-----------|-----------|
## 비타민 | 33 | 17 | 50 |
## | 22.500 | 27.500 | |
## | 4.900 | 4.009 | |
## | 0.660 | 0.340 | 0.500 |
## | 0.733 | 0.309 | |
## | 0.330 | 0.170 | |
## ---------------|-----------|-----------|-----------|
## Placebo | 12 | 38 | 50 |
## | 22.500 | 27.500 | |
## | 4.900 | 4.009 | |
## | 0.240 | 0.760 | 0.500 |
## | 0.267 | 0.691 | |
## | 0.120 | 0.380 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 45 | 55 | 100 |
## | 0.450 | 0.550 | |
## ---------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 17.81818 d.f. = 1 p = 2.430496e-05
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 16.16162 d.f. = 1 p = 5.816079e-05
##
##
수정된 표준잔차
prech_fit$chisq$stdres
## y
## x noCold Cold
## 비타민 4.221159 -4.221159
## Placebo -4.221159 4.221159
상대위험율
prech_fit$prop.row[1,1]/prech_fit$prop.row[2,1]
## [1] 2.75
독립성 검정 :사후사례대조
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
데이터 불러오기
postch_tb <- read_csv("17.PostCH.csv",
col_names = TRUE,
na = ".",
locale = locale("ko", encoding = "euc-kr")) %>%
mutate(cancer= factor(cancer,
levels = c(1,2),
labels = c("NO","YES"))) %>%
mutate(smoking=factor(smoking,
levels = c(1:5),
labels = c("비흡연군","장기금연군","단기금연군","재흡연군","흡연군")))
## Rows: 10 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): cancer, smoking, count
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(postch_tb)
## tibble [10 x 3] (S3: tbl_df/tbl/data.frame)
## $ cancer : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 2 2 2 2
## $ smoking: Factor w/ 5 levels "비흡연군","장기금연군",..: 1 2 3 4 5 1 2 3 4 5
## $ count : num [1:10] 170867 51690 46598 29178 27784 ...
postch_tb
## # A tibble: 10 x 3
## cancer smoking count
## <fct> <fct> <dbl>
## 1 NO 비흡연군 170867
## 2 NO 장기금연군 51690
## 3 NO 단기금연군 46598
## 4 NO 재흡연군 29178
## 5 NO 흡연군 27784
## 6 YES 비흡연군 723
## 7 YES 장기금연군 370
## 8 YES 단기금연군 497
## 9 YES 재흡연군 319
## 10 YES 흡연군 504
skim(postch_tb)
Data summary
Name |
postch_tb |
Number of rows |
10 |
Number of columns |
3 |
_______________________ |
|
Column type frequency: |
|
factor |
2 |
numeric |
1 |
________________________ |
|
Group variables |
None |
Variable type: factor
cancer |
0 |
1 |
FALSE |
2 |
NO: 5, YES: 5 |
smoking |
0 |
1 |
FALSE |
5 |
비흡연: 2, 장기금: 2, 단기금: 2, 재흡연: 2 |
Variable type: numeric
count |
0 |
1 |
32853 |
52567.56 |
319 |
498.75 |
14253.5 |
42243 |
170867 |
▇▂▁▁▁ |
2차 데이터 처리
postch_table <- xtabs(count ~ cancer + smoking,
data = postch_tb)
postch_table
## smoking
## cancer 비흡연군 장기금연군 단기금연군 재흡연군 흡연군
## NO 170867 51690 46598 29178 27784
## YES 723 370 497 319 504
그래프 그리기
mosaicplot(~ smoking + cancer,
data = postch_table,
color = TRUE,
cex =1, .2)
카이스퀘어 분석
library(gmodels)
postch_fit <- CrossTable(postch_table,
expected = TRUE,
chisq = TRUE,
asresid = F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 328530
##
##
## | smoking
## cancer | 비흡연군 | 장기금연군 | 단기금연군 | 재흡연군 | 흡연군 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## NO | 170867 | 51690 | 46598 | 29178 | 27784 | 326117 |
## | 170329.699 | 51677.628 | 46749.095 | 29280.349 | 28080.229 | |
## | 1.695 | 0.003 | 0.488 | 0.358 | 3.125 | |
## | 0.524 | 0.159 | 0.143 | 0.089 | 0.085 | 0.993 |
## | 0.996 | 0.993 | 0.989 | 0.989 | 0.982 | |
## | 0.520 | 0.157 | 0.142 | 0.089 | 0.085 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## YES | 723 | 370 | 497 | 319 | 504 | 2413 |
## | 1260.301 | 382.372 | 345.905 | 216.651 | 207.771 | |
## | 229.066 | 0.400 | 66.000 | 48.351 | 422.349 | |
## | 0.300 | 0.153 | 0.206 | 0.132 | 0.209 | 0.007 |
## | 0.004 | 0.007 | 0.011 | 0.011 | 0.018 | |
## | 0.002 | 0.001 | 0.002 | 0.001 | 0.002 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 171590 | 52060 | 47095 | 29497 | 28288 | 328530 |
## | 0.522 | 0.158 | 0.143 | 0.090 | 0.086 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 771.8354 d.f. = 4 p = 9.67611e-166
##
##
##
그룹간 차이검정
postch_fit$chisq$stdres
## smoking
## cancer 비흡연군 장기금연군 단기금연군 재흡연군 흡연군
## NO 21.9786982 0.6922651 -8.8098850 -7.3153225 -21.5768568
## YES -21.9786982 -0.6922651 8.8098850 7.3153225 21.5768568
odds 오즈비
norm_odds <- postch_fit$t[2,1]/postch_fit$t[1,1]
norm_odds
## [1] 0.004231361
smok_odds <- postch_fit$t[2,5]/postch_fit$t[1,5]
smok_odds
## [1] 0.01813994
smok_odds/norm_odds
## [1] 4.287022
Leave a comment