Share on

데이터마이닝의 이론과 실제 9주차

반복측정 분산분석 (Repeated Measures ANOVA)

시점이 여러개 일때 사용

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Use suppressPackageStartupMessages() to eliminate package startup messages

library(rstatix)

## 
## 다음의 패키지를 부착합니다: 'rstatix'

## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test

## The following object is masked from 'package:dials':
## 
##     get_n

## The following object is masked from 'package:stats':
## 
##     filter

library(skimr)

#데이터 가져오기

rma_tb <- read_csv("09.RMA.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr")) %>%
  mutate_if(is.character, as.factor)%>%
  mutate(시점 = factor(시점,
                         levels = c(1:3),
                         labels = c("사전","3개월후","6개월후")))

## Rows: 135 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): id, 시점, 점수
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(rma_tb)

## tibble [135 x 3] (S3: tbl_df/tbl/data.frame)
##  $ id  : num [1:135] 1 1 1 2 2 2 3 3 3 4 ...
##  $ 시점: Factor w/ 3 levels "사전","3개월후",..: 1 2 3 1 2 3 1 2 3 1 ...
##  $ 점수: num [1:135] 63 64 68 60 61 67 61 67 70 57 ...

rma_tb

## # A tibble: 135 x 3
##       id 시점     점수
##    <dbl> <fct>   <dbl>
##  1     1 사전       63
##  2     1 3개월후    64
##  3     1 6개월후    68
##  4     2 사전       60
##  5     2 3개월후    61
##  6     2 6개월후    67
##  7     3 사전       61
##  8     3 3개월후    67
##  9     3 6개월후    70
## 10     4 사전       57
## # ... with 125 more rows

기술 통계

rma_tb %>% 
  group_by(시점) %>%
  get_summary_stats(점수)

## # A tibble: 3 x 14
##   시점    variable     n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct>   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 사전    점수        45    53    67     60    58    63     5  2.96  60.2  3.22
## 2 3개월후 점수        45    55    70     61    59    64     5  4.45  61.6  3.60
## 3 6개월후 점수        45    65    73     68    67    69     2  1.48  68.1  1.83
## # ... with 2 more variables: se <dbl>, ci <dbl>

그래프 그리기

rma_tb %>% 
  ggplot(mapping = aes(x=시점,
                       y=점수)) +
  geom_boxplot()

rma_tb %>% 
    ggplot(mapping = aes(x=점수))+
    geom_histogram(bins = 10) +
    facet_wrap(~시점)

이상치 제거

rma_tb %>% 
  group_by(시점) %>%
  identify_outliers(점수)

## # A tibble: 1 x 5
##   시점       id  점수 is.outlier is.extreme
##   <fct>   <dbl> <dbl> <lgl>      <lgl>     
## 1 6개월후    37    73 TRUE       FALSE

정규분포 검정

rma_tb %>%
  group_by(시점) %>%
  shapiro_test(점수)

## # A tibble: 3 x 4
##   시점    variable statistic      p
##   <fct>   <chr>        <dbl>  <dbl>
## 1 사전    점수         0.972 0.339 
## 2 3개월후 점수         0.962 0.149 
## 3 6개월후 점수         0.954 0.0737

구형성 (p값이 0.05보다 크면 구형성 성립,Mauchly’s Test for Sphericity)

rma_results <- rma_tb %>%
  anova_test(dv = 점수,
             wid = id,
             within = 시점)

rma_results

## ANOVA Table (type III tests)
## 
## $ANOVA
##   Effect DFn DFd       F        p p<.05   ges
## 1   시점   2  88 248.614 6.24e-37     * 0.579
## 
## $`Mauchly's Test for Sphericity`
##   Effect     W     p p<.05
## 1   시점 0.963 0.446      
## 
## $`Sphericity Corrections`
##   Effect   GGe      DF[GG]    p[GG] p[GG]<.05   HFe      DF[HF]    p[HF]
## 1   시점 0.964 1.93, 84.88 1.04e-35         * 1.008 2.02, 88.69 6.24e-37
##   p[HF]<.05
## 1         *

다중비교

rma_tb %>%
  pairwise_t_test(점수~시점,
                    paired = TRUE,
                    p.adj = "bonferroni")

## # A tibble: 3 x 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 점수  사전   3개월~    45    45     -3.44    44 1   e- 3 4   e- 3 **          
## 2 점수  사전   6개월~    45    45    -23.0     44 3.48e-26 1.04e-25 ****        
## 3 점수  3개월~ 6개월~    45    45    -17.0     44 6.18e-21 1.85e-20 ****

일변량분석

rma_result_ow <- aov(점수~시점 + Error(id/시점),
                       data= rma_tb)

summary(rma_result_ow)

## 
## Error: id
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  1  13.24   13.24               
## 
## Error: id:시점
##      Df Sum Sq Mean Sq
## 시점  2   1136     568
## 
## Error: Within
##            Df Sum Sq Mean Sq F value  Pr(>F)    
## 시점        2  500.4   250.2   28.43 5.9e-11 ***
## Residuals 129 1135.4     8.8                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

이원 분산분석 (Two-way ANOVA)

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

#데이터 가져오기

twa_tb <- read_csv("10.TWA.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr")) %>%
  mutate_if(is.character, as.factor)%>%
  mutate(방법 = factor(방법,
                       levels = c(1:2),
                       labels = c("오븐","기름"))) %>%
  mutate(온도 = factor(온도,
                       levels = c(1:2),
                       labels = c("200도","300도")))

## Rows: 60 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): 방법, 온도, 맛점수
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(twa_tb)

## tibble [60 x 3] (S3: tbl_df/tbl/data.frame)
##  $ 방법  : Factor w/ 2 levels "오븐","기름": 1 1 1 1 1 1 1 1 1 1 ...
##  $ 온도  : Factor w/ 2 levels "200도","300도": 2 2 2 2 2 2 2 2 2 2 ...
##  $ 맛점수: num [1:60] 95 93 94 98 97 94 96 92 91 94 ...

twa_tb

## # A tibble: 60 x 3
##    방법  온도  맛점수
##    <fct> <fct>  <dbl>
##  1 오븐  300도     95
##  2 오븐  300도     93
##  3 오븐  300도     94
##  4 오븐  300도     98
##  5 오븐  300도     97
##  6 오븐  300도     94
##  7 오븐  300도     96
##  8 오븐  300도     92
##  9 오븐  300도     91
## 10 오븐  300도     94
## # ... with 50 more rows

기술통계

skim(twa_tb)

Data summary
Name	twa_tb
Number of rows	60
Number of columns	3
_______________________
Column type frequency:
factor	2
numeric	1
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
방법	0	1	FALSE	2	오븐: 30, 기름: 30
온도	0	1	FALSE	2	300: 31, 200: 29

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
맛점수	0	1	90.53	4.2	84	87	91	94	98	▇▇▅▇▅

twa_tb %>%
  group_by(방법, 온도) %>%
  get_summary_stats(맛점수)

## # A tibble: 4 x 15
##   방법  온도  variable     n   min   max median    q1    q3   iqr   mad  mean
##   <fct> <fct> <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 오븐  200도 맛점수      14    84    91   87.5  86.2  89    2.75  2.22  87.5
## 2 오븐  300도 맛점수      16    91    98   94    92.8  95.2  2.5   2.22  94.2
## 3 기름  200도 맛점수      15    90    98   94    92.5  95.5  3     2.96  94  
## 4 기름  300도 맛점수      15    84    88   86    85    87    2     1.48  86  
## # ... with 3 more variables: sd <dbl>, se <dbl>, ci <dbl>

그래프 그리기

twa_tb %>% 
  ggplot(mapping = aes(x=방법,
                       y=맛점수,
                       color = 방법)) +
  geom_boxplot() +
  facet_wrap(~온도)

twa_tb %>% 
  ggplot(mapping = aes(x=맛점수)) + 
  geom_histogram(bins = 10) +
  facet_wrap(~ 방법*온도)

twa_tb %>% 
  group_by(온도, 방법) %>%
  summarise(맛점수= mean(맛점수)) %>%
  ggplot(mapping = aes(x=온도, 
                       y=맛점수,
                       color= 방법)) +
  geom_line(mapping = aes(group=방법)) +
  geom_point()

## `summarise()` has grouped output by '온도'. You can override using the
## `.groups` argument.

이상치 제거

twa_tb %>%
  group_by(방법, 온도) %>%
  identify_outliers(맛점수)

## [1] 방법       온도       맛점수     is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>

정규분포

twa_tb %>% 
  group_by(방법,온도) %>%
  shapiro_test(맛점수)

## # A tibble: 4 x 5
##   방법  온도  variable statistic     p
##   <fct> <fct> <chr>        <dbl> <dbl>
## 1 오븐  200도 맛점수       0.970 0.876
## 2 오븐  300도 맛점수       0.961 0.683
## 3 기름  200도 맛점수       0.967 0.817
## 4 기름  300도 맛점수       0.918 0.181

등분산검정

twa_tb %>% 
  levene_test(맛점수 ~ 방법*온도)

## # A tibble: 1 x 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     3    56      1.46 0.235

검증

twa_results <- aov(맛점수 ~ 방법*온도,
                    data = twa_tb)
tidy(twa_results)

## # A tibble: 4 x 6
##   term         df  sumsq meansq statistic   p.value
##   <chr>     <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
## 1 방법          1  17.1   17.1       4.55  3.73e- 2
## 2 온도          1   6.77   6.77      1.81  1.84e- 1
## 3 방법:온도     1 807.   807.      215.    7.70e-21
## 4 Residuals    56 210.     3.75     NA    NA

twa_results <- aov(맛점수 ~ 방법+온도,
                      data = twa_tb)

사후검정

twa_tb %>%
  group_by(온도) %>%
  pairwise_t_test(맛점수 ~ 방법,
                  p.adj = "bonferroni")

## # A tibble: 2 x 10
##   온도  .y.    group1 group2    n1    n2        p p.signif    p.adj p.adj.signif
## * <fct> <chr>  <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 200도 맛점수 오븐   기름      14    15 1.2 e- 8 ****     1.2 e- 8 ****        
## 2 300도 맛점수 오븐   기름      16    15 4.81e-14 ****     4.81e-14 ****

상호작용이 없을경우

twa_results <- aov(맛점수 ~ 방법 + 온도,
                      data = twa_tb)
tidy(twa_results)

## # A tibble: 3 x 6
##   term         df   sumsq meansq statistic p.value
##   <chr>     <dbl>   <dbl>  <dbl>     <dbl>   <dbl>
## 1 방법          1   17.1   17.1      0.956   0.332
## 2 온도          1    6.77   6.77     0.379   0.540
## 3 Residuals    57 1017.    17.8     NA      NA

twa_tb %>%
  pairwise_t_test(맛점수 ~ 방법,
                     p.adj = "bonferroni")

## # A tibble: 1 x 9
##   .y.    group1 group2    n1    n2     p p.signif p.adj p.adj.signif
## * <chr>  <chr>  <chr>  <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 맛점수 오븐   기름      30    30  0.33 ns        0.33 ns

이원 반복측정 분산분석 (Two way repeated measures ANOVA)

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

#데이터 가져오기

twrma_tb <- read_csv("11.TWRMA.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr")) %>%
  mutate_if(is.character, as.factor)%>%
  mutate(실험그룹 = factor(실험그룹,
                       levels = c(1:2),
                       labels = c("대조군","실험군")))

## Rows: 127 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (4): id, 실험그룹, 사전, 사후
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(twrma_tb)

## tibble [127 x 4] (S3: tbl_df/tbl/data.frame)
##  $ id      : num [1:127] 1 2 3 4 5 6 7 8 9 10 ...
##  $ 실험그룹: Factor w/ 2 levels "대조군","실험군": 1 1 1 1 1 1 1 1 1 1 ...
##  $ 사전    : num [1:127] 12.3 57.2 12.3 16.6 18.6 ...
##  $ 사후    : num [1:127] 23.1 40.5 23.1 45 40.8 ...

twrma_tb

## # A tibble: 127 x 4
##       id 실험그룹  사전  사후
##    <dbl> <fct>    <dbl> <dbl>
##  1     1 대조군    12.3  23.1
##  2     2 대조군    57.2  40.4
##  3     3 대조군    12.3  23.1
##  4     4 대조군    16.6  45.0
##  5     5 대조군    18.6  40.8
##  6     6 대조군    20.1  35.1
##  7     7 대조군    20.1  35.1
##  8     8 대조군    22.8  42.1
##  9     9 대조군    22.8  42.1
## 10    10 대조군    24.4  31.1
## # ... with 117 more rows

long 형 변환

twrma_tb <- twrma_tb %>%
  pivot_longer(c("사전","사후"),
               names_to = "시점",
               values_to = "통증")

twrma_tb

## # A tibble: 254 x 4
##       id 실험그룹 시점   통증
##    <dbl> <fct>    <chr> <dbl>
##  1     1 대조군   사전   12.3
##  2     1 대조군   사후   23.1
##  3     2 대조군   사전   57.2
##  4     2 대조군   사후   40.4
##  5     3 대조군   사전   12.3
##  6     3 대조군   사후   23.1
##  7     4 대조군   사전   16.6
##  8     4 대조군   사후   45.0
##  9     5 대조군   사전   18.6
## 10     5 대조군   사후   40.8
## # ... with 244 more rows

기술통계

twrma_tb %>%
  group_by(실험그룹, 시점) %>%
  get_summary_stats(통증)

## # A tibble: 4 x 15
##   실험그룹 시점  variable     n   min   max median    q1    q3   iqr   mad  mean
##   <fct>    <chr> <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 대조군   사전  통증        58 12.3   73.0   41.4  33.5  49.9  16.4  12.4  41.9
## 2 대조군   사후  통증        58 21.5   57.5   42.1  36.2  49.3  13.0  10.3  41.5
## 3 실험군   사전  통증        69  5.08  72.6   42.4  31.2  51.2  19.9  16.6  41.6
## 4 실험군   사후  통증        69 10.8   54.6   32.6  25.4  38.8  13.4  10.7  32.8
## # ... with 3 more variables: sd <dbl>, se <dbl>, ci <dbl>

그래프 그리기

twrma_tb %>% 
  ggplot(mapping = aes(x=실험그룹,
                       y=통증,
                       color = 실험그룹)) +
  geom_boxplot() +
  facet_wrap(~시점)

twrma_tb %>% 
  ggplot(mapping = aes(x=통증)) + 
  geom_histogram(bins = 10) +
  facet_wrap(~ 실험그룹*시점)

twrma_tb %>% 
  group_by(시점, 실험그룹) %>%
  summarise(통증= mean(통증)) %>%
  ggplot(mapping = aes(x=시점, 
                       y=통증,
                       color= 시점)) +
  geom_line(mapping = aes(group=실험그룹)) +
  geom_point()

## `summarise()` has grouped output by '시점'. You can override using the
## `.groups` argument.

이상치 제거

twrma_tb %>% 
  group_by(실험그룹, 시점) %>%
  identify_outliers(통증)

## [1] 실험그룹   시점       id         통증       is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>

# 정규분포 검정  (3개가 정규분포 만족하므로 정규분포로 설정 )

twrma_tb %>%
  group_by(실험그룹, 시점) %>%
  shapiro_test(통증)

## # A tibble: 4 x 5
##   실험그룹 시점  variable statistic      p
##   <fct>    <chr> <chr>        <dbl>  <dbl>
## 1 대조군   사전  통증         0.977 0.340 
## 2 대조군   사후  통증         0.956 0.0351
## 3 실험군   사전  통증         0.983 0.457 
## 4 실험군   사후  통증         0.983 0.487

분석

twrma_tb %>% 
  anova_test(dv = 통증, 
             wid = id,
             within = 시점, # 시점이 두개 이므로 구현성 가정 안함 
             between = 실험그룹)

## Warning: The 'wid' column contains duplicate ids across between-subjects
## variables. Automatic unique id will be created

## ANOVA Table (type III tests)
## 
##          Effect DFn DFd      F        p p<.05   ges
## 1      실험그룹   1 125  4.666 3.30e-02     * 0.029
## 2          시점   1 125 20.902 1.15e-05     * 0.032
## 3 실험그룹:시점   1 125 17.011 6.73e-05     * 0.026

사후 검정

twrma_tb %>%
  group_by(시점) %>%
  pairwise_t_test(통증 ~ 실험그룹,
                    p.adj ="bonferroni")

## # A tibble: 2 x 10
##   시점  .y.   group1 group2    n1    n2          p p.signif   p.adj p.adj.signif
## * <chr> <chr> <chr>  <chr>  <int> <int>      <dbl> <chr>      <dbl> <chr>       
## 1 사전  통증  대조군 실험군    58    69 0.921      ns       9.21e-1 ns          
## 2 사후  통증  대조군 실험군    58    69 0.00000226 ****     2.26e-6 ****

twrma_tb %>%
  group_by(실험그룹) %>%
  pairwise_t_test(통증~ 시점,
                  paired = TRUE,
                  p.adj = "bonferroni")

## # A tibble: 2 x 11
##   실험그룹 .y.   group1 group2    n1    n2 statistic    df        p    p.adj
## * <fct>    <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl>
## 1 대조군   통증  사전   사후      58    58     0.243    57 8.09e- 1 8.09e- 1
## 2 실험군   통증  사전   사후      69    69     8.85     68 6.33e-13 6.33e-13
## # ... with 1 more variable: p.adj.signif <chr>

Share on

Copy URL

Kakao Facebook Twitter LinkedIn

SeongJae Yoo

Data Mining Theory And Reality09

Share on

데이터마이닝의 이론과 실제 9주차

반복측정 분산분석 (Repeated Measures ANOVA)

시점이 여러개 일때 사용

기술 통계

그래프 그리기

이상치 제거

정규분포 검정

구형성 (p값이 0.05보다 크면 구형성 성립,Mauchly’s Test for Sphericity)

다중비교

일변량분석

이원 분산분석 (Two-way ANOVA)

기술통계

그래프 그리기

이상치 제거

정규분포

등분산검정

검증

사후검정

상호작용이 없을경우

이원 반복측정 분산분석 (Two way repeated measures ANOVA)

long 형 변환

기술통계

그래프 그리기

이상치 제거

분석

사후 검정

Share on

Leave a comment

You may also enjoy

자동주식투자프로그램 세팅방법

Data Mining Theory And Reality Midtestexample04

Data Mining Theory And Reality Midtestexample03

Data Mining Theory And Reality Midtestexample02