Share on

데이터마이닝 이론 및 실제 07

독립표본 t-test

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.

library(rstatix)

## 
## 다음의 패키지를 부착합니다: 'rstatix'

## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test

## The following object is masked from 'package:dials':
## 
##     get_n

## The following object is masked from 'package:stats':
## 
##     filter

library(skimr)

#데이터 가져오기

ist_tb <- read_csv("06.IST.csv",
                   col_names = TRUE,
                   na = ".",
                   locale = locale("ko",encoding = "euc-kr")) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(자동차회사 = factor(자동차회사,
                             levels = c(1,2),
                             labels = c("A자동차", "B자동차")))

## Rows: 60 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): 자동차회사, 타이어수명
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(ist_tb)

## tibble [60 x 2] (S3: tbl_df/tbl/data.frame)
##  $ 자동차회사: Factor w/ 2 levels "A자동차","B자동차": 1 1 1 1 1 1 1 1 1 1 ...
##  $ 타이어수명: num [1:60] 48187 47245 51020 50732 52416 ...

ist_tb

## # A tibble: 60 x 2
##    자동차회사 타이어수명
##    <fct>           <dbl>
##  1 A자동차         48187
##  2 A자동차         47245
##  3 A자동차         51020
##  4 A자동차         50732
##  5 A자동차         52416
##  6 A자동차         49278
##  7 A자동차         38214
##  8 A자동차         46742
##  9 A자동차         48706
## 10 A자동차         54280
## # ... with 50 more rows

기술통계분석

skim(ist_tb)

Data summary
Name	ist_tb
Number of rows	60
Number of columns	2
_______________________
Column type frequency:
factor	1
numeric	1
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
자동차회사	0	1	FALSE	2	A자동: 30, B자동: 30

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
타이어수명	0	1	50024.08	4113.28	38214	47167.5	50452	52199.75	59299	▁▃▇▆▃

ist_tb %>%
  group_by(자동차회사) %>%
  skim(타이어수명)

Data summary
Name	Piped data
Number of rows	60
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	자동차회사

Variable type: numeric

skim_variable	자동차회사	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
타이어수명	A자동차	0	1	48670.57	3607.12	38214	46589.75	49047.0	50948.0	55750	▁▂▇▇▂
타이어수명	B자동차	0	1	51377.60	4197.60	41852	48928.25	51395.5	53309.5	59299	▁▃▇▁▅

ist_tb %>% 
  group_by(자동차회사) %>%
  get_summary_stats(타이어수명)

## # A tibble: 2 x 14
##   자동차회사 variable      n   min   max median     q1     q3   iqr   mad   mean
##   <fct>      <chr>     <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl>  <dbl>
## 1 A자동차    타이어수~    30 38214 55750 49047  46590. 50948  4358. 3274. 48671.
## 2 B자동차    타이어수~    30 41852 59299 51396. 48928. 53310. 4381. 3641. 51378.
## # ... with 3 more variables: sd <dbl>, se <dbl>, ci <dbl>

그래프

ist_tb %>%
  ggplot(mapping = aes(x=자동차회사,
                       y=타이어수명)) +
  geom_boxplot()

ist_tb %>%
  ggplot(mapping = aes(x= 타이어수명))+
  geom_histogram(bins = 10,
                 color = "white",
                 fill = "steelblue") +
  facet_wrap(~ 자동차회사)

이상치 제거

ist_tb %>%
  group_by(자동차회사)%>%
  identify_outliers(타이어수명)

## # A tibble: 2 x 4
##   자동차회사 타이어수명 is.outlier is.extreme
##   <fct>           <dbl> <lgl>      <lgl>     
## 1 A자동차         38214 TRUE       FALSE     
## 2 B자동차         41852 TRUE       FALSE

ist_tb <- ist_tb %>%
  filter(!(자동차회사 == "A자동차" & 타이어수명 <= 38214)) %>%
  filter(!(자동차회사 == "B자동차" & 타이어수명 <= 41852))

그래프

ist_tb %>%
  ggplot(mapping = aes(x=자동차회사,
                       y=타이어수명)) +
  geom_boxplot()

정규분포 검정(p값이 0.05보다 크므로 정규분포 이다 )

ist_tb %>%
  group_by(자동차회사)%>%
  shapiro_test(타이어수명)

## # A tibble: 2 x 4
##   자동차회사 variable   statistic     p
##   <fct>      <chr>          <dbl> <dbl>
## 1 A자동차    타이어수명     0.992 0.997
## 2 B자동차    타이어수명     0.965 0.445

등분산검정

ist_tb %>%
  levene_test(타이어수명 ~ 자동차회사)

## # A tibble: 1 x 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    56     0.754 0.389

차이검정(등분산)

ist_tb %>%
  t_test(formula = 타이어수명 ~ 자동차회사,
         comparisons =c("A자동차","B자동차"),
         var.equal = TRUE,
         alternative = "two.sided",
         detailed = TRUE)

## # A tibble: 1 x 15
##   estimate estimate1 estimate2 .y.   group1 group2    n1    n2 statistic       p
## *    <dbl>     <dbl>     <dbl> <chr> <chr>  <chr>  <int> <int>     <dbl>   <dbl>
## 1   -2675.    49031.    51706. 타이~ A자동~ B자동~    29    29     -2.92 0.00503
## # ... with 5 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>,
## #   method <chr>, alternative <chr>

두회사 타이어수명은 차이가 있다. (p값 0.005)

ist_tb %>%
  cohens_d(formula = 타이어수명 ~ 자동차회사,
           var.equal = TRUE)

## # A tibble: 1 x 7
##   .y.        group1  group2  effsize    n1    n2 magnitude
## * <chr>      <chr>   <chr>     <dbl> <int> <int> <ord>    
## 1 타이어수명 A자동차 B자동차  -0.767    29    29 moderate

magnitude : moderate 이므로 차이가 있다

추론을 이용한 방법

표본평균

x_bar <- ist_tb %>%
  specify(formula = 타이어수명 ~ 자동차회사) %>%
  calculate(stat = "diff in means",
            order = c("A자동차","B자동차")) %>%
  print()

## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 -2675.

null_dist_x <- ist_tb %>%
  specify(formula = 타이어수명 ~ 자동차회사) %>%
  hypothesize(null = "independence" ) %>%
  generate(reps = 1000,
           type = "permute") %>% #반복횟수   
  calculate(stat = "diff in means",
            order = c("A자동차","B자동차")) %>%
  print()

## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  1495.
##  2         2  1087.
##  3         3   304.
##  4         4  1985.
##  5         5  1926.
##  6         6   204.
##  7         7 -2296.
##  8         8 -1405.
##  9         9 -1771.
## 10        10  -583.
## # ... with 990 more rows

null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -1852.    1980.

null_dist_x %>%
  get_p_value(obs_stat = x_bar,
              direction = "two-sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.002

null_dist_x %>%
  visualize() + 
  shade_p_value(obs_stat = x_bar,
                direction = "two-sided") +
  shade_confidence_interval(endpoints = null_dist_ci)

t값

t_cal <- ist_tb %>%
  specify(formula = 타이어수명 ~ 자동차회사) %>%
  calculate(stat = "t",
            order = c("A자동차","B자동차")) %>%
  print()

## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -2.92

null_dist_t <- ist_tb %>%
  specify(formula = 타이어수명 ~ 자동차회사) %>%
  hypothesize(null = "independence" ) %>%
  generate(reps = 1000,
           type = "permute") %>% #반복횟수   
  calculate(stat = "t",
            order = c("A자동차","B자동차")) %>%
  print()

## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.576 
##  2         2  0.446 
##  3         3 -0.0949
##  4         4  0.853 
##  5         5  0.571 
##  6         6 -0.205 
##  7         7  0.0878
##  8         8  0.214 
##  9         9 -0.311 
## 10        10 -1.95  
## # ... with 990 more rows

null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -2.09     1.91

null_dist_t %>%
  get_p_value(obs_stat = t_cal,
              direction = "two-sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.006

null_dist_t %>%
  visualize(method = "both") + 
  shade_p_value(obs_stat = t_cal,
                direction = "two-sided") +
  shade_confidence_interval(endpoints = null_dist_ci)

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

비모수 통계분석

ist_tb %>%
  wilcox_test(formula = 타이어수명 ~ 자동차회사,
              alternative = "two.sided")

## # A tibble: 1 x 7
##   .y.        group1  group2     n1    n2 statistic       p
## * <chr>      <chr>   <chr>   <int> <int>     <dbl>   <dbl>
## 1 타이어수명 A자동차 B자동차    29    29       250 0.00751

대응표본 t-test(Paired Sample t-test))

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 불러오기

pst_tb <- read_csv("07.PST.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr"))

## Rows: 20 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): id, 섭취전, 섭취후
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(pst_tb)

## spec_tbl_df [20 x 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id    : num [1:20] 1 2 3 4 5 6 7 8 9 10 ...
##  $ 섭취전: num [1:20] 82 54 74 75 71 76 70 62 77 75 ...
##  $ 섭취후: num [1:20] 75 50 74 71 69 73 68 62 68 72 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   섭취전 = col_double(),
##   ..   섭취후 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

pst_tb

## # A tibble: 20 x 3
##       id 섭취전 섭취후
##    <dbl>  <dbl>  <dbl>
##  1     1     82     75
##  2     2     54     50
##  3     3     74     74
##  4     4     75     71
##  5     5     71     69
##  6     6     76     73
##  7     7     70     68
##  8     8     62     62
##  9     9     77     68
## 10    10     75     72
## 11    11     72     70
## 12    12     83     77
## 13    13     78     71
## 14    14     74     74
## 15    15     68     67
## 16    16     76     73
## 17    17     75     77
## 18    18     75     71
## 19    19     75     76
## 20    20     71     74

long 형 데이터 변환

pst_tb_long <- pst_tb %>%
  pivot_longer(c("섭취전","섭취후"),
               names_to = "시간",
               values_to = "몸무게")

차이값 추가로 설정

pst_tb <- pst_tb %>%
  mutate(차이 = 섭취후 - 섭취전)

기술통계분석

skim(pst_tb)

Data summary
Name	pst_tb
Number of rows	20
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	10.50	5.92	1	5.75	10.5	15.25	20	▇▇▇▇▇
섭취전	1	73.15	6.43	54	71.00	75.0	76.00	83	▁▁▃▇▂
섭취후	1	70.60	6.10	50	68.75	71.5	74.00	77	▁▁▁▆▇
차이	1	-2.55	3.14	-9	-4.00	-2.5	0.00	3	▂▁▇▃▂

pst_tb %>%
  get_summary_stats(섭취전, 섭취후, 차이)

## # A tibble: 3 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 섭취전      20    54    83   75    71      76  5     3.71 73.2   6.43 1.44 
## 2 섭취후      20    50    77   71.5  68.8    74  5.25  3.71 70.6   6.10 1.36 
## 3 차이        20    -9     3   -2.5  -4       0  4     2.96 -2.55  3.14 0.701
## # ... with 1 more variable: ci <dbl>

그래프 그리기

pst_tb %>%
  ggplot(mapping = aes(y=차이)) +
  geom_boxplot()

pst_tb %>%
  ggplot(mapping = aes(x=차이)) +
  geom_histogram(binwidth = 3,
                 color = "white",
                 fill = "steelblue")

이상치 제거

pst_tb %>% 
  identify_outliers(차이)

## [1] id         섭취전     섭취후     차이       is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>

정규분포 검정

pst_tb %>% 
  shapiro_test(차이)

## # A tibble: 1 x 3
##   variable statistic     p
##   <chr>        <dbl> <dbl>
## 1 차이         0.977 0.883

차이검정

pst_tb_long %>%
  t_test(formula = 몸무게 ~ 시간,
         ref.group = "섭취후",
         paired = TRUE,
         alternative = "less",
         detailed = TRUE)

## # A tibble: 1 x 13
##   estimate .y.    group1 group2    n1    n2 statistic       p    df conf.low
## *    <dbl> <chr>  <chr>  <chr>  <int> <int>     <dbl>   <dbl> <dbl>    <dbl>
## 1    -2.55 몸무게 섭취후 섭취전    20    20     -3.64 0.00088    19     -Inf
## # ... with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>

pst_tb_long %>%
  cohens_d(formula = 몸무게 ~ 시간,
           paired = TRUE)

## # A tibble: 1 x 7
##   .y.    group1 group2 effsize    n1    n2 magnitude
## * <chr>  <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 몸무게 섭취전 섭취후   0.813    20    20 large

large -> 차이가 많이난다

추론을 이용한 가설검정

표본 평균(x)

x_bar <- pst_tb %>%
  specify(response = 차이) %>%
  calculate(stat = "mean") %>%
  print()

## Response: 차이 (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -2.55

null_dist_x <- pst_tb %>%
  specify(response =  차이) %>%
  hypothesise(null = "point",
              mu = 0) %>% 
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  print()

## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.5   
##  2         2 -0.45  
##  3         3 -0.5   
##  4         4 -1.3   
##  5         5 -0.350 
##  6         6 -0.0500
##  7         7 -0.100 
##  8         8 -1     
##  9         9  0.100 
## 10        10  0.200 
## # ... with 990 more rows

null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     -1.4      1.3

null_dist_x %>%
  visualize() + 
  shade_p_value(obs_stat = x_bar,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)

null_dist_x %>%
  get_p_value(obs_stat = x_bar,
              direction = "less")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

t분포

t_cal <- pst_tb %>%
  specify(response = 차이) %>%
  hypothesize(null = "point",
              mu = 0) %>%
  calculate(stat = "t") %>%
  print()

## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -3.64

null_dist_t <- pst_tb %>%
  specify(response = 차이) %>%
  hypothesise(null = "point",
              mu = 0) %>% 
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "t") %>%
  print()

## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 -0.489
##  2         2  1.25 
##  3         3 -0.280
##  4         4 -2.41 
##  5         5  1.33 
##  6         6 -0.384
##  7         7 -0.796
##  8         8  1.48 
##  9         9  0.577
## 10        10  1.90 
## # ... with 990 more rows

null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -1.94     2.08

null_dist_t %>%
  visualize(method = "both") + 
  shade_p_value(obs_stat = t_cal,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

one -way ANOVA (일원 분산분석)

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

#데이터 가져오기

owa_tb <- read_csv("08.OWA.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr")) %>%
  mutate_if(is.character, as.factor)%>%
  mutate(매장명 = factor(매장명,
                         levels = c(1:4),
                         labels = c("강남","강서","강동","강북")))

## Rows: 139 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): 번호, 매장명, 만족도
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(owa_tb)

## tibble [139 x 3] (S3: tbl_df/tbl/data.frame)
##  $ 번호  : num [1:139] 1 2 3 4 5 6 7 8 9 10 ...
##  $ 매장명: Factor w/ 4 levels "강남","강서",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ 만족도: num [1:139] 85 82 87 88 93 83 86 85 94 93 ...

owa_tb

## # A tibble: 139 x 3
##     번호 매장명 만족도
##    <dbl> <fct>   <dbl>
##  1     1 강남       85
##  2     2 강서       82
##  3     3 강동       87
##  4     4 강북       88
##  5     5 강남       93
##  6     6 강서       83
##  7     7 강동       86
##  8     8 강북       85
##  9     9 강남       94
## 10    10 강서       93
## # ... with 129 more rows

기술통계분석

skim(owa_tb)

Data summary
Name	owa_tb
Number of rows	139
Number of columns	3
_______________________
Column type frequency:
factor	1
numeric	2
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
매장명	0	1	FALSE	4	강북: 39, 강남: 38, 강서: 32, 강동: 30

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
번호	0	1	70.00	40.27	1	35.5	70	104.5	139	▇▇▇▇▇
만족도	0	1	88.09	5.23	78	84.0	87	92.0	99	▅▇▇▇▃

owa_tb %>%
  group_by(매장명) %>%
  get_summary_stats(만족도)

## # A tibble: 4 x 14
##   매장명 variable     n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct>  <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 강남   만족도      38    81    99     91  86.2  94.8  8.5   5.93  90.6  5.12
## 2 강서   만족도      32    81    99     91  85.8  93    7.25  5.93  89.6  5.03
## 3 강동   만족도      30    78    94     85  81.2  87    5.75  4.45  84.5  4.25
## 4 강북   만족도      39    79    96     87  84    91    7     5.93  87.2  4.50
## # ... with 2 more variables: se <dbl>, ci <dbl>

그래프 그리기

owa_tb %>%
  ggplot(mapping = aes(x=매장명,
                       y=만족도)) +
  geom_boxplot()

owa_tb %>%
  ggplot(mapping = aes(x=만족도)) +
  geom_histogram(bins = 10,
                 color = "white",
                 fill = "steelblue") +
  facet_wrap(~ 매장명)

이상치 제거

owa_tb %>%
  group_by(매장명) %>%
  identify_outliers(만족도)

## [1] 매장명     번호       만족도     is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>

정규분포

owa_tb %>%
  group_by(매장명) %>%
  shapiro_test(만족도)

## # A tibble: 4 x 4
##   매장명 variable statistic     p
##   <fct>  <chr>        <dbl> <dbl>
## 1 강남   만족도       0.956 0.137
## 2 강서   만족도       0.957 0.231
## 3 강동   만족도       0.959 0.299
## 4 강북   만족도       0.973 0.454

등분산 검정

owa_tb %>%
  levene_test(만족도 ~ 매장명)

## # A tibble: 1 x 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     3   135      1.06 0.369

ANOVA 검정

owa_tb %>%
  anova_test(만족도 ~ 매장명)

## Coefficient covariances computed by hccm()

## ANOVA Table (type II tests)
## 
##   Effect DFn DFd      F        p p<.05   ges
## 1 매장명   3 135 10.833 2.01e-06     * 0.194

 # p 값 - 2.01e-06 이므로 차이가 있다

owa_results <- aov(만족도 ~ 매장명,
                      data = owa_tb)

tidy(owa_results)

## # A tibble: 2 x 6
##   term         df sumsq meansq statistic     p.value
##   <chr>     <dbl> <dbl>  <dbl>     <dbl>       <dbl>
## 1 매장명        3  734.  245.       10.8  0.00000201
## 2 Residuals   135 3047.   22.6      NA   NA

사후검정

owa_tb %>% 
  pairwise_t_test(formula = 만족도 ~ 매장명,
                  p.adj = "non") # Fisher LSD

## # A tibble: 6 x 9
##   .y.    group1 group2    n1    n2           p p.signif       p.adj p.adj.signif
## * <chr>  <chr>  <chr>  <int> <int>       <dbl> <chr>          <dbl> <chr>       
## 1 만족도 강남   강서      38    32 0.376       ns       0.376       ns          
## 2 만족도 강남   강동      38    30 0.000000547 ****     0.000000547 ****        
## 3 만족도 강서   강동      32    30 0.0000448   ****     0.0000448   ****        
## 4 만족도 강남   강북      38    39 0.00179     **       0.00179     **          
## 5 만족도 강서   강북      32    39 0.0331      *        0.0331      *           
## 6 만족도 강동   강북      30    39 0.023       *        0.023       *

owa_tb %>% 
  pairwise_t_test(formula = 만족도 ~ 매장명,
                  p.adj = "bonferroni")

## # A tibble: 6 x 9
##   .y.    group1 group2    n1    n2           p p.signif      p.adj p.adj.signif
## * <chr>  <chr>  <chr>  <int> <int>       <dbl> <chr>         <dbl> <chr>       
## 1 만족도 강남   강서      38    32 0.376       ns       1          ns          
## 2 만족도 강남   강동      38    30 0.000000547 ****     0.00000328 ****        
## 3 만족도 강서   강동      32    30 0.0000448   ****     0.000269   ***         
## 4 만족도 강남   강북      38    39 0.00179     **       0.0107     *           
## 5 만족도 강서   강북      32    39 0.0331      *        0.199      ns          
## 6 만족도 강동   강북      30    39 0.023       *        0.138      ns

owa_results %>%
  TukeyHSD() %>%
  tidy()

## # A tibble: 6 x 7
##   term   contrast  null.value estimate conf.low conf.high adj.p.value
##   <chr>  <chr>          <dbl>    <dbl>    <dbl>     <dbl>       <dbl>
## 1 매장명 강서-강남          0    -1.01   -3.98      1.95   0.811     
## 2 매장명 강동-강남          0    -6.11   -9.12     -3.09   0.00000325
## 3 매장명 강북-강남          0    -3.45   -6.27     -0.634  0.00956   
## 4 매장명 강동-강서          0    -5.09   -8.23     -1.95   0.000259  
## 5 매장명 강북-강서          0    -2.44   -5.39      0.508  0.142     
## 6 매장명 강북-강동          0     2.65   -0.348     5.66   0.103

library(agricolae)

owa_results %>% 
  LSD.test("매장명",
           group=TRUE,
           console=TRUE)

## 
## Study: . ~ "매장명"
## 
## LSD t Test for 만족도 
## 
## Mean Square Error:  22.57315 
## 
## 매장명,  means and individual ( 95 %) CI
## 
##        만족도      std  r      LCL      UCL Min Max
## 강남 90.60526 5.123024 38 89.08099 92.12954  81  99
## 강동 84.50000 4.248732 30 82.78449 86.21551  78  94
## 강북 87.15385 4.498763 39 85.64924 88.65845  79  96
## 강서 89.59375 5.028046 32 87.93271 91.25479  81  99
## 
## Alpha: 0.05 ; DF Error: 135
## Critical Value of t: 1.977692 
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Treatments with the same letter are not significantly different.
## 
##        만족도 groups
## 강남 90.60526      a
## 강서 89.59375      a
## 강북 87.15385      b
## 강동 84.50000      c

owa_results %>% 
  duncan.test("매장명",
           group=TRUE,
           console=TRUE)

## 
## Study: . ~ "매장명"
## 
## Duncan's new multiple range test
## for 만족도 
## 
## Mean Square Error:  22.57315 
## 
## 매장명,  means
## 
##        만족도      std  r Min Max
## 강남 90.60526 5.123024 38  81  99
## 강동 84.50000 4.248732 30  78  94
## 강북 87.15385 4.498763 39  79  96
## 강서 89.59375 5.028046 32  81  99
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Means with the same letter are not significantly different.
## 
##        만족도 groups
## 강남 90.60526      a
## 강서 89.59375      a
## 강북 87.15385      b
## 강동 84.50000      c

owa_results %>% 
  scheffe.test("매장명",
              group=TRUE,
              console=TRUE)

## 
## Study: . ~ "매장명"
## 
## Scheffe Test for 만족도 
## 
## Mean Square Error  : 22.57315 
## 
## 매장명,  means
## 
##        만족도      std  r Min Max
## 강남 90.60526 5.123024 38  81  99
## 강동 84.50000 4.248732 30  78  94
## 강북 87.15385 4.498763 39  79  96
## 강서 89.59375 5.028046 32  81  99
## 
## Alpha: 0.05 ; DF Error: 135 
## Critical Value of F: 2.671676 
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Means with the same letter are not significantly different.
## 
##        만족도 groups
## 강남 90.60526      a
## 강서 89.59375     ab
## 강북 87.15385     bc
## 강동 84.50000      c

추론에 대한 가설검정

F값

f_cal <- owa_tb %>%
  specify(formula = 만족도 ~ 매장명) %>%
  calculate(stat = "F") %>%
  print()

## Response: 만족도 (numeric)
## Explanatory: 매장명 (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  10.8

null_dist_f <- owa_tb %>%
  specify(formula = 만족도 ~ 매장명) %>%
  hypothesize(null = "independence" ) %>%
  generate(reps = 1000,
           type = "permute") %>% #반복횟수   
  calculate(stat = "F") %>%
  print()

## Response: 만족도 (numeric)
## Explanatory: 매장명 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.30 
##  2         2 1.00 
##  3         3 2.10 
##  4         4 1.36 
##  5         5 2.85 
##  6         6 1.82 
##  7         7 0.701
##  8         8 1.66 
##  9         9 0.947
## 10        10 2.10 
## # ... with 990 more rows

null_dist_f %>%
  get_p_value(obs_stat = f_cal,
              direction = "greater")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

null_dist_f %>%
  visualize(method = "both") + 
  shade_p_value(obs_stat = f_cal,
                direction = "greater")

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

부록 : 정규분포 가정이 안될때 shapiro_test < 0.05

Kruskal Wallis H test

owa_tb %>% kruskal_test(만족도 ~ 매장명)

owa_tb %>% dunn_test(만족도 ~ 매장명)

부록 : 이분산(levene_Test) <0.05

Welch’s ANOVA test

owa_tb %>% welch_anova_test(만족도 ~ 매장명)

owa_tb %>% pairwise_t_test(만족도 ~ 매장명, pool.sd = F, P.adj = “bonferroni”)

Share on

Copy URL

Kakao Facebook Twitter LinkedIn

SeongJae Yoo

Share on

데이터마이닝 이론 및 실제 07

독립표본 t-test

기술통계분석

그래프

이상치 제거

그래프

정규분포 검정(p값이 0.05보다 크므로 정규분포 이다 )

등분산검정

차이검정(등분산)

두회사 타이어수명은 차이가 있다. (p값 0.005)

magnitude : moderate 이므로 차이가 있다

추론을 이용한 방법

표본평균

t값

비모수 통계분석

대응표본 t-test(Paired Sample t-test))

데이터 불러오기

long 형 데이터 변환

차이값 추가로 설정

기술통계분석

그래프 그리기

이상치 제거

정규분포 검정

차이검정

large -> 차이가 많이난다

추론을 이용한 가설검정

표본 평균(x)

t분포

one -way ANOVA (일원 분산분석)

기술통계분석

그래프 그리기

이상치 제거

정규분포

등분산 검정

ANOVA 검정

사후검정

추론에 대한 가설검정

F값

부록 : 정규분포 가정이 안될때 shapiro_test < 0.05

Kruskal Wallis H test

부록 : 이분산(levene_Test) <0.05

Welch’s ANOVA test

Share on

Leave a comment

You may also enjoy

자동주식투자프로그램 세팅방법

Data Mining Theory And Reality Midtestexample04

Data Mining Theory And Reality Midtestexample03

Data Mining Theory And Reality Midtestexample02