Data mining theory and reality07
데이터마이닝 이론 및 실제 07
독립표본 t-test
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
#데이터 가져오기
ist_tb <- read_csv("06.IST.csv",
col_names = TRUE,
na = ".",
locale = locale("ko",encoding = "euc-kr")) %>%
mutate_if(is.character, as.factor) %>%
mutate(자동차회사 = factor(자동차회사,
levels = c(1,2),
labels = c("A자동차", "B자동차")))
## Rows: 60 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): 자동차회사, 타이어수명
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(ist_tb)
## tibble [60 x 2] (S3: tbl_df/tbl/data.frame)
## $ 자동차회사: Factor w/ 2 levels "A자동차","B자동차": 1 1 1 1 1 1 1 1 1 1 ...
## $ 타이어수명: num [1:60] 48187 47245 51020 50732 52416 ...
ist_tb
## # A tibble: 60 x 2
## 자동차회사 타이어수명
## <fct> <dbl>
## 1 A자동차 48187
## 2 A자동차 47245
## 3 A자동차 51020
## 4 A자동차 50732
## 5 A자동차 52416
## 6 A자동차 49278
## 7 A자동차 38214
## 8 A자동차 46742
## 9 A자동차 48706
## 10 A자동차 54280
## # ... with 50 more rows
기술통계분석
skim(ist_tb)
Name | ist_tb |
Number of rows | 60 |
Number of columns | 2 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
자동차회사 | 0 | 1 | FALSE | 2 | A자동: 30, B자동: 30 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
타이어수명 | 0 | 1 | 50024.08 | 4113.28 | 38214 | 47167.5 | 50452 | 52199.75 | 59299 | ▁▃▇▆▃ |
ist_tb %>%
group_by(자동차회사) %>%
skim(타이어수명)
Name | Piped data |
Number of rows | 60 |
Number of columns | 2 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | 자동차회사 |
Variable type: numeric
skim_variable | 자동차회사 | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|
타이어수명 | A자동차 | 0 | 1 | 48670.57 | 3607.12 | 38214 | 46589.75 | 49047.0 | 50948.0 | 55750 | ▁▂▇▇▂ |
타이어수명 | B자동차 | 0 | 1 | 51377.60 | 4197.60 | 41852 | 48928.25 | 51395.5 | 53309.5 | 59299 | ▁▃▇▁▅ |
ist_tb %>%
group_by(자동차회사) %>%
get_summary_stats(타이어수명)
## # A tibble: 2 x 14
## 자동차회사 variable n min max median q1 q3 iqr mad mean
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A자동차 타이어수~ 30 38214 55750 49047 46590. 50948 4358. 3274. 48671.
## 2 B자동차 타이어수~ 30 41852 59299 51396. 48928. 53310. 4381. 3641. 51378.
## # ... with 3 more variables: sd <dbl>, se <dbl>, ci <dbl>
그래프
ist_tb %>%
ggplot(mapping = aes(x=자동차회사,
y=타이어수명)) +
geom_boxplot()
ist_tb %>%
ggplot(mapping = aes(x= 타이어수명))+
geom_histogram(bins = 10,
color = "white",
fill = "steelblue") +
facet_wrap(~ 자동차회사)
이상치 제거
ist_tb %>%
group_by(자동차회사)%>%
identify_outliers(타이어수명)
## # A tibble: 2 x 4
## 자동차회사 타이어수명 is.outlier is.extreme
## <fct> <dbl> <lgl> <lgl>
## 1 A자동차 38214 TRUE FALSE
## 2 B자동차 41852 TRUE FALSE
ist_tb <- ist_tb %>%
filter(!(자동차회사 == "A자동차" & 타이어수명 <= 38214)) %>%
filter(!(자동차회사 == "B자동차" & 타이어수명 <= 41852))
그래프
ist_tb %>%
ggplot(mapping = aes(x=자동차회사,
y=타이어수명)) +
geom_boxplot()
정규분포 검정(p값이 0.05보다 크므로 정규분포 이다 )
ist_tb %>%
group_by(자동차회사)%>%
shapiro_test(타이어수명)
## # A tibble: 2 x 4
## 자동차회사 variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 A자동차 타이어수명 0.992 0.997
## 2 B자동차 타이어수명 0.965 0.445
등분산검정
ist_tb %>%
levene_test(타이어수명 ~ 자동차회사)
## # A tibble: 1 x 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 56 0.754 0.389
차이검정(등분산)
ist_tb %>%
t_test(formula = 타이어수명 ~ 자동차회사,
comparisons =c("A자동차","B자동차"),
var.equal = TRUE,
alternative = "two.sided",
detailed = TRUE)
## # A tibble: 1 x 15
## estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p
## * <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 -2675. 49031. 51706. 타이~ A자동~ B자동~ 29 29 -2.92 0.00503
## # ... with 5 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>,
## # method <chr>, alternative <chr>
두회사 타이어수명은 차이가 있다. (p값 0.005)
ist_tb %>%
cohens_d(formula = 타이어수명 ~ 자동차회사,
var.equal = TRUE)
## # A tibble: 1 x 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 타이어수명 A자동차 B자동차 -0.767 29 29 moderate
magnitude : moderate 이므로 차이가 있다
추론을 이용한 방법
표본평균
x_bar <- ist_tb %>%
specify(formula = 타이어수명 ~ 자동차회사) %>%
calculate(stat = "diff in means",
order = c("A자동차","B자동차")) %>%
print()
## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -2675.
null_dist_x <- ist_tb %>%
specify(formula = 타이어수명 ~ 자동차회사) %>%
hypothesize(null = "independence" ) %>%
generate(reps = 1000,
type = "permute") %>% #반복횟수
calculate(stat = "diff in means",
order = c("A자동차","B자동차")) %>%
print()
## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 1495.
## 2 2 1087.
## 3 3 304.
## 4 4 1985.
## 5 5 1926.
## 6 6 204.
## 7 7 -2296.
## 8 8 -1405.
## 9 9 -1771.
## 10 10 -583.
## # ... with 990 more rows
null_dist_ci <- null_dist_x %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -1852. 1980.
null_dist_x %>%
get_p_value(obs_stat = x_bar,
direction = "two-sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.002
null_dist_x %>%
visualize() +
shade_p_value(obs_stat = x_bar,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
t값
t_cal <- ist_tb %>%
specify(formula = 타이어수명 ~ 자동차회사) %>%
calculate(stat = "t",
order = c("A자동차","B자동차")) %>%
print()
## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -2.92
null_dist_t <- ist_tb %>%
specify(formula = 타이어수명 ~ 자동차회사) %>%
hypothesize(null = "independence" ) %>%
generate(reps = 1000,
type = "permute") %>% #반복횟수
calculate(stat = "t",
order = c("A자동차","B자동차")) %>%
print()
## Response: 타이어수명 (numeric)
## Explanatory: 자동차회사 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.576
## 2 2 0.446
## 3 3 -0.0949
## 4 4 0.853
## 5 5 0.571
## 6 6 -0.205
## 7 7 0.0878
## 8 8 0.214
## 9 9 -0.311
## 10 10 -1.95
## # ... with 990 more rows
null_dist_ci <- null_dist_t %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -2.09 1.91
null_dist_t %>%
get_p_value(obs_stat = t_cal,
direction = "two-sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.006
null_dist_t %>%
visualize(method = "both") +
shade_p_value(obs_stat = t_cal,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.
비모수 통계분석
ist_tb %>%
wilcox_test(formula = 타이어수명 ~ 자동차회사,
alternative = "two.sided")
## # A tibble: 1 x 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 타이어수명 A자동차 B자동차 29 29 250 0.00751
대응표본 t-test(Paired Sample t-test))
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
데이터 불러오기
pst_tb <- read_csv("07.PST.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr"))
## Rows: 20 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): id, 섭취전, 섭취후
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(pst_tb)
## spec_tbl_df [20 x 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:20] 1 2 3 4 5 6 7 8 9 10 ...
## $ 섭취전: num [1:20] 82 54 74 75 71 76 70 62 77 75 ...
## $ 섭취후: num [1:20] 75 50 74 71 69 73 68 62 68 72 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. 섭취전 = col_double(),
## .. 섭취후 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
pst_tb
## # A tibble: 20 x 3
## id 섭취전 섭취후
## <dbl> <dbl> <dbl>
## 1 1 82 75
## 2 2 54 50
## 3 3 74 74
## 4 4 75 71
## 5 5 71 69
## 6 6 76 73
## 7 7 70 68
## 8 8 62 62
## 9 9 77 68
## 10 10 75 72
## 11 11 72 70
## 12 12 83 77
## 13 13 78 71
## 14 14 74 74
## 15 15 68 67
## 16 16 76 73
## 17 17 75 77
## 18 18 75 71
## 19 19 75 76
## 20 20 71 74
long 형 데이터 변환
pst_tb_long <- pst_tb %>%
pivot_longer(c("섭취전","섭취후"),
names_to = "시간",
values_to = "몸무게")
차이값 추가로 설정
pst_tb <- pst_tb %>%
mutate(차이 = 섭취후 - 섭취전)
기술통계분석
skim(pst_tb)
Name | pst_tb |
Number of rows | 20 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 10.50 | 5.92 | 1 | 5.75 | 10.5 | 15.25 | 20 | ▇▇▇▇▇ |
섭취전 | 0 | 1 | 73.15 | 6.43 | 54 | 71.00 | 75.0 | 76.00 | 83 | ▁▁▃▇▂ |
섭취후 | 0 | 1 | 70.60 | 6.10 | 50 | 68.75 | 71.5 | 74.00 | 77 | ▁▁▁▆▇ |
차이 | 0 | 1 | -2.55 | 3.14 | -9 | -4.00 | -2.5 | 0.00 | 3 | ▂▁▇▃▂ |
pst_tb %>%
get_summary_stats(섭취전, 섭취후, 차이)
## # A tibble: 3 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 섭취전 20 54 83 75 71 76 5 3.71 73.2 6.43 1.44
## 2 섭취후 20 50 77 71.5 68.8 74 5.25 3.71 70.6 6.10 1.36
## 3 차이 20 -9 3 -2.5 -4 0 4 2.96 -2.55 3.14 0.701
## # ... with 1 more variable: ci <dbl>
그래프 그리기
pst_tb %>%
ggplot(mapping = aes(y=차이)) +
geom_boxplot()
pst_tb %>%
ggplot(mapping = aes(x=차이)) +
geom_histogram(binwidth = 3,
color = "white",
fill = "steelblue")
이상치 제거
pst_tb %>%
identify_outliers(차이)
## [1] id 섭취전 섭취후 차이 is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>
정규분포 검정
pst_tb %>%
shapiro_test(차이)
## # A tibble: 1 x 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 차이 0.977 0.883
차이검정
pst_tb_long %>%
t_test(formula = 몸무게 ~ 시간,
ref.group = "섭취후",
paired = TRUE,
alternative = "less",
detailed = TRUE)
## # A tibble: 1 x 13
## estimate .y. group1 group2 n1 n2 statistic p df conf.low
## * <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 -2.55 몸무게 섭취후 섭취전 20 20 -3.64 0.00088 19 -Inf
## # ... with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
pst_tb_long %>%
cohens_d(formula = 몸무게 ~ 시간,
paired = TRUE)
## # A tibble: 1 x 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 몸무게 섭취전 섭취후 0.813 20 20 large
large -> 차이가 많이난다
추론을 이용한 가설검정
표본 평균(x)
x_bar <- pst_tb %>%
specify(response = 차이) %>%
calculate(stat = "mean") %>%
print()
## Response: 차이 (numeric)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -2.55
null_dist_x <- pst_tb %>%
specify(response = 차이) %>%
hypothesise(null = "point",
mu = 0) %>%
generate(reps = 1000,
type = "bootstrap") %>%
calculate(stat = "mean") %>%
print()
## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.5
## 2 2 -0.45
## 3 3 -0.5
## 4 4 -1.3
## 5 5 -0.350
## 6 6 -0.0500
## 7 7 -0.100
## 8 8 -1
## 9 9 0.100
## 10 10 0.200
## # ... with 990 more rows
null_dist_ci <- null_dist_x %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -1.4 1.3
null_dist_x %>%
visualize() +
shade_p_value(obs_stat = x_bar,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
null_dist_x %>%
get_p_value(obs_stat = x_bar,
direction = "less")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
t분포
t_cal <- pst_tb %>%
specify(response = 차이) %>%
hypothesize(null = "point",
mu = 0) %>%
calculate(stat = "t") %>%
print()
## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -3.64
null_dist_t <- pst_tb %>%
specify(response = 차이) %>%
hypothesise(null = "point",
mu = 0) %>%
generate(reps = 1000,
type = "bootstrap") %>%
calculate(stat = "t") %>%
print()
## Response: 차이 (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.489
## 2 2 1.25
## 3 3 -0.280
## 4 4 -2.41
## 5 5 1.33
## 6 6 -0.384
## 7 7 -0.796
## 8 8 1.48
## 9 9 0.577
## 10 10 1.90
## # ... with 990 more rows
null_dist_ci <- null_dist_t %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -1.94 2.08
null_dist_t %>%
visualize(method = "both") +
shade_p_value(obs_stat = t_cal,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.
one -way ANOVA (일원 분산분석)
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
#데이터 가져오기
owa_tb <- read_csv("08.OWA.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr")) %>%
mutate_if(is.character, as.factor)%>%
mutate(매장명 = factor(매장명,
levels = c(1:4),
labels = c("강남","강서","강동","강북")))
## Rows: 139 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (3): 번호, 매장명, 만족도
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(owa_tb)
## tibble [139 x 3] (S3: tbl_df/tbl/data.frame)
## $ 번호 : num [1:139] 1 2 3 4 5 6 7 8 9 10 ...
## $ 매장명: Factor w/ 4 levels "강남","강서",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ 만족도: num [1:139] 85 82 87 88 93 83 86 85 94 93 ...
owa_tb
## # A tibble: 139 x 3
## 번호 매장명 만족도
## <dbl> <fct> <dbl>
## 1 1 강남 85
## 2 2 강서 82
## 3 3 강동 87
## 4 4 강북 88
## 5 5 강남 93
## 6 6 강서 83
## 7 7 강동 86
## 8 8 강북 85
## 9 9 강남 94
## 10 10 강서 93
## # ... with 129 more rows
기술통계분석
skim(owa_tb)
Name | owa_tb |
Number of rows | 139 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
매장명 | 0 | 1 | FALSE | 4 | 강북: 39, 강남: 38, 강서: 32, 강동: 30 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
번호 | 0 | 1 | 70.00 | 40.27 | 1 | 35.5 | 70 | 104.5 | 139 | ▇▇▇▇▇ |
만족도 | 0 | 1 | 88.09 | 5.23 | 78 | 84.0 | 87 | 92.0 | 99 | ▅▇▇▇▃ |
owa_tb %>%
group_by(매장명) %>%
get_summary_stats(만족도)
## # A tibble: 4 x 14
## 매장명 variable n min max median q1 q3 iqr mad mean sd
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 강남 만족도 38 81 99 91 86.2 94.8 8.5 5.93 90.6 5.12
## 2 강서 만족도 32 81 99 91 85.8 93 7.25 5.93 89.6 5.03
## 3 강동 만족도 30 78 94 85 81.2 87 5.75 4.45 84.5 4.25
## 4 강북 만족도 39 79 96 87 84 91 7 5.93 87.2 4.50
## # ... with 2 more variables: se <dbl>, ci <dbl>
그래프 그리기
owa_tb %>%
ggplot(mapping = aes(x=매장명,
y=만족도)) +
geom_boxplot()
owa_tb %>%
ggplot(mapping = aes(x=만족도)) +
geom_histogram(bins = 10,
color = "white",
fill = "steelblue") +
facet_wrap(~ 매장명)
이상치 제거
owa_tb %>%
group_by(매장명) %>%
identify_outliers(만족도)
## [1] 매장명 번호 만족도 is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>
정규분포
owa_tb %>%
group_by(매장명) %>%
shapiro_test(만족도)
## # A tibble: 4 x 4
## 매장명 variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 강남 만족도 0.956 0.137
## 2 강서 만족도 0.957 0.231
## 3 강동 만족도 0.959 0.299
## 4 강북 만족도 0.973 0.454
등분산 검정
owa_tb %>%
levene_test(만족도 ~ 매장명)
## # A tibble: 1 x 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 3 135 1.06 0.369
ANOVA 검정
owa_tb %>%
anova_test(만족도 ~ 매장명)
## Coefficient covariances computed by hccm()
## ANOVA Table (type II tests)
##
## Effect DFn DFd F p p<.05 ges
## 1 매장명 3 135 10.833 2.01e-06 * 0.194
# p 값 - 2.01e-06 이므로 차이가 있다
owa_results <- aov(만족도 ~ 매장명,
data = owa_tb)
tidy(owa_results)
## # A tibble: 2 x 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 매장명 3 734. 245. 10.8 0.00000201
## 2 Residuals 135 3047. 22.6 NA NA
사후검정
owa_tb %>%
pairwise_t_test(formula = 만족도 ~ 매장명,
p.adj = "non") # Fisher LSD
## # A tibble: 6 x 9
## .y. group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 만족도 강남 강서 38 32 0.376 ns 0.376 ns
## 2 만족도 강남 강동 38 30 0.000000547 **** 0.000000547 ****
## 3 만족도 강서 강동 32 30 0.0000448 **** 0.0000448 ****
## 4 만족도 강남 강북 38 39 0.00179 ** 0.00179 **
## 5 만족도 강서 강북 32 39 0.0331 * 0.0331 *
## 6 만족도 강동 강북 30 39 0.023 * 0.023 *
owa_tb %>%
pairwise_t_test(formula = 만족도 ~ 매장명,
p.adj = "bonferroni")
## # A tibble: 6 x 9
## .y. group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 만족도 강남 강서 38 32 0.376 ns 1 ns
## 2 만족도 강남 강동 38 30 0.000000547 **** 0.00000328 ****
## 3 만족도 강서 강동 32 30 0.0000448 **** 0.000269 ***
## 4 만족도 강남 강북 38 39 0.00179 ** 0.0107 *
## 5 만족도 강서 강북 32 39 0.0331 * 0.199 ns
## 6 만족도 강동 강북 30 39 0.023 * 0.138 ns
owa_results %>%
TukeyHSD() %>%
tidy()
## # A tibble: 6 x 7
## term contrast null.value estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 매장명 강서-강남 0 -1.01 -3.98 1.95 0.811
## 2 매장명 강동-강남 0 -6.11 -9.12 -3.09 0.00000325
## 3 매장명 강북-강남 0 -3.45 -6.27 -0.634 0.00956
## 4 매장명 강동-강서 0 -5.09 -8.23 -1.95 0.000259
## 5 매장명 강북-강서 0 -2.44 -5.39 0.508 0.142
## 6 매장명 강북-강동 0 2.65 -0.348 5.66 0.103
library(agricolae)
owa_results %>%
LSD.test("매장명",
group=TRUE,
console=TRUE)
##
## Study: . ~ "매장명"
##
## LSD t Test for 만족도
##
## Mean Square Error: 22.57315
##
## 매장명, means and individual ( 95 %) CI
##
## 만족도 std r LCL UCL Min Max
## 강남 90.60526 5.123024 38 89.08099 92.12954 81 99
## 강동 84.50000 4.248732 30 82.78449 86.21551 78 94
## 강북 87.15385 4.498763 39 85.64924 88.65845 79 96
## 강서 89.59375 5.028046 32 87.93271 91.25479 81 99
##
## Alpha: 0.05 ; DF Error: 135
## Critical Value of t: 1.977692
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Treatments with the same letter are not significantly different.
##
## 만족도 groups
## 강남 90.60526 a
## 강서 89.59375 a
## 강북 87.15385 b
## 강동 84.50000 c
owa_results %>%
duncan.test("매장명",
group=TRUE,
console=TRUE)
##
## Study: . ~ "매장명"
##
## Duncan's new multiple range test
## for 만족도
##
## Mean Square Error: 22.57315
##
## 매장명, means
##
## 만족도 std r Min Max
## 강남 90.60526 5.123024 38 81 99
## 강동 84.50000 4.248732 30 78 94
## 강북 87.15385 4.498763 39 79 96
## 강서 89.59375 5.028046 32 81 99
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Means with the same letter are not significantly different.
##
## 만족도 groups
## 강남 90.60526 a
## 강서 89.59375 a
## 강북 87.15385 b
## 강동 84.50000 c
owa_results %>%
scheffe.test("매장명",
group=TRUE,
console=TRUE)
##
## Study: . ~ "매장명"
##
## Scheffe Test for 만족도
##
## Mean Square Error : 22.57315
##
## 매장명, means
##
## 만족도 std r Min Max
## 강남 90.60526 5.123024 38 81 99
## 강동 84.50000 4.248732 30 78 94
## 강북 87.15385 4.498763 39 79 96
## 강서 89.59375 5.028046 32 81 99
##
## Alpha: 0.05 ; DF Error: 135
## Critical Value of F: 2.671676
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Means with the same letter are not significantly different.
##
## 만족도 groups
## 강남 90.60526 a
## 강서 89.59375 ab
## 강북 87.15385 bc
## 강동 84.50000 c
추론에 대한 가설검정
F값
f_cal <- owa_tb %>%
specify(formula = 만족도 ~ 매장명) %>%
calculate(stat = "F") %>%
print()
## Response: 만족도 (numeric)
## Explanatory: 매장명 (factor)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 10.8
null_dist_f <- owa_tb %>%
specify(formula = 만족도 ~ 매장명) %>%
hypothesize(null = "independence" ) %>%
generate(reps = 1000,
type = "permute") %>% #반복횟수
calculate(stat = "F") %>%
print()
## Response: 만족도 (numeric)
## Explanatory: 매장명 (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 1.30
## 2 2 1.00
## 3 3 2.10
## 4 4 1.36
## 5 5 2.85
## 6 6 1.82
## 7 7 0.701
## 8 8 1.66
## 9 9 0.947
## 10 10 2.10
## # ... with 990 more rows
null_dist_f %>%
get_p_value(obs_stat = f_cal,
direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
null_dist_f %>%
visualize(method = "both") +
shade_p_value(obs_stat = f_cal,
direction = "greater")
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.
부록 : 정규분포 가정이 안될때 shapiro_test < 0.05
Kruskal Wallis H test
owa_tb %>% kruskal_test(만족도 ~ 매장명)
owa_tb %>% dunn_test(만족도 ~ 매장명)
부록 : 이분산(levene_Test) <0.05
Welch’s ANOVA test
owa_tb %>% welch_anova_test(만족도 ~ 매장명)
owa_tb %>% pairwise_t_test(만족도 ~ 매장명, pool.sd = F, P.adj = “bonferroni”)
Leave a comment