1. A제약회사에서는 이번에 제품A와 제품B 두 가지 타입의 진통제를 개발하였다. 과연 이 두 가지 타입의 제품은 지속효과(시간)이 차이가 없는가?
(data1.sav) -. group : 1)A제품 2)B제품-. time : 지속효과
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
edu_tb <- read_csv('data1.csv',
col_names = TRUE,
locale=locale('ko', encoding='euc-kr'), # 한글
na=".") %>%
round(3) %>% # 소수점 2자리로 반올림
mutate_if(is.character, as.factor) %>%
mutate(group = factor(group,
levels=c(1:2),
labels=c("A제품","B제품")))
## Rows: 80 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): group, time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(edu_tb)
## tibble [80 x 2] (S3: tbl_df/tbl/data.frame)
## $ group: Factor w/ 2 levels "A제품","B제품": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num [1:80] 4.25 5.25 5.75 4.08 4.5 4.5 4.5 5.83 6.42 6.75 ...
edu_tb
## # A tibble: 80 x 2
## group time
## <fct> <dbl>
## 1 A제품 4.25
## 2 A제품 5.25
## 3 A제품 5.75
## 4 A제품 4.08
## 5 A제품 4.5
## 6 A제품 4.5
## 7 A제품 4.5
## 8 A제품 5.83
## 9 A제품 6.42
## 10 A제품 6.75
## # ... with 70 more rows
기술통계분석
skim(edu_tb)
Data summary
Name |
edu_tb |
Number of rows |
80 |
Number of columns |
2 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
1 |
________________________ |
|
Group variables |
None |
Variable type: factor
group |
0 |
1 |
FALSE |
2 |
A제품: 40, B제품: 40 |
Variable type: numeric
time |
0 |
1 |
5.02 |
0.75 |
3.56 |
4.5 |
4.83 |
5.52 |
6.75 |
▂▇▅▃▂ |
edu_tb %>%
group_by(group) %>%
get_summary_stats(time)
## # A tibble: 2 x 14
## group variable n min max median q1 q3 iqr mad mean sd
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A제품 time 40 3.67 6.75 4.79 4.5 5.54 1.04 0.682 5.03 0.78
## 2 B제품 time 40 3.56 6.67 5 4.56 5.39 0.828 0.652 5.02 0.738
## # ... with 2 more variables: se <dbl>, ci <dbl>
그래프 그리기(박스그래프,히스토그램)
edu_tb %>%
ggplot(mapping = aes(x = group,
y = time)) +
geom_boxplot() +
labs(y = "time")
edu_tb %>%
ggplot(mapping = aes(x = time)) +
geom_histogram(bins = 100,
color = "white",
fill = "steelblue") +
facet_wrap(~ group)
이상치 제거
이상치 확인
edu_tb %>%
group_by(group) %>%
identify_outliers(time)
## # A tibble: 1 x 4
## group time is.outlier is.extreme
## <fct> <dbl> <lgl> <lgl>
## 1 B제품 6.67 TRUE FALSE
이상치 제거
edu_tb <- edu_tb %>%
filter(!(group == "B제품" & time >= 6.67 ))
edu_tb %>%
ggplot(mapping = aes(x = group,
y = time)) +
geom_boxplot() +
labs(y = "time")
정규분포 검정
edu_tb %>%
group_by(group) %>%
shapiro_test(time)
## # A tibble: 2 x 4
## group variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 A제품 time 0.957 0.127
## 2 B제품 time 0.943 0.0489
p값이 0.05 보다 크므로 정규분포를 따른다.
등분산 검정
edu_tb %>%
levene_test(time ~ group)
## # A tibble: 1 x 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 77 0.514 0.475
p값이 0.05보다 크므로 등분산 성립
등분산 t-검정
edu_tb %>%
t_test(formula = time ~ group,
comparisons = c("A제품", "B제품"),
paired = FALSE,
var.equal = TRUE, # 등분산 성립하므로 TRUE
alternative = "two.sided",
detailed = TRUE)
## # A tibble: 1 x 15
## estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p
## * <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 0.0498 5.03 4.98 time A제품 B제품 40 39 0.299 0.766
## # ... with 5 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>,
## # method <chr>, alternative <chr>
edu_tb %>%
cohens_d(formula = time ~ group,
var.equal = TRUE)
## # A tibble: 1 x 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 time A제품 B제품 0.0673 40 39 negligible
negligible -> 거의 차이가 없다(A제품과 B제품끼리)
추론(infer)을 이용한 가설검정 및 그래프
표본평균(x)을 이용한 검정그래프
표본평균(x) 계산
x_bar <- edu_tb %>%
specify(formula = time ~ group) %>% # hypothesize 없음
calculate(stat = "diff in means",
order= c("A제품", "B제품")) %>%
print()
## Response: time (numeric)
## Explanatory: group (factor)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.0498
Bootstrapping을 이용한 귀무가설 분포 생성
set.seed(123)
null_dist_x <- edu_tb %>%
specify(formula = time ~ group) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means",
order= c("A제품", "B제품")) %>%
print()
## Response: time (numeric)
## Explanatory: group (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.190
## 2 2 0.409
## 3 3 0.150
## 4 4 0.125
## 5 5 0.0711
## 6 6 0.173
## 7 7 0.112
## 8 8 -0.0307
## 9 9 0.181
## 10 10 -0.0808
## # ... with 990 more rows
신뢰구간 생성
null_dist_ci <- null_dist_x %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -0.321 0.340
그래프 그리기
null_dist_x %>%
visualize() + # method 없음
shade_p_value(obs_stat = x_bar,
direction = "two-sided") + # x_bar
shade_confidence_interval(endpoints = null_dist_ci) # CI
P값
null_dist_x %>%
get_p_value(obs_stat = x_bar,
direction = "two-sided") %>%
print()
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.77
P값이 0.05보다 크므로 A제품과 B제품의 지속효과 차이는 없다.(귀무가설 성립)
t값을 이용한 검정그래프
t_cal 계산
t_cal <- edu_tb %>%
specify(formula = time ~ group) %>% # hypothesize 없음
calculate(stat = "t", # 두 집단은 t 검정 사용
order= c("A제품", "B제품")) %>%
print()
## Response: time (numeric)
## Explanatory: group (factor)
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.300
Bootstrapping을 이용한 귀무가설 분포 생성
set.seed(123)
null_dist_t <- edu_tb %>%
specify(formula = time ~ group) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t",
order= c("A제품", "B제품")) %>%
print()
## Response: time (numeric)
## Explanatory: group (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 1.15
## 2 2 2.56
## 3 3 0.905
## 4 4 0.754
## 5 5 0.427
## 6 6 1.05
## 7 7 0.674
## 8 8 -0.185
## 9 9 1.10
## 10 10 -0.486
## # ... with 990 more rows
신뢰구간 생성
null_dist_ci <- null_dist_t %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -1.97 2.10
null_dist_t %>%
visualize(method = "both") + #method = "both": 이론분포+boot분포
shade_p_value(obs_stat = t_cal,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.
Leave a comment