2022-06-10-Data-mining-theory-and-reality-midtestExample01

1. A제약회사에서는 이번에 제품A와 제품B 두 가지 타입의 진통제를 개발하였다. 과연 이 두 가지 타입의 제품은 지속효과(시간)이 차이가 없는가?

(data1.sav) -. group : 1)A제품 2)B제품-. time : 지속효과

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
## 
##     get_n
## The following object is masked from 'package:stats':
## 
##     filter
library(skimr)
edu_tb <- read_csv('data1.csv', 
                   col_names = TRUE,
                   locale=locale('ko', encoding='euc-kr'), # 한글
                   na=".") %>%
  round(3) %>%                 # 소수점 2자리로 반올림
  mutate_if(is.character, as.factor) %>%
  mutate(group = factor(group,
                       levels=c(1:2),
                       labels=c("A제품","B제품")))
## Rows: 80 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): group, time
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(edu_tb)
## tibble [80 x 2] (S3: tbl_df/tbl/data.frame)
##  $ group: Factor w/ 2 levels "A제품","B제품": 1 1 1 1 1 1 1 1 1 1 ...
##  $ time : num [1:80] 4.25 5.25 5.75 4.08 4.5 4.5 4.5 5.83 6.42 6.75 ...
edu_tb
## # A tibble: 80 x 2
##    group  time
##    <fct> <dbl>
##  1 A제품  4.25
##  2 A제품  5.25
##  3 A제품  5.75
##  4 A제품  4.08
##  5 A제품  4.5 
##  6 A제품  4.5 
##  7 A제품  4.5 
##  8 A제품  5.83
##  9 A제품  6.42
## 10 A제품  6.75
## # ... with 70 more rows

기술통계분석

skim(edu_tb)
Data summary
Name edu_tb
Number of rows 80
Number of columns 2
_______________________
Column type frequency:
factor 1
numeric 1
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
group 0 1 FALSE 2 A제품: 40, B제품: 40

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
time 0 1 5.02 0.75 3.56 4.5 4.83 5.52 6.75 ▂▇▅▃▂
edu_tb %>% 
  group_by(group) %>%
  get_summary_stats(time)
## # A tibble: 2 x 14
##   group variable     n   min   max median    q1    q3   iqr   mad  mean    sd
##   <fct> <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A제품 time        40  3.67  6.75   4.79  4.5   5.54 1.04  0.682  5.03 0.78 
## 2 B제품 time        40  3.56  6.67   5     4.56  5.39 0.828 0.652  5.02 0.738
## # ... with 2 more variables: se <dbl>, ci <dbl>

그래프 그리기(박스그래프,히스토그램)

edu_tb %>% 
  ggplot(mapping = aes(x = group,
                       y = time)) +
  geom_boxplot() +
  labs(y = "time")

edu_tb %>% 
  ggplot(mapping = aes(x = time)) +
  geom_histogram(bins = 100,               
                 color = "white", 
                 fill = "steelblue") +
  facet_wrap(~ group)                   

이상치 제거

이상치 확인

edu_tb %>%
  group_by(group) %>%
  identify_outliers(time)
## # A tibble: 1 x 4
##   group  time is.outlier is.extreme
##   <fct> <dbl> <lgl>      <lgl>     
## 1 B제품  6.67 TRUE       FALSE

이상치 제거

edu_tb <- edu_tb %>%
  filter(!(group == "B제품" & time >= 6.67 )) 
edu_tb %>% 
  ggplot(mapping = aes(x = group,
                       y = time)) +
  geom_boxplot() +
  labs(y = "time")

정규분포 검정

edu_tb %>%
  group_by(group) %>%
  shapiro_test(time)
## # A tibble: 2 x 4
##   group variable statistic      p
##   <fct> <chr>        <dbl>  <dbl>
## 1 A제품 time         0.957 0.127 
## 2 B제품 time         0.943 0.0489

p값이 0.05 보다 크므로 정규분포를 따른다.

등분산 검정

edu_tb %>%
  levene_test(time ~ group)
## # A tibble: 1 x 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    77     0.514 0.475

p값이 0.05보다 크므로 등분산 성립

등분산 t-검정

edu_tb %>% 
  t_test(formula = time ~ group,
         comparisons = c("A제품", "B제품"),
         paired = FALSE,
         var.equal = TRUE, # 등분산 성립하므로 TRUE
         alternative = "two.sided", 
         detailed = TRUE)
## # A tibble: 1 x 15
##   estimate estimate1 estimate2 .y.   group1 group2    n1    n2 statistic     p
## *    <dbl>     <dbl>     <dbl> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1   0.0498      5.03      4.98 time  A제품  B제품     40    39     0.299 0.766
## # ... with 5 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>,
## #   method <chr>, alternative <chr>
edu_tb %>% 
  cohens_d(formula = time ~ group, 
           var.equal = TRUE)
## # A tibble: 1 x 7
##   .y.   group1 group2 effsize    n1    n2 magnitude 
## * <chr> <chr>  <chr>    <dbl> <int> <int> <ord>     
## 1 time  A제품  B제품   0.0673    40    39 negligible

negligible -> 거의 차이가 없다(A제품과 B제품끼리)

추론(infer)을 이용한 가설검정 및 그래프

표본평균(x)을 이용한 검정그래프

표본평균(x) 계산

x_bar <- edu_tb %>%
  specify(formula = time ~ group) %>%   # hypothesize 없음
  calculate(stat = "diff in means",         
            order= c("A제품", "B제품")) %>%
  print()
## Response: time (numeric)
## Explanatory: group (factor)
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 0.0498

Bootstrapping을 이용한 귀무가설 분포 생성

set.seed(123) 
null_dist_x <- edu_tb %>%
  specify(formula = time ~ group) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", 
            order= c("A제품", "B제품")) %>%
  print()
## Response: time (numeric)
## Explanatory: group (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.190 
##  2         2  0.409 
##  3         3  0.150 
##  4         4  0.125 
##  5         5  0.0711
##  6         6  0.173 
##  7         7  0.112 
##  8         8 -0.0307
##  9         9  0.181 
## 10        10 -0.0808
## # ... with 990 more rows

신뢰구간 생성

null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95, 
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -0.321    0.340

그래프 그리기

null_dist_x %>%
  visualize() +                                       # method 없음
  shade_p_value(obs_stat = x_bar,
                direction = "two-sided") +            # x_bar
  shade_confidence_interval(endpoints = null_dist_ci) # CI

P값

null_dist_x %>%
  get_p_value(obs_stat = x_bar,           
              direction = "two-sided") %>%
  print()
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1    0.77

P값이 0.05보다 크므로 A제품과 B제품의 지속효과 차이는 없다.(귀무가설 성립)

t값을 이용한 검정그래프

t_cal 계산

t_cal <- edu_tb %>%
  specify(formula = time ~ group) %>% # hypothesize 없음
  calculate(stat = "t",  # 두 집단은 t 검정 사용
            order= c("A제품", "B제품")) %>%
  print()
## Response: time (numeric)
## Explanatory: group (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.300

Bootstrapping을 이용한 귀무가설 분포 생성

set.seed(123) 
null_dist_t <- edu_tb %>%
  specify(formula = time ~ group) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", 
            order= c("A제품", "B제품")) %>%
  print()
## Response: time (numeric)
## Explanatory: group (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  1.15 
##  2         2  2.56 
##  3         3  0.905
##  4         4  0.754
##  5         5  0.427
##  6         6  1.05 
##  7         7  0.674
##  8         8 -0.185
##  9         9  1.10 
## 10        10 -0.486
## # ... with 990 more rows

신뢰구간 생성

null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95, 
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -1.97     2.10
null_dist_t %>%
  visualize(method = "both") +           #method = "both": 이론분포+boot분포
  shade_p_value(obs_stat = t_cal,
                direction = "two-sided") +
  shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

Leave a comment