2022-06-10-Data-mining-theory-and-reality-midtestExample04

4. K제약회사에서는 새로운 진통제를 개발하였다. 새로운 진통제의 지속효과가 5시간 이상이라고 말할 수 있는가?(data4.sav)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
## 
##     get_n
## The following object is masked from 'package:stats':
## 
##     filter
library(skimr)
sat_tb <- read_csv('data4.csv', 
                   col_names = TRUE,
                   locale=locale('ko', encoding='euc-kr'), # 한글
                   na=".") %>%
  round(2) %>%                  # 소수점 2자리로 반올림
  mutate_if(is.character, as.factor)
## Rows: 40 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): pscore
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(sat_tb)
## tibble [40 x 1] (S3: tbl_df/tbl/data.frame)
##  $ pscore: num [1:40] 4.5 4.5 4.6 4.7 4.8 4.8 4.8 4.8 4.8 4.9 ...
glimpse(sat_tb)
## Rows: 40
## Columns: 1
## $ pscore <dbl> 4.5, 4.5, 4.6, 4.7, 4.8, 4.8, 4.8, 4.8, 4.8, 4.9, 4.9, 4.9, 4.9~
sat_tb
## # A tibble: 40 x 1
##    pscore
##     <dbl>
##  1    4.5
##  2    4.5
##  3    4.6
##  4    4.7
##  5    4.8
##  6    4.8
##  7    4.8
##  8    4.8
##  9    4.8
## 10    4.9
## # ... with 30 more rows

기본통계치 확인

skim(sat_tb)
Data summary
Name sat_tb
Number of rows 40
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
pscore 0 1 5.07 0.26 4.5 4.9 5.1 5.3 5.4 ▂▃▆▃▇
sat_tb %>%
  get_summary_stats(pscore)
## # A tibble: 1 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 pscore      40   4.5   5.4    5.1   4.9   5.3   0.4 0.297  5.07 0.262 0.041
## # ... with 1 more variable: ci <dbl>

그래프 그리기(박스그래프,히스토그램)

sat_tb %>% 
  ggplot(aes(y = pscore)) +
  geom_boxplot() +
  labs(y = "지속효과")

sat_tb %>% 
  ggplot(mapping = aes(x = pscore)) +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian()

이상치 제거

이상치 확인

sat_tb %>% 
  identify_outliers(pscore)
## [1] pscore     is.outlier is.extreme
## <0 행> <또는 row.names의 길이가 0입니다>

정규분포 검정

sat_tb %>%
  shapiro_test(pscore)
## # A tibble: 1 x 3
##   variable statistic       p
##   <chr>        <dbl>   <dbl>
## 1 pscore       0.914 0.00503

0.05 보다 작으므로 정규분포 성립안됩니다.

#kruskal_test(data = sat_tb, formula = pscore ~1)

sat_tb %>% 
        t_test(formula = pscore ~ 1,
               alternative = "greater",  
               mu = 5, 
               conf.level = 0.95,
              detailed = TRUE)
## # A tibble: 1 x 12
##   estimate .y.    group1 group2      n statistic      p    df conf.low conf.high
## *    <dbl> <chr>  <chr>  <chr>   <int>     <dbl>  <dbl> <dbl>    <dbl>     <dbl>
## 1     5.07 pscore 1      null m~    40      1.63 0.0553    39     5.00       Inf
## # ... with 2 more variables: method <chr>, alternative <chr>
sat_tb %>% 
  cohens_d(formula = pscore ~ 1, 
           mu = 5)
## # A tibble: 1 x 6
##   .y.    group1 group2     effsize     n magnitude
## * <chr>  <chr>  <chr>        <dbl> <int> <ord>    
## 1 pscore 1      null model   0.258    40 small

추론(infer)을 이용한 가설검정 및 그래프

표본평균(x) 계산

x_bar <- sat_tb %>%
  specify(response = pscore) %>%    # hypothesize 없음
  calculate(stat = "mean") %>%      # stat = "mean"
  print()
## Response: pscore (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  5.07

Bootstrapping을 이용한 귀무가설 분포 생성

set.seed(123) 
null_dist_x <- sat_tb %>%
  specify(response = pscore) %>%
  hypothesize(null = "point",  # 평균의 비교 : point
              mu = 5) %>%
  generate(reps = 1000, 
           type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  print()
## Response: pscore (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  5.03
##  2         2  5.08
##  3         3  5.03
##  4         4  5   
##  5         5  5   
##  6         6  4.96
##  7         7  5.09
##  8         8  5.02
##  9         9  4.98
## 10        10  4.99
## # ... with 990 more rows

신뢰구간 생성

null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95, 
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     4.92     5.08

그래프

null_dist_x %>%
  visualize() +                                       # method 없음
  shade_p_value(obs_stat = x_bar,
                direction = "greater") +            # x_bar
  shade_confidence_interval(endpoints = null_dist_ci) # CI

p_value

null_dist_x %>%
  get_p_value(obs_stat = x_bar,           
              direction = "greater") %>%
  print()
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.057

p값이 0.05 보다 크므로 지속효과가 5시간 이상이라고 말할 수 있다.

t값을 이용한 검정그래프

t_cal <- sat_tb %>%
  specify(response = pscore) %>%
  hypothesize(null = "point",         # hypothesize 필요
              mu = 5) %>%  
  calculate(stat = "t") %>%           # stat = "t"              
  print()
## Response: pscore (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  1.63

Bootstrapping을 이용한 귀무가설 분포 생성

set.seed(123) 
null_dist_t <- sat_tb %>%
  specify(response = pscore) %>%
  hypothesize(null = "point", 
              mu = 5) %>%
  generate(reps = 1000, 
           type = "bootstrap") %>%
  calculate(stat = "t") %>%
  print()
## Response: pscore (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.653
##  2         2  2.43 
##  3         3  0.896
##  4         4  0    
##  5         5  0    
##  6         6 -0.774
##  7         7  2.15 
##  8         8  0.491
##  9         9 -0.512
## 10        10 -0.274
## # ... with 990 more rows

신뢰구간 생성

null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95, 
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -1.95     2.22
null_dist_t %>%
  visualize(method = "both") +           #method = "both": 이론분포+boot분포
  shade_p_value(obs_stat = t_cal,
                direction = "greater") +
  shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

Leave a comment