데이터마이닝의 이론과 실제 6주차
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v recipes      0.2.0
## v dials        0.1.1     v rsample      0.1.1
## v dplyr        1.0.8     v tibble       3.1.6
## v ggplot2      3.3.5     v tidyr        1.2.0
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v purrr        0.3.4     v yardstick    0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v readr   2.1.2     v forcats 0.5.1
## v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()
library(dplyr)
library(ggplot2)

데이터 불러오기

data(mpg)
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "~
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "~
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.~
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200~
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, ~
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto~
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4~
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1~
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2~
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p~
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c~
str(mpg) #tibble 구조
## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
mpg
## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

기본함수

#barplot(mpg$class)
tb <- table(mpg$class)
tb
## 
##    2seater    compact    midsize    minivan     pickup subcompact        suv 
##          5         47         41         11         33         35         62
barplot(tb)

# 빈도수

ggplot(data = mpg,
       mapping = aes(x=class)) + 
  geom_bar()

ggplot(data=mpg,
       mapping = aes(x=class,
                     y=stat(prop),
                     group=1)) +
  geom_bar()

ggplot에 기본 문법

ggplot(data = mpg,
       mapping = aes(x=displ,
                     y = hwy))+
  geom_point()

ggplot(data = mpg) +
  geom_point(mapping = aes(x=displ,
                           y=hwy))

# aesthetic # 변수에 대해서 매핑

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy,
                     color=class)) +
  geom_point()

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point(color="blue")

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy,
                     color=class,
                     size = hwy,
                     alpha = displ)) +
  geom_point()

facet

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  facet_wrap(~class,
             nrow = 2) # ncol=2

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  facet_wrap(cyl~class,
             nrow = 2) # ncol=2

geom 여러개 사용할 경우

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point(mapping = aes(color=class)) 

# 수평 , 수직

ggplot(data=mpg,
       mapping = aes(x=class,
                     y=hwy)) +
  geom_boxplot() + 
  coord_flip() # x,y축 변경

one sample t-test

library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
## 
##     get_n
## The following object is masked from 'package:stats':
## 
##     filter
library(skimr)

데이터 불어오기

ost_tb <- read_csv("05.ost.csv",
                   col_names = TRUE,
                   na = ".")
## Rows: 100 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): weight
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(ost_tb)
## spec_tbl_df [100 x 1] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ weight: num [1:100] 319 242 291 276 348 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   weight = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
glimpse(ost_tb)
## Rows: 100
## Columns: 1
## $ weight <dbl> 319.3148, 241.9693, 290.9807, 276.0801, 347.5499, 298.8435, 292~
ost_tb
## # A tibble: 100 x 1
##    weight
##     <dbl>
##  1   319.
##  2   242.
##  3   291.
##  4   276.
##  5   348.
##  6   299.
##  7   293.
##  8   304.
##  9   296.
## 10   286.
## # ... with 90 more rows

기초 통계 분석

skim(ost_tb)
Data summary
Name ost_tb
Number of rows 100
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
weight 0 1 295.44 20.04 241.97 283.14 295.77 309.43 347.55 ▂▅▇▇▁
ost_tb %>%
  get_summary_stats(weight)
## # A tibble: 1 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 weight     100  242.  348.   296.  283.  309.  26.3  19.7  295.  20.0  2.00
## # ... with 1 more variable: ci <dbl>

그래프 그리기

ost_tb %>%
  ggplot(mapping = aes(y=weight)) +
  geom_boxplot() +
  labs(y = "몸무게")

ost_tb %>%
  ggplot(mapping = aes(x=weight)) +
  geom_histogram(binwidth = 10) +
  coord_cartesian(xlim = c(200,400),
                  ylim = c(0,30))

# 이상치 제거

ost_tb %>%
  identify_outliers(weight)
## # A tibble: 1 x 3
##   weight is.outlier is.extreme
##    <dbl> <lgl>      <lgl>     
## 1   242. TRUE       FALSE
ost_tb <- ost_tb %>%
  filter(!(weight <=242))
ost_tb %>%
  ggplot(mapping = aes(y=weight)) +
  geom_boxplot() +
  labs(y = "몸무게")

정규분포

ost_tb %>%
  shapiro_test(weight)
## # A tibble: 1 x 3
##   variable statistic     p
##   <chr>        <dbl> <dbl>
## 1 weight       0.987 0.422

통계분석(차이검증)

ost_tb %>%
  t_test(formula = weight ~ 1,
         alternative = "two.sided",
         mu = 320,
         conf.level = 0.95,
         detailed = TRUE)
## # A tibble: 1 x 12
##   estimate .y.   group1 group2     n statistic        p    df conf.low conf.high
## *    <dbl> <chr> <chr>  <chr>  <int>     <dbl>    <dbl> <dbl>    <dbl>     <dbl>
## 1     296. weig~ 1      null ~    99     -12.3 1.27e-21    98     292.      300.
## # ... with 2 more variables: method <chr>, alternative <chr>
ost_tb %>%
  cohens_d(formula = weight ~ 1,
           mu=320)
## # A tibble: 1 x 6
##   .y.    group1 group2     effsize     n magnitude
## * <chr>  <chr>  <chr>        <dbl> <int> <ord>    
## 1 weight 1      null model   -1.24    99 large

추론(infer)

표본 평균(x)

x_bar <- ost_tb %>%
  specify(response = weight) %>%
  calculate(stat = "mean")

귀무가설의 분포를 만들어주라

null_dist_x <- ost_tb %>%
  specify(response = weight) %>%
  hypothesise(null = "point",
              mu = 320) %>% # 귀무가설의 값:320
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  322.
##  2         2  321.
##  3         3  323.
##  4         4  320.
##  5         5  317.
##  6         6  319.
##  7         7  324.
##  8         8  320.
##  9         9  322.
## 10        10  319.
## # ... with 990 more rows
null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     316.     324.
null_dist_x %>%
  visualize() + 
  shade_p_value(obs_stat = x_bar,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)

null_dist_x %>%
  get_p_value(obs_stat = x_bar,
              direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

t분포

t_cal <- ost_tb %>%
  specify(response = weight) %>%
  hypothesize(null = "point",
              mu = 320) %>%
  calculate(stat = "t") %>%
  print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -12.3
null_dist_t <- ost_tb %>%
  specify(response = weight) %>%
  hypothesise(null = "point",
              mu = 320) %>% # 귀무가설의 값:320
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "t") %>%
  print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.120
##  2         2 -0.170
##  3         3  0.409
##  4         4  0.623
##  5         5  1.13 
##  6         6 -1.67 
##  7         7  0.796
##  8         8  0.354
##  9         9 -0.300
## 10        10  0.921
## # ... with 990 more rows
null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -1.86     1.92
null_dist_t %>%
  visualize(method = "both") + 
  shade_p_value(obs_stat = t_cal,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

Leave a comment