Share on

데이터마이닝의 이론과 실제 6주차

library(tidymodels)

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v recipes      0.2.0
## v dials        0.1.1     v rsample      0.1.1
## v dplyr        1.0.8     v tibble       3.1.6
## v ggplot2      3.3.5     v tidyr        1.2.0
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v purrr        0.3.4     v yardstick    0.0.9

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v readr   2.1.2     v forcats 0.5.1
## v stringr 1.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()

library(dplyr)
library(ggplot2)

데이터 불러오기

data(mpg)

glimpse(mpg)

## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "~
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "~
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.~
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200~
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, ~
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto~
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4~
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1~
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2~
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p~
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c~

str(mpg) #tibble 구조

## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...

mpg

## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

기본함수

#barplot(mpg$class)

tb <- table(mpg$class)
tb

## 
##    2seater    compact    midsize    minivan     pickup subcompact        suv 
##          5         47         41         11         33         35         62

barplot(tb)

# 빈도수

ggplot(data = mpg,
       mapping = aes(x=class)) + 
  geom_bar()

ggplot(data=mpg,
       mapping = aes(x=class,
                     y=stat(prop),
                     group=1)) +
  geom_bar()

ggplot에 기본 문법

ggplot(data = mpg,
       mapping = aes(x=displ,
                     y = hwy))+
  geom_point()

ggplot(data = mpg) +
  geom_point(mapping = aes(x=displ,
                           y=hwy))

# aesthetic # 변수에 대해서 매핑

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy,
                     color=class)) +
  geom_point()

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point(color="blue")

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy,
                     color=class,
                     size = hwy,
                     alpha = displ)) +
  geom_point()

facet

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  facet_wrap(~class,
             nrow = 2) # ncol=2

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  facet_wrap(cyl~class,
             nrow = 2) # ncol=2

geom 여러개 사용할 경우

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data=mpg,
       mapping = aes(x=displ,
                     y=hwy))+
  geom_point(mapping = aes(color=class))

# 수평 , 수직

ggplot(data=mpg,
       mapping = aes(x=class,
                     y=hwy)) +
  geom_boxplot() + 
  coord_flip() # x,y축 변경

one sample t-test

library(rstatix)

## 
## 다음의 패키지를 부착합니다: 'rstatix'

## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test

## The following object is masked from 'package:dials':
## 
##     get_n

## The following object is masked from 'package:stats':
## 
##     filter

library(skimr)

데이터 불어오기

ost_tb <- read_csv("05.ost.csv",
                   col_names = TRUE,
                   na = ".")

## Rows: 100 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): weight
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(ost_tb)

## spec_tbl_df [100 x 1] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ weight: num [1:100] 319 242 291 276 348 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   weight = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

glimpse(ost_tb)

## Rows: 100
## Columns: 1
## $ weight <dbl> 319.3148, 241.9693, 290.9807, 276.0801, 347.5499, 298.8435, 292~

ost_tb

## # A tibble: 100 x 1
##    weight
##     <dbl>
##  1   319.
##  2   242.
##  3   291.
##  4   276.
##  5   348.
##  6   299.
##  7   293.
##  8   304.
##  9   296.
## 10   286.
## # ... with 90 more rows

기초 통계 분석

skim(ost_tb)

Data summary
Name	ost_tb
Number of rows	100
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
weight	0	1	295.44	20.04	241.97	283.14	295.77	309.43	347.55	▂▅▇▇▁

ost_tb %>%
  get_summary_stats(weight)

## # A tibble: 1 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 weight     100  242.  348.   296.  283.  309.  26.3  19.7  295.  20.0  2.00
## # ... with 1 more variable: ci <dbl>

그래프 그리기

ost_tb %>%
  ggplot(mapping = aes(y=weight)) +
  geom_boxplot() +
  labs(y = "몸무게")

ost_tb %>%
  ggplot(mapping = aes(x=weight)) +
  geom_histogram(binwidth = 10) +
  coord_cartesian(xlim = c(200,400),
                  ylim = c(0,30))

# 이상치 제거

ost_tb %>%
  identify_outliers(weight)

## # A tibble: 1 x 3
##   weight is.outlier is.extreme
##    <dbl> <lgl>      <lgl>     
## 1   242. TRUE       FALSE

ost_tb <- ost_tb %>%
  filter(!(weight <=242))

ost_tb %>%
  ggplot(mapping = aes(y=weight)) +
  geom_boxplot() +
  labs(y = "몸무게")

정규분포

ost_tb %>%
  shapiro_test(weight)

## # A tibble: 1 x 3
##   variable statistic     p
##   <chr>        <dbl> <dbl>
## 1 weight       0.987 0.422

통계분석(차이검증)

ost_tb %>%
  t_test(formula = weight ~ 1,
         alternative = "two.sided",
         mu = 320,
         conf.level = 0.95,
         detailed = TRUE)

## # A tibble: 1 x 12
##   estimate .y.   group1 group2     n statistic        p    df conf.low conf.high
## *    <dbl> <chr> <chr>  <chr>  <int>     <dbl>    <dbl> <dbl>    <dbl>     <dbl>
## 1     296. weig~ 1      null ~    99     -12.3 1.27e-21    98     292.      300.
## # ... with 2 more variables: method <chr>, alternative <chr>

ost_tb %>%
  cohens_d(formula = weight ~ 1,
           mu=320)

## # A tibble: 1 x 6
##   .y.    group1 group2     effsize     n magnitude
## * <chr>  <chr>  <chr>        <dbl> <int> <ord>    
## 1 weight 1      null model   -1.24    99 large

추론(infer)

표본 평균(x)

x_bar <- ost_tb %>%
  specify(response = weight) %>%
  calculate(stat = "mean")

귀무가설의 분포를 만들어주라

null_dist_x <- ost_tb %>%
  specify(response = weight) %>%
  hypothesise(null = "point",
              mu = 320) %>% # 귀무가설의 값:320
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  print()

## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  322.
##  2         2  321.
##  3         3  323.
##  4         4  320.
##  5         5  317.
##  6         6  319.
##  7         7  324.
##  8         8  320.
##  9         9  322.
## 10        10  319.
## # ... with 990 more rows

null_dist_ci <- null_dist_x %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     316.     324.

null_dist_x %>%
  visualize() + 
  shade_p_value(obs_stat = x_bar,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)

null_dist_x %>%
  get_p_value(obs_stat = x_bar,
              direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

t분포

t_cal <- ost_tb %>%
  specify(response = weight) %>%
  hypothesize(null = "point",
              mu = 320) %>%
  calculate(stat = "t") %>%
  print()

## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -12.3

null_dist_t <- ost_tb %>%
  specify(response = weight) %>%
  hypothesise(null = "point",
              mu = 320) %>% # 귀무가설의 값:320
  generate(reps = 1000,
           type = "bootstrap") %>%
  calculate(stat = "t") %>%
  print()

## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.120
##  2         2 -0.170
##  3         3  0.409
##  4         4  0.623
##  5         5  1.13 
##  6         6 -1.67 
##  7         7  0.796
##  8         8  0.354
##  9         9 -0.300
## 10        10  0.921
## # ... with 990 more rows

null_dist_ci <- null_dist_t %>%
  get_ci(level = 0.95,
         type = "percentile") %>%
  print()

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    -1.86     1.92

null_dist_t %>%
  visualize(method = "both") + 
  shade_p_value(obs_stat = t_cal,
                direction = "two-sided") + 
  shade_confidence_interval(endpoints = null_dist_ci)

## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.

Share on

Copy URL

Kakao Facebook Twitter LinkedIn

SeongJae Yoo

Data mining theory and reality06

Share on

6주차

데이터 불러오기

기본함수

ggplot에 기본 문법

facet

geom 여러개 사용할 경우

one sample t-test

데이터 불어오기

기초 통계 분석

그래프 그리기

정규분포

통계분석(차이검증)

추론(infer)

표본 평균(x)

귀무가설의 분포를 만들어주라

t분포

Share on

Leave a comment

You may also enjoy

자동주식투자프로그램 세팅방법

Data Mining Theory And Reality Midtestexample04

Data Mining Theory And Reality Midtestexample03

Data Mining Theory And Reality Midtestexample02