Data mining theory and reality06
6주차
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v recipes 0.2.0
## v dials 0.1.1 v rsample 0.1.1
## v dplyr 1.0.8 v tibble 3.1.6
## v ggplot2 3.3.5 v tidyr 1.2.0
## v infer 1.0.0 v tune 0.2.0
## v modeldata 0.1.1 v workflows 0.2.6
## v parsnip 0.2.1 v workflowsets 0.2.1
## v purrr 0.3.4 v yardstick 0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v readr 2.1.2 v forcats 0.5.1
## v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x readr::spec() masks yardstick::spec()
library(dplyr)
library(ggplot2)
데이터 불러오기
data(mpg)
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "~
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "~
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.~
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200~
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, ~
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto~
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4~
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1~
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2~
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p~
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c~
str(mpg) #tibble 구조
## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 quattro 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 quattro 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 quattro 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
기본함수
#barplot(mpg$class)
tb <- table(mpg$class)
tb
##
## 2seater compact midsize minivan pickup subcompact suv
## 5 47 41 11 33 35 62
barplot(tb)
# 빈도수
ggplot(data = mpg,
mapping = aes(x=class)) +
geom_bar()
ggplot(data=mpg,
mapping = aes(x=class,
y=stat(prop),
group=1)) +
geom_bar()
ggplot에 기본 문법
ggplot(data = mpg,
mapping = aes(x=displ,
y = hwy))+
geom_point()
ggplot(data = mpg) +
geom_point(mapping = aes(x=displ,
y=hwy))
# aesthetic # 변수에 대해서 매핑
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy,
color=class)) +
geom_point()
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy))+
geom_point(color="blue")
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy,
color=class,
size = hwy,
alpha = displ)) +
geom_point()
facet
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy))+
geom_point() +
facet_wrap(~class,
nrow = 2) # ncol=2
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy))+
geom_point() +
facet_wrap(cyl~class,
nrow = 2) # ncol=2
geom 여러개 사용할 경우
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy))+
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data=mpg,
mapping = aes(x=displ,
y=hwy))+
geom_point(mapping = aes(color=class))
# 수평 , 수직
ggplot(data=mpg,
mapping = aes(x=class,
y=hwy)) +
geom_boxplot() +
coord_flip() # x,y축 변경
one sample t-test
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
데이터 불어오기
ost_tb <- read_csv("05.ost.csv",
col_names = TRUE,
na = ".")
## Rows: 100 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): weight
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(ost_tb)
## spec_tbl_df [100 x 1] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ weight: num [1:100] 319 242 291 276 348 ...
## - attr(*, "spec")=
## .. cols(
## .. weight = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
glimpse(ost_tb)
## Rows: 100
## Columns: 1
## $ weight <dbl> 319.3148, 241.9693, 290.9807, 276.0801, 347.5499, 298.8435, 292~
ost_tb
## # A tibble: 100 x 1
## weight
## <dbl>
## 1 319.
## 2 242.
## 3 291.
## 4 276.
## 5 348.
## 6 299.
## 7 293.
## 8 304.
## 9 296.
## 10 286.
## # ... with 90 more rows
기초 통계 분석
skim(ost_tb)
Name | ost_tb |
Number of rows | 100 |
Number of columns | 1 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
weight | 0 | 1 | 295.44 | 20.04 | 241.97 | 283.14 | 295.77 | 309.43 | 347.55 | ▂▅▇▇▁ |
ost_tb %>%
get_summary_stats(weight)
## # A tibble: 1 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 weight 100 242. 348. 296. 283. 309. 26.3 19.7 295. 20.0 2.00
## # ... with 1 more variable: ci <dbl>
그래프 그리기
ost_tb %>%
ggplot(mapping = aes(y=weight)) +
geom_boxplot() +
labs(y = "몸무게")
ost_tb %>%
ggplot(mapping = aes(x=weight)) +
geom_histogram(binwidth = 10) +
coord_cartesian(xlim = c(200,400),
ylim = c(0,30))
# 이상치 제거
ost_tb %>%
identify_outliers(weight)
## # A tibble: 1 x 3
## weight is.outlier is.extreme
## <dbl> <lgl> <lgl>
## 1 242. TRUE FALSE
ost_tb <- ost_tb %>%
filter(!(weight <=242))
ost_tb %>%
ggplot(mapping = aes(y=weight)) +
geom_boxplot() +
labs(y = "몸무게")
정규분포
ost_tb %>%
shapiro_test(weight)
## # A tibble: 1 x 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 weight 0.987 0.422
통계분석(차이검증)
ost_tb %>%
t_test(formula = weight ~ 1,
alternative = "two.sided",
mu = 320,
conf.level = 0.95,
detailed = TRUE)
## # A tibble: 1 x 12
## estimate .y. group1 group2 n statistic p df conf.low conf.high
## * <dbl> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 296. weig~ 1 null ~ 99 -12.3 1.27e-21 98 292. 300.
## # ... with 2 more variables: method <chr>, alternative <chr>
ost_tb %>%
cohens_d(formula = weight ~ 1,
mu=320)
## # A tibble: 1 x 6
## .y. group1 group2 effsize n magnitude
## * <chr> <chr> <chr> <dbl> <int> <ord>
## 1 weight 1 null model -1.24 99 large
추론(infer)
표본 평균(x)
x_bar <- ost_tb %>%
specify(response = weight) %>%
calculate(stat = "mean")
귀무가설의 분포를 만들어주라
null_dist_x <- ost_tb %>%
specify(response = weight) %>%
hypothesise(null = "point",
mu = 320) %>% # 귀무가설의 값:320
generate(reps = 1000,
type = "bootstrap") %>%
calculate(stat = "mean") %>%
print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 322.
## 2 2 321.
## 3 3 323.
## 4 4 320.
## 5 5 317.
## 6 6 319.
## 7 7 324.
## 8 8 320.
## 9 9 322.
## 10 10 319.
## # ... with 990 more rows
null_dist_ci <- null_dist_x %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 316. 324.
null_dist_x %>%
visualize() +
shade_p_value(obs_stat = x_bar,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
null_dist_x %>%
get_p_value(obs_stat = x_bar,
direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
t분포
t_cal <- ost_tb %>%
specify(response = weight) %>%
hypothesize(null = "point",
mu = 320) %>%
calculate(stat = "t") %>%
print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -12.3
null_dist_t <- ost_tb %>%
specify(response = weight) %>%
hypothesise(null = "point",
mu = 320) %>% # 귀무가설의 값:320
generate(reps = 1000,
type = "bootstrap") %>%
calculate(stat = "t") %>%
print()
## Response: weight (numeric)
## Null Hypothesis: point
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.120
## 2 2 -0.170
## 3 3 0.409
## 4 4 0.623
## 5 5 1.13
## 6 6 -1.67
## 7 7 0.796
## 8 8 0.354
## 9 9 -0.300
## 10 10 0.921
## # ... with 990 more rows
null_dist_ci <- null_dist_t %>%
get_ci(level = 0.95,
type = "percentile") %>%
print()
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -1.86 1.92
null_dist_t %>%
visualize(method = "both") +
shade_p_value(obs_stat = t_cal,
direction = "two-sided") +
shade_confidence_interval(endpoints = null_dist_ci)
## Warning: Check to make sure the conditions have been met for the theoretical
## method. {infer} currently does not check these for you.
Leave a comment