데이터마이닝의 이론과 실제 10주차

상관분석

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
## 
##     chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
## 
##     get_n
## The following object is masked from 'package:stats':
## 
##     filter
library(skimr)

데이터 가져오기

corr_tb <- read_csv("12.CORR.csv",
                   col_names = TRUE,
                   na =".",
                   locale = locale("ko", encoding = "euc-kr")) 
## Rows: 30 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): 몸무게, 키
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(corr_tb)
## spec_tbl_df [30 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ 몸무게: num [1:30] 72 72 70 43 48 54 51 52 73 45 ...
##  $ 키    : num [1:30] 176 172 182 160 163 165 168 163 182 148 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   몸무게 = col_double(),
##   ..   키 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
corr_tb
## # A tibble: 30 x 2
##    몸무게    키
##     <dbl> <dbl>
##  1     72   176
##  2     72   172
##  3     70   182
##  4     43   160
##  5     48   163
##  6     54   165
##  7     51   168
##  8     52   163
##  9     73   182
## 10     45   148
## # ... with 20 more rows

기술통계분석

skim(corr_tb)
Data summary
Name corr_tb
Number of rows 30
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
몸무게 0 1 62.70 11.52 43 54.00 63 72.00 88 ▇▆▇▇▂
0 1 170.33 8.68 148 164.25 170 175.75 188 ▁▅▇▅▃
corr_tb %>% 
  get_summary_stats(몸무게, 키)
## # A tibble: 2 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 몸무게      30    43    88     63   54    72   18   13.3   62.7 11.5   2.10
## 2 키          30   148   188    170  164.  176.  11.5  8.90 170.   8.68  1.58
## # ... with 1 more variable: ci <dbl>

그래프 그리기

geom_jitter : 점의 위치 표현

corr_tb %>% 
  ggplot(mapping = aes(x=몸무게,
                       y=키))+
  geom_jitter() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

상관분석

연속형 변수(키,몸무게) 상관관계 분석 : pearson

p값이 0.05미만이므로 상관관계가 있다

corr_tb %>%
  cor_test(몸무게,키,
              method="pearson")
## # A tibble: 1 x 8
##   var1   var2    cor statistic             p conf.low conf.high method 
##   <chr>  <chr> <dbl>     <dbl>         <dbl>    <dbl>     <dbl> <chr>  
## 1 몸무게 키     0.86      8.79 0.00000000154    0.718     0.930 Pearson

단순 선형회귀분석

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 가져오기

slr_tb<- read_csv("13.SLR.csv",
                    col_names = TRUE,
                    na =".",
                    locale = locale("ko", encoding = "euc-kr")) 
## Rows: 62 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): col, fat
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(slr_tb)
## spec_tbl_df [62 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ col: num [1:62] 146 190 179 164 185 ...
##  $ fat: num [1:62] 162 225 194 161 181 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   col = col_double(),
##   ..   fat = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
slr_tb
## # A tibble: 62 x 2
##      col   fat
##    <dbl> <dbl>
##  1  146   162.
##  2  190.  225.
##  3  179.  194.
##  4  164.  161.
##  5  185.  181.
##  6  154.  137.
##  7  179   165.
##  8  225.  228.
##  9  145   113.
## 10  154.  123 
## # ... with 52 more rows

기술통계분석

skim(slr_tb)
Data summary
Name slr_tb
Number of rows 62
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
col 0 1 171.14 28.05 108.4 150.98 169.70 187.25 257.4 ▂▇▇▃▁
fat 0 1 117.15 81.56 40.9 65.43 102.25 141.23 508.5 ▇▃▁▁▁
slr_tb %>% 
  get_summary_stats(col, fat)
## # A tibble: 2 x 13
##   variable     n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 col         62 108.   257.   170. 151.   187.  36.3  26.8  171.  28.1  3.56
## 2 fat         62  40.9  508.   102.  65.4  141.  75.8  56.5  117.  81.6 10.4 
## # ... with 1 more variable: ci <dbl>

그래프 그리기

slr_tb %>% 
  ggplot(mapping = aes(x=col,
                       y=fat))+
  geom_jitter() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

단순회귀분석

다중회귀분석은 독립변수가 2개이상임 => lm(fat~ A+B, data= )

#install.packages("lm.beta")
library(lm.beta)
slr_fit <- lm(fat ~ col, 
              data = slr_tb) %>%
  lm.beta()
anova(slr_fit)
## Analysis of Variance Table
## 
## Response: fat
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## col        1 184334  184334  49.938 1.925e-09 ***
## Residuals 60 221476    3691                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tidy(slr_fit, conf.int = TRUE)
## # A tibble: 2 x 8
##   term      estimate std_estimate std.error statistic p.value conf.low conf.high
##   <chr>        <dbl>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Interce~  -218.         NA        48.1       -4.54 2.78e-5   NA         NA   
## 2 col           1.96        0.674     0.277      7.07 1.93e-9    0.119      1.23
glance(slr_fit)
## Warning: The `glance()` method for objects of class lm.beta is not maintained
## by the broom team, and is only supported through the lm tidier method. Please be
## cautious in interpreting and reporting broom output.
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic       p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>         <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.454         0.445  60.8      49.9 0.00000000193     1  -342.  689.  696.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

회귀분석 가정

opar <- par(no.readonly = TRUE)
  par(mfrow=c(2,2))
  plot(slr_fit)

par(opar)
shapiro_test(slr_fit$residuals)
## # A tibble: 1 x 3
##   variable          statistic    p.value
##   <chr>                 <dbl>      <dbl>
## 1 slr_fit$residuals     0.848 0.00000192
library(car)
## 필요한 패키지를 로딩중입니다: carData
## 
## 다음의 패키지를 부착합니다: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
car::ncvTest(slr_fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 33.2511, Df = 1, p = 8.0994e-09
car::influencePlot(slr_fit, id.method="identify")
## Warning in plot.window(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in box(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method"는 그래픽 매개
## 변수가 아닙니다

##      StudRes        Hat      CookD
## 15 0.8628319 0.09811898 0.04067052
## 61 4.9889620 0.03172057 0.29159034
## 62 4.6626699 0.17111158 1.66756404
slr_tb <- slr_tb[-c(61,62),]

다중회귀분석

library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)

데이터 가져오기

mr_tb<- read_csv("14.MR.csv",
                  col_names = TRUE,
                  na =".",
                  locale = locale("ko", encoding = "euc-kr")) 
## Rows: 175 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (7): id, design, info, comm, op, fb, flow
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(mr_tb)
## spec_tbl_df [175 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id    : num [1:175] 1 2 3 4 5 6 7 8 9 10 ...
##  $ design: num [1:175] 1.23 0.7 -0.19 1.2 0.14 -0.83 -0.45 -1.1 0.33 -1.67 ...
##  $ info  : num [1:175] -1.46 -0.38 0.84 -1.21 -1.44 -1.45 0.29 -0.92 -1.47 -1.24 ...
##  $ comm  : num [1:175] 1.67 -0.79 -1.12 -0.42 -1.48 -0.36 0.39 -3.02 -0.08 -0.47 ...
##  $ op    : num [1:175] 0.76 -0.31 0.06 1.8 0.2 1.55 0.62 -0.94 0 -1.99 ...
##  $ fb    : num [1:175] 1.22 1.29 -0.32 -0.27 0.68 -0.47 0.57 -0.21 0.08 1.46 ...
##  $ flow  : num [1:175] 2.06 -0.34 0.75 -2.2 -0.18 -0.34 0.75 -3.29 -0.34 -1.11 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   design = col_double(),
##   ..   info = col_double(),
##   ..   comm = col_double(),
##   ..   op = col_double(),
##   ..   fb = col_double(),
##   ..   flow = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
mr_tb
## # A tibble: 175 x 7
##       id design  info  comm    op    fb  flow
##    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1   1.23 -1.46  1.67  0.76  1.22  2.06
##  2     2   0.7  -0.38 -0.79 -0.31  1.29 -0.34
##  3     3  -0.19  0.84 -1.12  0.06 -0.32  0.75
##  4     4   1.2  -1.21 -0.42  1.8  -0.27 -2.2 
##  5     5   0.14 -1.44 -1.48  0.2   0.68 -0.18
##  6     6  -0.83 -1.45 -0.36  1.55 -0.47 -0.34
##  7     7  -0.45  0.29  0.39  0.62  0.57  0.75
##  8     8  -1.1  -0.92 -3.02 -0.94 -0.21 -3.29
##  9     9   0.33 -1.47 -0.08  0     0.08 -0.34
## 10    10  -1.67 -1.24 -0.47 -1.99  1.46 -1.11
## # ... with 165 more rows

기본 통계 확인

skim(mr_tb)
Data summary
Name mr_tb
Number of rows 175
Number of columns 7
_______________________
Column type frequency:
numeric 7
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 88 50.66 1.00 44.50 88.00 131.50 175.00 ▇▇▇▇▇
design 0 1 0 1.00 -3.01 -0.58 0.03 0.66 2.56 ▁▃▇▅▁
info 0 1 0 1.00 -3.21 -0.56 0.09 0.64 2.96 ▁▃▇▅▁
comm 0 1 0 1.00 -3.34 -0.66 -0.06 0.72 2.72 ▁▂▇▆▁
op 0 1 0 1.00 -2.48 -0.72 -0.06 0.70 3.23 ▂▅▇▃▁
fb 0 1 0 1.00 -2.29 -0.68 0.02 0.69 3.28 ▂▇▇▂▁
flow 0 1 0 1.00 -3.29 -0.53 -0.18 0.75 2.61 ▁▂▇▆▁
mr_tb %>%
  get_summary_stats()
## # A tibble: 7 x 13
##   variable     n   min    max median     q1      q3   iqr    mad   mean    sd
##   <chr>    <dbl> <dbl>  <dbl>  <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl> <dbl>
## 1 comm       175 -3.34   2.72  -0.06 -0.665   0.72   1.38  1.02   0       1  
## 2 design     175 -3.01   2.56   0.03 -0.575   0.66   1.24  0.919  0       1  
## 3 fb         175 -2.29   3.28   0.02 -0.675   0.69   1.36  1.01   0       1  
## 4 flow       175 -3.29   2.61  -0.18 -0.535   0.75   1.28  1.05  -0.001   1  
## 5 id         175  1    175     88    44.5   132.    87    65.2   88      50.7
## 6 info       175 -3.21   2.96   0.09 -0.56    0.64   1.2   0.875  0       1  
## 7 op         175 -2.48   3.23  -0.06 -0.725   0.695  1.42  1.07   0       1  
## # ... with 2 more variables: se <dbl>, ci <dbl>
mr_tb <- mr_tb %>%
  select(-(id))

그래프 그리기

pairs(~., data = mr_tb)

다중회귀분석

library(lm.beta)
mr_fit <- lm(flow ~ design+info+comm+op +fb,
             data=mr_tb) %>%
  lm.beta()
summary(mr_fit)
## 
## Call:
## lm(formula = flow ~ design + info + comm + op + fb, data = mr_tb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.51516 -0.40653  0.01118  0.51748  2.15898 
## 
## Coefficients:
##               Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept) -0.0006925           NA  0.0604381  -0.011 0.990871    
## design       0.3539020    0.3538603  0.0605969   5.840 2.61e-08 ***
## info         0.2446373    0.2446131  0.0605957   4.037 8.19e-05 ***
## comm         0.3312351    0.3311719  0.0606013   5.466 1.63e-07 ***
## op           0.2085877    0.2085529  0.0605998   3.442 0.000728 ***
## fb           0.2040364    0.2039402  0.0606183   3.366 0.000944 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7995 on 169 degrees of freedom
## Multiple R-squared:  0.3796, Adjusted R-squared:  0.3612 
## F-statistic: 20.68 on 5 and 169 DF,  p-value: 4.359e-16
tidy(mr_fit)
## # A tibble: 6 x 6
##   term         estimate std_estimate std.error statistic      p.value
##   <chr>           <dbl>        <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) -0.000693       NA        0.0604   -0.0115 0.991       
## 2 design       0.354           0.354    0.0606    5.84   0.0000000261
## 3 info         0.245           0.245    0.0606    4.04   0.0000819   
## 4 comm         0.331           0.331    0.0606    5.47   0.000000163 
## 5 op           0.209           0.209    0.0606    3.44   0.000728    
## 6 fb           0.204           0.204    0.0606    3.37   0.000944
glance(mr_fit) # r.squared : 결정계수 , 1에 가까울수록 모델 설명력이 좋은것
## Warning: The `glance()` method for objects of class lm.beta is not maintained
## by the broom team, and is only supported through the lm tidier method. Please be
## cautious in interpreting and reporting broom output.
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.380         0.361 0.800      20.7 4.36e-16     5  -206.  426.  448.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

회귀식 가정

opar <- par(no.readonly = TRUE)
  par(mfrow=c(2,2))
  plot(mr_fit)

par(opar)
shapiro_test(mr_fit$residuals)
## # A tibble: 1 x 3
##   variable         statistic p.value
##   <chr>                <dbl>   <dbl>
## 1 mr_fit$residuals     0.986  0.0836
#등분산성을 검정하는 방법이다. p-value > 0.05면 등분산성을 만족
library(car)
ncvTest(mr_fit) 
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.328302, Df = 1, p = 0.56666
influencePlot(mr_fit, id.method="identify")
## Warning in plot.window(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in box(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method"는 그래픽 매개
## 변수가 아닙니다

##        StudRes        Hat        CookD
## 4  -3.29966948 0.04242307 7.594934e-02
## 66  2.42797272 0.08320834 8.666266e-02
## 69 -3.30344105 0.03995443 7.149907e-02
## 74 -0.45755557 0.15038636 6.205260e-03
## 86 -0.05172283 0.12938180 6.665446e-05
## 93 -2.19253241 0.08868148 7.624799e-02

backward

mr_fit_bk <- lm(flow ~ ., data=mr_tb)
mr_fit_bk <- stats::step(mr_fit_bk,
                         direction = "backward",
                         trace = TRUE)
## Start:  AIC=-72.42
## flow ~ design + info + comm + op + fb
## 
##          Df Sum of Sq    RSS     AIC
## <none>                108.03 -72.415
## - fb      1    7.2422 115.27 -63.060
## - op      1    7.5735 115.60 -62.558
## - info    1   10.4189 118.45 -58.303
## - comm    1   19.0972 127.13 -45.929
## - design  1   21.8035 129.83 -42.243
mr_fit_fw <- lm(flow ~ 1, data=mr_tb)
mr_fit_fw <- stats::step(mr_fit_fw,
                         direction = 'forward',
                         scope = (flow ~ design+info+comm+op +fb),
                         trace = TRUE)
## Start:  AIC=1.12
## flow ~ 1
## 
##          Df Sum of Sq    RSS      AIC
## + design  1   21.7762 152.35 -20.2570
## + comm    1   19.0888 155.04 -17.1969
## + info    1   10.3936 163.73  -7.6474
## + op      1    7.5677 166.56  -4.6528
## + fb      1    7.2282 166.90  -4.2965
## <none>                174.12   1.1231
## 
## Step:  AIC=-20.26
## flow ~ design
## 
##        Df Sum of Sq    RSS     AIC
## + comm  1   19.0950 133.25 -41.692
## + info  1   10.4063 141.94 -30.638
## + op    1    7.5707 144.78 -27.177
## + fb    1    7.2335 145.12 -26.770
## <none>              152.35 -20.257
## 
## Step:  AIC=-41.69
## flow ~ design + comm
## 
##        Df Sum of Sq    RSS     AIC
## + info  1   10.4046 122.85 -53.919
## + op    1    7.5681 125.69 -49.925
## + fb    1    7.2400 126.01 -49.469
## <none>              133.25 -41.692
## 
## Step:  AIC=-53.92
## flow ~ design + comm + info
## 
##        Df Sum of Sq    RSS     AIC
## + op    1    7.5769 115.27 -63.060
## + fb    1    7.2456 115.60 -62.558
## <none>              122.85 -53.919
## 
## Step:  AIC=-63.06
## flow ~ design + comm + info + op
## 
##        Df Sum of Sq    RSS     AIC
## + fb    1    7.2422 108.03 -72.415
## <none>              115.27 -63.060
## 
## Step:  AIC=-72.42
## flow ~ design + comm + info + op + fb

Leave a comment