상관분석
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.8.0 v rsample 0.1.1
## v dials 0.1.1 v tune 0.2.0
## v infer 1.0.0 v workflows 0.2.6
## v modeldata 0.1.1 v workflowsets 0.2.1
## v parsnip 0.2.1 v yardstick 0.0.9
## v recipes 0.2.0
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
library(rstatix)
##
## 다음의 패키지를 부착합니다: 'rstatix'
## The following objects are masked from 'package:infer':
##
## chisq_test, prop_test, t_test
## The following object is masked from 'package:dials':
##
## get_n
## The following object is masked from 'package:stats':
##
## filter
library(skimr)
데이터 가져오기
corr_tb <- read_csv("12.CORR.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr"))
## Rows: 30 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): 몸무게, 키
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(corr_tb)
## spec_tbl_df [30 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ 몸무게: num [1:30] 72 72 70 43 48 54 51 52 73 45 ...
## $ 키 : num [1:30] 176 172 182 160 163 165 168 163 182 148 ...
## - attr(*, "spec")=
## .. cols(
## .. 몸무게 = col_double(),
## .. 키 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
corr_tb
## # A tibble: 30 x 2
## 몸무게 키
## <dbl> <dbl>
## 1 72 176
## 2 72 172
## 3 70 182
## 4 43 160
## 5 48 163
## 6 54 165
## 7 51 168
## 8 52 163
## 9 73 182
## 10 45 148
## # ... with 20 more rows
기술통계분석
skim(corr_tb)
Data summary
Name |
corr_tb |
Number of rows |
30 |
Number of columns |
2 |
_______________________ |
|
Column type frequency: |
|
numeric |
2 |
________________________ |
|
Group variables |
None |
Variable type: numeric
몸무게 |
0 |
1 |
62.70 |
11.52 |
43 |
54.00 |
63 |
72.00 |
88 |
▇▆▇▇▂ |
키 |
0 |
1 |
170.33 |
8.68 |
148 |
164.25 |
170 |
175.75 |
188 |
▁▅▇▅▃ |
corr_tb %>%
get_summary_stats(몸무게, 키)
## # A tibble: 2 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 몸무게 30 43 88 63 54 72 18 13.3 62.7 11.5 2.10
## 2 키 30 148 188 170 164. 176. 11.5 8.90 170. 8.68 1.58
## # ... with 1 more variable: ci <dbl>
그래프 그리기
geom_jitter : 점의 위치 표현
corr_tb %>%
ggplot(mapping = aes(x=몸무게,
y=키))+
geom_jitter() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
상관분석
연속형 변수(키,몸무게) 상관관계 분석 : pearson
p값이 0.05미만이므로 상관관계가 있다
corr_tb %>%
cor_test(몸무게,키,
method="pearson")
## # A tibble: 1 x 8
## var1 var2 cor statistic p conf.low conf.high method
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 몸무게 키 0.86 8.79 0.00000000154 0.718 0.930 Pearson
단순 선형회귀분석
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
데이터 가져오기
slr_tb<- read_csv("13.SLR.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr"))
## Rows: 62 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): col, fat
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(slr_tb)
## spec_tbl_df [62 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ col: num [1:62] 146 190 179 164 185 ...
## $ fat: num [1:62] 162 225 194 161 181 ...
## - attr(*, "spec")=
## .. cols(
## .. col = col_double(),
## .. fat = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
slr_tb
## # A tibble: 62 x 2
## col fat
## <dbl> <dbl>
## 1 146 162.
## 2 190. 225.
## 3 179. 194.
## 4 164. 161.
## 5 185. 181.
## 6 154. 137.
## 7 179 165.
## 8 225. 228.
## 9 145 113.
## 10 154. 123
## # ... with 52 more rows
기술통계분석
skim(slr_tb)
Data summary
Name |
slr_tb |
Number of rows |
62 |
Number of columns |
2 |
_______________________ |
|
Column type frequency: |
|
numeric |
2 |
________________________ |
|
Group variables |
None |
Variable type: numeric
col |
0 |
1 |
171.14 |
28.05 |
108.4 |
150.98 |
169.70 |
187.25 |
257.4 |
▂▇▇▃▁ |
fat |
0 |
1 |
117.15 |
81.56 |
40.9 |
65.43 |
102.25 |
141.23 |
508.5 |
▇▃▁▁▁ |
slr_tb %>%
get_summary_stats(col, fat)
## # A tibble: 2 x 13
## variable n min max median q1 q3 iqr mad mean sd se
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 col 62 108. 257. 170. 151. 187. 36.3 26.8 171. 28.1 3.56
## 2 fat 62 40.9 508. 102. 65.4 141. 75.8 56.5 117. 81.6 10.4
## # ... with 1 more variable: ci <dbl>
그래프 그리기
slr_tb %>%
ggplot(mapping = aes(x=col,
y=fat))+
geom_jitter() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
단순회귀분석
다중회귀분석은 독립변수가 2개이상임 => lm(fat~ A+B, data= )
#install.packages("lm.beta")
library(lm.beta)
slr_fit <- lm(fat ~ col,
data = slr_tb) %>%
lm.beta()
anova(slr_fit)
## Analysis of Variance Table
##
## Response: fat
## Df Sum Sq Mean Sq F value Pr(>F)
## col 1 184334 184334 49.938 1.925e-09 ***
## Residuals 60 221476 3691
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tidy(slr_fit, conf.int = TRUE)
## # A tibble: 2 x 8
## term estimate std_estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Interce~ -218. NA 48.1 -4.54 2.78e-5 NA NA
## 2 col 1.96 0.674 0.277 7.07 1.93e-9 0.119 1.23
glance(slr_fit)
## Warning: The `glance()` method for objects of class lm.beta is not maintained
## by the broom team, and is only supported through the lm tidier method. Please be
## cautious in interpreting and reporting broom output.
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.454 0.445 60.8 49.9 0.00000000193 1 -342. 689. 696.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
회귀분석 가정
opar <- par(no.readonly = TRUE)
par(mfrow=c(2,2))
plot(slr_fit)
par(opar)
shapiro_test(slr_fit$residuals)
## # A tibble: 1 x 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 slr_fit$residuals 0.848 0.00000192
library(car)
## 필요한 패키지를 로딩중입니다: carData
##
## 다음의 패키지를 부착합니다: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
car::ncvTest(slr_fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 33.2511, Df = 1, p = 8.0994e-09
car::influencePlot(slr_fit, id.method="identify")
## Warning in plot.window(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in box(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method"는 그래픽 매개
## 변수가 아닙니다
## StudRes Hat CookD
## 15 0.8628319 0.09811898 0.04067052
## 61 4.9889620 0.03172057 0.29159034
## 62 4.6626699 0.17111158 1.66756404
slr_tb <- slr_tb[-c(61,62),]
다중회귀분석
library(tidyverse)
library(tidymodels)
library(rstatix)
library(skimr)
데이터 가져오기
mr_tb<- read_csv("14.MR.csv",
col_names = TRUE,
na =".",
locale = locale("ko", encoding = "euc-kr"))
## Rows: 175 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (7): id, design, info, comm, op, fb, flow
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(mr_tb)
## spec_tbl_df [175 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:175] 1 2 3 4 5 6 7 8 9 10 ...
## $ design: num [1:175] 1.23 0.7 -0.19 1.2 0.14 -0.83 -0.45 -1.1 0.33 -1.67 ...
## $ info : num [1:175] -1.46 -0.38 0.84 -1.21 -1.44 -1.45 0.29 -0.92 -1.47 -1.24 ...
## $ comm : num [1:175] 1.67 -0.79 -1.12 -0.42 -1.48 -0.36 0.39 -3.02 -0.08 -0.47 ...
## $ op : num [1:175] 0.76 -0.31 0.06 1.8 0.2 1.55 0.62 -0.94 0 -1.99 ...
## $ fb : num [1:175] 1.22 1.29 -0.32 -0.27 0.68 -0.47 0.57 -0.21 0.08 1.46 ...
## $ flow : num [1:175] 2.06 -0.34 0.75 -2.2 -0.18 -0.34 0.75 -3.29 -0.34 -1.11 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. design = col_double(),
## .. info = col_double(),
## .. comm = col_double(),
## .. op = col_double(),
## .. fb = col_double(),
## .. flow = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
mr_tb
## # A tibble: 175 x 7
## id design info comm op fb flow
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.23 -1.46 1.67 0.76 1.22 2.06
## 2 2 0.7 -0.38 -0.79 -0.31 1.29 -0.34
## 3 3 -0.19 0.84 -1.12 0.06 -0.32 0.75
## 4 4 1.2 -1.21 -0.42 1.8 -0.27 -2.2
## 5 5 0.14 -1.44 -1.48 0.2 0.68 -0.18
## 6 6 -0.83 -1.45 -0.36 1.55 -0.47 -0.34
## 7 7 -0.45 0.29 0.39 0.62 0.57 0.75
## 8 8 -1.1 -0.92 -3.02 -0.94 -0.21 -3.29
## 9 9 0.33 -1.47 -0.08 0 0.08 -0.34
## 10 10 -1.67 -1.24 -0.47 -1.99 1.46 -1.11
## # ... with 165 more rows
기본 통계 확인
skim(mr_tb)
Data summary
Name |
mr_tb |
Number of rows |
175 |
Number of columns |
7 |
_______________________ |
|
Column type frequency: |
|
numeric |
7 |
________________________ |
|
Group variables |
None |
Variable type: numeric
id |
0 |
1 |
88 |
50.66 |
1.00 |
44.50 |
88.00 |
131.50 |
175.00 |
▇▇▇▇▇ |
design |
0 |
1 |
0 |
1.00 |
-3.01 |
-0.58 |
0.03 |
0.66 |
2.56 |
▁▃▇▅▁ |
info |
0 |
1 |
0 |
1.00 |
-3.21 |
-0.56 |
0.09 |
0.64 |
2.96 |
▁▃▇▅▁ |
comm |
0 |
1 |
0 |
1.00 |
-3.34 |
-0.66 |
-0.06 |
0.72 |
2.72 |
▁▂▇▆▁ |
op |
0 |
1 |
0 |
1.00 |
-2.48 |
-0.72 |
-0.06 |
0.70 |
3.23 |
▂▅▇▃▁ |
fb |
0 |
1 |
0 |
1.00 |
-2.29 |
-0.68 |
0.02 |
0.69 |
3.28 |
▂▇▇▂▁ |
flow |
0 |
1 |
0 |
1.00 |
-3.29 |
-0.53 |
-0.18 |
0.75 |
2.61 |
▁▂▇▆▁ |
mr_tb %>%
get_summary_stats()
## # A tibble: 7 x 13
## variable n min max median q1 q3 iqr mad mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 comm 175 -3.34 2.72 -0.06 -0.665 0.72 1.38 1.02 0 1
## 2 design 175 -3.01 2.56 0.03 -0.575 0.66 1.24 0.919 0 1
## 3 fb 175 -2.29 3.28 0.02 -0.675 0.69 1.36 1.01 0 1
## 4 flow 175 -3.29 2.61 -0.18 -0.535 0.75 1.28 1.05 -0.001 1
## 5 id 175 1 175 88 44.5 132. 87 65.2 88 50.7
## 6 info 175 -3.21 2.96 0.09 -0.56 0.64 1.2 0.875 0 1
## 7 op 175 -2.48 3.23 -0.06 -0.725 0.695 1.42 1.07 0 1
## # ... with 2 more variables: se <dbl>, ci <dbl>
mr_tb <- mr_tb %>%
select(-(id))
그래프 그리기
pairs(~., data = mr_tb)
다중회귀분석
library(lm.beta)
mr_fit <- lm(flow ~ design+info+comm+op +fb,
data=mr_tb) %>%
lm.beta()
summary(mr_fit)
##
## Call:
## lm(formula = flow ~ design + info + comm + op + fb, data = mr_tb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.51516 -0.40653 0.01118 0.51748 2.15898
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) -0.0006925 NA 0.0604381 -0.011 0.990871
## design 0.3539020 0.3538603 0.0605969 5.840 2.61e-08 ***
## info 0.2446373 0.2446131 0.0605957 4.037 8.19e-05 ***
## comm 0.3312351 0.3311719 0.0606013 5.466 1.63e-07 ***
## op 0.2085877 0.2085529 0.0605998 3.442 0.000728 ***
## fb 0.2040364 0.2039402 0.0606183 3.366 0.000944 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7995 on 169 degrees of freedom
## Multiple R-squared: 0.3796, Adjusted R-squared: 0.3612
## F-statistic: 20.68 on 5 and 169 DF, p-value: 4.359e-16
tidy(mr_fit)
## # A tibble: 6 x 6
## term estimate std_estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.000693 NA 0.0604 -0.0115 0.991
## 2 design 0.354 0.354 0.0606 5.84 0.0000000261
## 3 info 0.245 0.245 0.0606 4.04 0.0000819
## 4 comm 0.331 0.331 0.0606 5.47 0.000000163
## 5 op 0.209 0.209 0.0606 3.44 0.000728
## 6 fb 0.204 0.204 0.0606 3.37 0.000944
glance(mr_fit) # r.squared : 결정계수 , 1에 가까울수록 모델 설명력이 좋은것
## Warning: The `glance()` method for objects of class lm.beta is not maintained
## by the broom team, and is only supported through the lm tidier method. Please be
## cautious in interpreting and reporting broom output.
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.380 0.361 0.800 20.7 4.36e-16 5 -206. 426. 448.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
회귀식 가정
opar <- par(no.readonly = TRUE)
par(mfrow=c(2,2))
plot(mr_fit)
par(opar)
shapiro_test(mr_fit$residuals)
## # A tibble: 1 x 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 mr_fit$residuals 0.986 0.0836
#등분산성을 검정하는 방법이다. p-value > 0.05면 등분산성을 만족
library(car)
ncvTest(mr_fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.328302, Df = 1, p = 0.56666
influencePlot(mr_fit, id.method="identify")
## Warning in plot.window(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method"는 그래
## 픽 매개변수가 아닙니다
## Warning in box(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "id.method"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method"는 그래픽 매개
## 변수가 아닙니다
## StudRes Hat CookD
## 4 -3.29966948 0.04242307 7.594934e-02
## 66 2.42797272 0.08320834 8.666266e-02
## 69 -3.30344105 0.03995443 7.149907e-02
## 74 -0.45755557 0.15038636 6.205260e-03
## 86 -0.05172283 0.12938180 6.665446e-05
## 93 -2.19253241 0.08868148 7.624799e-02
backward
mr_fit_bk <- lm(flow ~ ., data=mr_tb)
mr_fit_bk <- stats::step(mr_fit_bk,
direction = "backward",
trace = TRUE)
## Start: AIC=-72.42
## flow ~ design + info + comm + op + fb
##
## Df Sum of Sq RSS AIC
## <none> 108.03 -72.415
## - fb 1 7.2422 115.27 -63.060
## - op 1 7.5735 115.60 -62.558
## - info 1 10.4189 118.45 -58.303
## - comm 1 19.0972 127.13 -45.929
## - design 1 21.8035 129.83 -42.243
mr_fit_fw <- lm(flow ~ 1, data=mr_tb)
mr_fit_fw <- stats::step(mr_fit_fw,
direction = 'forward',
scope = (flow ~ design+info+comm+op +fb),
trace = TRUE)
## Start: AIC=1.12
## flow ~ 1
##
## Df Sum of Sq RSS AIC
## + design 1 21.7762 152.35 -20.2570
## + comm 1 19.0888 155.04 -17.1969
## + info 1 10.3936 163.73 -7.6474
## + op 1 7.5677 166.56 -4.6528
## + fb 1 7.2282 166.90 -4.2965
## <none> 174.12 1.1231
##
## Step: AIC=-20.26
## flow ~ design
##
## Df Sum of Sq RSS AIC
## + comm 1 19.0950 133.25 -41.692
## + info 1 10.4063 141.94 -30.638
## + op 1 7.5707 144.78 -27.177
## + fb 1 7.2335 145.12 -26.770
## <none> 152.35 -20.257
##
## Step: AIC=-41.69
## flow ~ design + comm
##
## Df Sum of Sq RSS AIC
## + info 1 10.4046 122.85 -53.919
## + op 1 7.5681 125.69 -49.925
## + fb 1 7.2400 126.01 -49.469
## <none> 133.25 -41.692
##
## Step: AIC=-53.92
## flow ~ design + comm + info
##
## Df Sum of Sq RSS AIC
## + op 1 7.5769 115.27 -63.060
## + fb 1 7.2456 115.60 -62.558
## <none> 122.85 -53.919
##
## Step: AIC=-63.06
## flow ~ design + comm + info + op
##
## Df Sum of Sq RSS AIC
## + fb 1 7.2422 108.03 -72.415
## <none> 115.27 -63.060
##
## Step: AIC=-72.42
## flow ~ design + comm + info + op + fb
Leave a comment