青年駕駛人重大事故回歸分析

場景

  • 根據統計調查,年齡 21 歲以下駕駛人發生重大交通事故比例,如:附件資料檔內容。

問題

  • 製作統計圖表
  • 進行回歸分析,解釋年齡 21 歲以下駕駛人與發生重大交通事故,是否有關連。

GNU R

library(ggplot2)

options(digits=4)
rm(list=ls())

myData <- read.table("Simple-Linear-Regression-Case2.dat", 
                     header=TRUE)

myData.Model = lm(CasultyRate ~ YouthRate, data=myData)
myData.Info <- summary(myData.Model)

myData.Formula <- as.expression(
  substitute(bolditalic(y)==b0+b1*bolditalic(x),
             list(b0=myData.Info$coefficients[1],
                  b1=myData.Info$coefficients[2])))

myData.RR <- substitute(bolditalic(R)^2==rr,
                        list(rr=round(myData.Info$adj.r.squared, digits=4)))

p <- ggplot(myData, aes(x=YouthRate, y=CasultyRate)) +
                          geom_smooth(method='lm') +
                          geom_point() +
                          labs(title=myData.Formula) +
                          xlab("Youth Rate") +
                          ylab("Casulty Rate") + 
  annotate("text", parse=TRUE,
           label=as.character(as.expression(myData.RR)), 
           x=16, y=0.5, size=8, colour='blue')

p

# 讀取資料檔案至表格中
TrafficCasulty <- read.table("Simple-Linear-Regression-Case2.dat", header=TRUE)

# 敘述統計資料
summary(TrafficCasulty)

# 結果
#    YouthRate      CasultyRate   
#  Min.   : 8.00   Min.   :0.039  
#  1st Qu.: 9.25   1st Qu.:1.018  
#  Median :12.00   Median :1.881  
#  Mean   :12.26   Mean   :1.922  
#  3rd Qu.:14.75   3rd Qu.:2.811  
#  Max.   :18.00   Max.   :4.100  

# 繪製 Box Plot 圖
par(mfrow=c(2,2))
boxplot(main="YouthRate Mean", TrafficCasulty$YouthRate, ylab="YouthRate")
boxplot(main="CasultyRate Mean", TrafficCasulty$CasultyRate, ylab="CasultyRate")

YouthRate.Size = length(TrafficCasulty$YouthRate)
YouthRate.Mean = mean(TrafficCasulty$YouthRate)
YouthRate.StdDeviation = sd(TrafficCasulty$YouthRate)

CasultyRate.Size = length(TrafficCasulty$CasultyRate)
CasultyRate.Mean = mean(TrafficCasulty$CasultyRate)
CasultyRate.StdDeviation = sd(TrafficCasulty$CasultyRate)

print(sprintf("YouthRate Size=%d Mean=%.4f StdDeviation=%.4f", YouthRate.Size, YouthRate.Mean, YouthRate.StdDeviation))
print(sprintf("CasultyRate Size=%d Mean=%.4f StdDeviation=%.4f", CasultyRate.Size, CasultyRate.Mean, CasultyRate.StdDeviation))

# 結果
# YouthRate Mean=12.2619 StdDeviation=3.1317
###############################################################################
# y = β0 + β1x + ε

YouthRate.Var = TrafficCasulty$YouthRate - YouthRate.Mean
CasultyRate.Var = TrafficCasulty$CasultyRate - CasultyRate.Mean
Sum.Var = sum(YouthRate.Var * CasultyRate.Var)
Sum.VarSqr = sum(YouthRate.Var ^ 2)

b1 = Sum.Var / Sum.VarSqr
b0 = CasultyRate.Mean - b1 * YouthRate.Mean

print(sprintf("ŷ = %.4f + %.4fx", b0, b1))

# 結果
# ŷ = -1.5974 + 0.2871x
###############################################################################
plot(TrafficCasulty$YouthRate, TrafficCasulty$CasultyRate, main="YouthRate - CasultyRate",
      xlab="YouthRate", ylab="CasultyRate")

abline(a=b0, b=b1, col="blue")
###############################################################################
# Total SUm of Squares
SST = 0
for (i in c(1:CasultyRate.Size)) {
  Diff.Sqr = (TrafficCasulty$CasultyRate[i] - CasultyRate.Mean) ^ 2
  SST = SST + Diff.Sqr
}

# Sum of Squares due to Regression
SSR = 0
for (i in c(1:YouthRate.Size)) {
  Y.hat = TrafficCasulty$YouthRate[i] * b1 + b0
  Diff.Sqr = (Y.hat - CasultyRate.Mean) ^ 2
  SSR = SSR + Diff.Sqr
}

# Sum of Squares due to Error
SSE = SST - SSR

# Coefficient of Determination
R.Square = SSR / SST

# Sample Correlation Coefficient
if (b1 > 0) {
  r.xy = sqrt(R.Square)
} else {
  r.xy = (-1) * sqrt(R.Square)
}

print(sprintf("SST(%.4f) = SSR(%.4f) + SSE(%.4fx) -> R2(%.2f%%) Correlation=(%.4f) -> Positive Linear Associated",
       SST, SSR, SSE, R.Square * 100, r.xy))

# 結果
# SST(47.0278) = SSR(33.1344) + SSE(13.8934x) -> R^2(70.46%) Correlation=(0.8394) -> Positive Linear Associated
###############################################################################
plot(YouthRate.Var, CasultyRate.Var, main="Covariance YouthRate-CasultyRate", 
      xlab="Covariance YouthRate%", ylab="Covariance CasultyRate%")

Sum.Var = sum(YouthRate.Var * CasultyRate.Var)
Covariance.TrafficCasulty = Sum.Var / (YouthRate.Size - 1)

print(sprintf("Covariance.YouthRate-CasultyRate=%.4f -> Positive Linear Associated", Covariance.TrafficCasulty))

# 結果
# Covariance.YouthRate-CasultyRate=2.8154 -> Positive Linear Associated

解答

   YouthRate      CasultyRate   
 Min.   : 8.00   Min.   :0.039  
 1st Qu.: 9.25   1st Qu.:1.018  
 Median :12.00   Median :1.881  
 Mean   :12.26   Mean   :1.922  
 3rd Qu.:14.75   3rd Qu.:2.811  
 Max.   :18.00   Max.   :4.100  
 
 YouthRate Size=42 Mean=12.2619 StdDeviation=3.1317
 CasultyRate Size=42 Mean=1.9224 StdDeviation=1.0710

 ŷ = -1.5974 + 0.2871x

 SST(47.0278) = SSR(33.1344) + SSE(13.8934x) -> 
 R2(70.46%) Correlation=(0.8394) -> Positive Linear Associated

 Covariance.YouthRate-CasultyRate=2.8154 -> Positive Linear Associated

ċ
Simple-Linear-Regression-Case2-New.R
(1k)
李智,
2013年12月7日 上午4:35
ċ
Simple-Linear-Regression-Case2.R
(3k)
李智,
2011年6月29日 上午3:40
ċ
Simple-Linear-Regression-Case2.dat
(0k)
李智,
2011年6月29日 上午3:40