library(data.table) #用於資料讀取與查詢, 載入套件 data.table 才可使用fread()函數
# Path of the sushi sales CSV file.
file_name <- "c:/Users/user/Downloads/sushi/sushi.csv"
# Read with data.table::fread, then convert to a plain data.frame so that
# x[, k] yields an ordinary vector in the code below.
x <- fread(file_name, header = "auto")
x <- data.frame(x)
日期 | 星期 | 盤數 | 營收 | 營收調整 | 氣象 | 促銷活動 | 假日 | 氣溫 | 最高氣溫 | 最低氣溫 | 相對濕度… |
---|---|---|---|---|---|---|---|---|---|---|---|
20140301 | 6 | 1511 | 45340 | 45340 | 陰 | 0 | 1 | 17.4 | 23.5 | 14.1 | 88 |
20140302 | 7 | 1500 | 45000 | 45000 | 雨 | 0 | 1 | 13.1 | 17.5 | 12.2 | 97 |
20140303 | 1 | 522 | 15660 | 15660 | 雨 | 0 | 0 | 14.7 | 16.9 | 11.8 | 85 |
20140304 | 2 | 545 | 16340 | 16340 | 雨 | 0 | 0 | 14.3 | 16.6 | 13.2 | 90 |
20140305 | 3 | 493 | 14780 | 14780 | 雨 | 0 | 0 | 14.6 | 16.2 | 13.3 | 76 |
20140306 | 4 | 522 | 15660 | 15660 | 雨 | 0 | 0 | 13.5 | 14.7 | 12.1 | 89 |
20140307 | 5 | 939 | 28160 | 28160 | 雨 | 0 | 0 | 14.6 | 16.1 | 12.1 | 90 |
20140308 | 6 | 1094 | 32830 | 32830 | 雨 | 0 | 1 | 13.4 | 15.5 | 11.7 | 10 |
20140309 | 7 | 1936 | 58090 | 58090 | 雨 | 0 | 1 | 12.8 | 14.3 | 11.1 | 81 |
20140310 | 1 | 833 | 25000 | 25000 | 陰 | 0 | 0 | 15.2 | 16.4 | 12.9 | 68 |
列號 | 日期 | 星期 | 盤數 | 營收 | 營收調整 | 氣象 | 促銷活動 | 假日 | 氣溫 | 最高氣溫 | 最低氣溫 | 相對濕度… |
---|---|---|---|---|---|---|---|---|---|---|---|---|
39 | 20140408 | 2 | 1880 | 56405 | 46805 | 雨 | 1 | 0 | 21.0 | 24.2 | 18.5 | 87 |
40 | 20140409 | 3 | 1571 | 47125 | 38525 | 陰 | 1 | 0 | 20.9 | 23.0 | 19.5 | 85 |
41 | 20140410 | 4 | 2526 | 75775 | 48275 | 晴 | 1 | 0 | 21.8 | 25.1 | 18.7 | 76 |
42 | 20140411 | 5 | 3229 | 96880 | 66880 | 晴 | 1 | 0 | 21.9 | 25.7 | 17.1 | 80 |
43 | 20140412 | 6 | 4878 | 146336 | 121836 | 晴 | 1 | 1 | 23.4 | 27.5 | 20.3 | 79 |
44 | 20140413 | 7 | 5060 | 151785 | 125285 | 晴 | 1 | 1 | 23.8 | 27.1 | 19.8 | 83 |
45 | 20140414 | 1 | 2359 | 70755 | 44755 | 晴 | 1 | 0 | 19.6 | 21.8 | 18.3 | 71 |
46 | 20140415 | 2 | 2491 | 74729 | 49729 | 晴 | 1 | 0 | 21.7 | 24.9 | 18.4 | 67 |
47 | 20140416 | 3 | 2662 | 79850 | 49850 | 晴 | 1 | 0 | 22.3 | 24.6 | 20.4 | 80 |
48 | 20140417 | 4 | 2555 | 76640 | 49140 | 晴 | 1 | 0 | 24.1 | 27.3 | 20.6 | 79 |
# Column names of the data set.
colnx <- colnames(x)
# Names of the quantitative variables, selected by column position.
conti_var <- colnx[c(3, 4, 5, 9, 10, 11, 12)]
conti_var
## [1] "盤數" "營收" "營收調整" "氣溫" "最高氣溫"
## [6] "最低氣溫" "相對濕度..."
\[\bar{X}=\frac{\sum_{i=1}^n X_i}{n}\]
# Mean of each quantitative variable (NAs ignored), rounded to 2 decimals.
# vapply replaces the grow-in-a-loop accumulator; the result is an unnamed
# numeric vector in the same order as conti_var.
average <- vapply(
  conti_var,
  function(nm) round(mean(x[[nm]], na.rm = TRUE), 2),
  numeric(1),
  USE.NAMES = FALSE
)
knitr::kable(cbind(conti_var, average), caption = '', row.names = NA)
conti_var | average |
---|---|
盤數 | 1815.17 |
營收 | 54455.29 |
營收調整 | 46180.29 |
氣溫 | 18.58 |
最高氣溫 | 21.68 |
最低氣溫 | 15.74 |
相對濕度… | 77.36 |
#氣溫
# Temperature (column 9) arranged as a matrix with 8 columns, filled by column.
xx <- matrix(x[, 9], ncol = 8)
xx
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 17.4 14.6 15.3 22.2 20.1 20.8 20.4 23.4
## [2,] 13.1 13.4 14.0 16.2 22.5 17.9 18.8 23.8
## [3,] 14.7 12.8 16.7 13.5 22.2 18.7 21.0 19.6
## [4,] 14.3 15.2 16.3 16.2 22.4 18.8 20.9 21.7
## [5,] 14.6 16.5 19.1 18.4 24.6 18.1 21.8 22.3
## [6,] 13.5 15.7 20.9 19.0 22.6 19.7 21.9 24.1
# Row sums of the temperature matrix, appended as an extra column named "tt".
tt <- apply(xx, 1, sum)
xx <- cbind(xx, tt)
xx
## tt
## [1,] 17.4 14.6 15.3 22.2 20.1 20.8 20.4 23.4 154.2
## [2,] 13.1 13.4 14.0 16.2 22.5 17.9 18.8 23.8 139.7
## [3,] 14.7 12.8 16.7 13.5 22.2 18.7 21.0 19.6 139.2
## [4,] 14.3 15.2 16.3 16.2 22.4 18.8 20.9 21.7 145.8
## [5,] 14.6 16.5 19.1 18.4 24.6 18.1 21.8 22.3 155.4
## [6,] 13.5 15.7 20.9 19.0 22.6 19.7 21.9 24.1 157.4
sum(tt) # grand total of the row sums
## [1] 891.7
sum(tt)/48 # mean: total divided by the 48 observations
## [1] 18.57708
# Median of each quantitative variable (NAs ignored), rounded to 2 decimals.
# The result is stored in a variable called `median`; R still resolves the
# call median(...) to the function because call lookup skips non-functions.
median <- vapply(
  conti_var,
  function(nm) round(median(x[[nm]], na.rm = TRUE), 2),
  numeric(1),
  USE.NAMES = FALSE
)
knitr::kable(cbind(conti_var, median), caption = '', row.names = NA)
conti_var | median |
---|---|
盤數 | 1646.5 |
營收 | 49390 |
營收調整 | 45170 |
氣溫 | 18.8 |
最高氣溫 | 22.8 |
最低氣溫 | 14.7 |
相對濕度… | 80 |
# Temperature (column 9), sorted in ascending order.
xx <- sort(x[, 9])
xx
## [1] 12.8 13.1 13.4 13.5 13.5 14.0 14.3 14.6 14.6 14.7 15.2 15.3 15.7 16.2 16.2
## [16] 16.3 16.5 16.7 17.4 17.9 18.1 18.4 18.7 18.8 18.8 19.0 19.1 19.6 19.7 20.1
## [31] 20.4 20.8 20.9 20.9 21.0 21.7 21.8 21.9 22.2 22.2 22.3 22.4 22.5 22.6 23.4
## [46] 23.8 24.1 24.6
變異數 \[s^2=\frac{\sum_{i=1}^n (X_i-\bar{X})^2}{n-1}\]
標準差 \[s=\sqrt{\frac{\sum_{i=1}^n (X_i-\bar{X})^2}{n-1}}\]
# Sample standard deviation (n - 1 denominator) of each quantitative variable,
# NAs ignored, rounded to 2 decimals.
ss <- vapply(
  conti_var,
  function(nm) round(sd(x[[nm]], na.rm = TRUE), 2),
  numeric(1),
  USE.NAMES = FALSE
)
knitr::kable(cbind(conti_var, ss), caption = '', row.names = NA)
conti_var | ss |
---|---|
盤數 | 1035.57 |
營收 | 31065.42 |
營收調整 | 23916.66 |
氣溫 | 3.42 |
最高氣溫 | 3.97 |
最低氣溫 | 3.06 |
相對濕度… | 13.62 |
\[離差=X_i-\bar{X}\] 用來測度觀察值與平均值的偏離量, 絕對值越大表示差越多!!
# Mean temperature is 18.57708 (the original comment mistyped it as 18.87708).
# Deviation of each temperature observation from the mean.
xx <- x[, 9] - mean(x[, 9])
xx
## [1] -1.1770833 -5.4770833 -3.8770833 -4.2770833 -3.9770833 -5.0770833
## [7] -3.9770833 -5.1770833 -5.7770833 -3.3770833 -2.0770833 -2.8770833
## [13] -3.2770833 -4.5770833 -1.8770833 -2.2770833 0.5229167 2.3229167
## [19] 3.6229167 -2.3770833 -5.0770833 -2.3770833 -0.1770833 0.4229167
## [25] 1.5229167 3.9229167 3.6229167 3.8229167 6.0229167 4.0229167
## [31] 2.2229167 -0.6770833 0.1229167 0.2229167 -0.4770833 1.1229167
## [37] 1.8229167 0.2229167 2.4229167 2.3229167 3.2229167 3.3229167
## [43] 4.8229167 5.2229167 1.0229167 3.1229167 3.7229167 5.5229167
\[離差平方=離差^2=(X_i-\bar{X})^2\]
xx^2 # squared deviations
## [1] 1.38552517 29.99844184 15.03177517 18.29344184 15.81719184 25.77677517
## [7] 15.81719184 26.80219184 33.37469184 11.40469184 4.31427517 8.27760851
## [13] 10.73927517 20.94969184 3.52344184 5.18510851 0.27344184 5.39594184
## [19] 13.12552517 5.65052517 25.77677517 5.65052517 0.03135851 0.17885851
## [25] 2.31927517 15.38927517 13.12552517 14.61469184 36.27552517 16.18385851
## [31] 4.94135851 0.45844184 0.01510851 0.04969184 0.22760851 1.26094184
## [37] 3.32302517 0.04969184 5.87052517 5.39594184 10.38719184 11.04177517
## [43] 23.26052517 27.27885851 1.04635851 9.75260851 13.86010851 30.50260851
\[離差平方總和=\sum 離差^2=\sum_{i=1}^n(X_i-\bar{X})^2\]
sum(xx^2) # sum of squared deviations
## [1] 549.4048
\[s^2=變異數=\frac{離差平方總和}{n-1}=\frac{\sum_{i=1}^n(X_i-\bar{X})^2}{n-1}\]
# Sample variance: sum of squared deviations divided by n - 1.
# length(xx) replaces the hard-coded 48 so the formula follows the data size.
s2 <- sum(xx^2) / (length(xx) - 1)
s2
## [1] 11.68946
\[s=標準差=\sqrt{變異數}=\sqrt{\frac{\sum_{i=1}^n(X_i-\bar{X})^2}{n-1}}\]
# Sample standard deviation: square root of the variance.
s <- sqrt(s2)
s
## [1] 3.418986
# Interquartile range (Q3 - Q1) of each quantitative variable, using the
# type-2 quantile definition and ignoring NAs.
IQR <- vapply(
  conti_var,
  function(nm) {
    q <- quantile(x[[nm]], c(0.25, 0.75), type = 2, na.rm = TRUE)
    unname(q[2] - q[1]) # Q3 - Q1
  },
  numeric(1),
  USE.NAMES = FALSE
)
temp <- cbind(conti_var, IQR)
rownames(temp) <- NULL
knitr::kable(temp, caption = '', row.names = NA)
conti_var | IQR |
---|---|
盤數 | 1488.5 |
營收 | 44654.5 |
營收調整 | 27490 |
氣溫 | 6.25 |
最高氣溫 | 6.8 |
最低氣溫 | 5.6 |
相對濕度… | 18 |
# Combine the four summary statistics into a single table, one row per
# variable. cbind with a character column makes the whole matrix character.
stats <- cbind(conti_var, average, median, round(ss, 2), round(IQR, 2))
rownames(stats) <- NULL
colnames(stats) <- c("變數", "平均值", "中位數", "標準差", "四分位距")
knitr::kable(stats, caption = '基本統計量')
變數 | 平均值 | 中位數 | 標準差 | 四分位距 |
---|---|---|---|---|
盤數 | 1815.17 | 1646.5 | 1035.57 | 1488.5 |
營收 | 54455.29 | 49390 | 31065.42 | 44654.5 |
營收調整 | 46180.29 | 45170 | 23916.66 | 27490 |
氣溫 | 18.58 | 18.8 | 3.42 | 6.25 |
最高氣溫 | 21.68 | 22.8 | 3.97 | 6.8 |
最低氣溫 | 15.74 | 14.7 | 3.06 | 5.6 |
相對濕度… | 77.36 | 80 | 13.62 | 18 |
# Histograms of column 3 (盤數) and column 9 (氣溫), drawn side by side.
par(mfrow = c(1, 2), mar = c(5, 5, 5, 5))
# Colour lookup indexed by column number: positions 1-5 "blue", 6-10 "purple".
colx <- rep(c("blue", "purple"), each = 5)
coln <- colnames(x)
for (k in c(3, 9)) {
  hist(x[, k], xlab = coln[k], main = "直方圖", ylab = "次數", col = colx[k])
}
# Layered histograms of column 3 and column 9, split by weather (column 6).
# Three histograms share the same breaks and are overdrawn:
#   blue   = all days,
#   green4 = sunny ("晴") or cloudy ("陰") days,
#   red    = sunny ("晴") days only.
# The visible blue part therefore corresponds to the remaining (rainy) days.
par(mfrow = c(1, 2), mar = c(5, 5, 5, 5))
colx <- rep(c("blue", "purple"), each = 5)
coln <- colnames(x)
for (k in c(3, 9)) {
  ht <- hist(x[, k], xlab = coln[k], main = "直方圖", ylab = "次數",
             col = "blue")
  hist(x[which(x[, 6] %in% c("晴", "陰")), k], breaks = ht$breaks,
       xlab = coln[k], main = "直方圖", ylab = "次數", col = "green4",
       add = TRUE)
  hist(x[which(x[, 6] == "晴"), k], breaks = ht$breaks,
       xlab = coln[k], main = "直方圖", ylab = "次數", col = "red",
       add = TRUE)
}
紅色為晴天的情況, 銷售盤數偏多, 天氣偏熱。
藍色為雨天的情況, 銷售盤數偏少, 天氣偏冷。
綠色為陰天。
屬量資料統計圖
# Line-and-point trend plots of column 3 and column 9 over the observation
# index, with hand-built axes: x ticks every 7th row labelled with the
# month-day part (characters 5-8) of the date in column 1, and 6 evenly
# spaced y ticks.
par(mfrow = c(1, 2), mar = c(5, 5, 5, 5))
colx <- rep(c("blue", "purple"), each = 5)
coln <- colnames(x)
n_obs <- nrow(x) # replaces the hard-coded 48
days <- seq_len(n_obs)
for (k in c(3, 9)) {
  plot(days, x[, k], xlab = "", main = paste0(coln[k], "的趨勢圖"),
       ylab = coln[k], col = colx[k], type = "l", lwd = 2, axes = FALSE)
  points(days, x[, k], pch = 19, col = colx[k])
  atx <- seq(1, n_obs, 7)
  axis(1, at = atx, labels = substr(x[atx, 1], 5, 8), las = 2)
  aty <- round(seq(min(x[, k]), max(x[, k]), length.out = 6), 2)
  axis(2, at = aty, labels = aty, las = 2)
}
- 三到四月, 氣溫上升。 - 銷售量逐步上升, 每周有循環波動現象。
# Trend plots again, with each point styled by the weather in column 6:
#   "晴" -> red, pch 18;  "雨" -> blue, pch 17;  otherwise green4, pch 19.
# pchx and colx are left in the workspace as per-observation style vectors.
par(mfrow = c(1, 2), mar = c(5, 5, 5, 5))
n_obs <- nrow(x) # replaces the hard-coded 48
days <- seq_len(n_obs)
colx <- rep("green4", n_obs)
pchx <- rep(19, n_obs)
pchx[which(x[, 6] == "晴")] <- 18
pchx[which(x[, 6] == "雨")] <- 17
colx[which(x[, 6] == "晴")] <- "red"
colx[which(x[, 6] == "雨")] <- "blue"
coln <- colnames(x)
for (k in c(3, 9)) {
  plot(days, x[, k], xlab = "", main = paste0(coln[k], "的趨勢圖"),
       ylab = coln[k], col = "purple", type = "l", lwd = 2, axes = FALSE)
  points(days, x[, k], pch = pchx, col = colx)
  atx <- seq(1, n_obs, 7)
  axis(1, at = atx, labels = substr(x[atx, 1], 5, 8), las = 2)
  aty <- round(seq(min(x[, k]), max(x[, k]), length.out = 6), 2)
  axis(2, at = aty, labels = aty, las = 2)
}
table(x[,6]) # frequency table of weather (column 6)
##
## 雨 陰 晴
## 17 12 19
table(x[,7]) # frequency table of the promotion flag (column 7)
##
## 0 1
## 19 29
table(x[,8]) # frequency table of the holiday flag (column 8)
##
## 0 1
## 33 15
# 2 x 3 panel: pie charts (top row) and bar charts (bottom row) of the
# categorical columns 6 (weather), 7 (promotion) and 8 (holiday).
par(mfrow = c(2, 3), mar = c(3, 3, 1, 1))
coln <- colnames(x)
for (k in c(6, 7, 8)) {
  pie(table(x[, k]), main = coln[k], cex.main = 2, cex.lab = 1.5,
      cex.axis = 1.5)
}
for (k in c(6, 7, 8)) {
  barplot(table(x[, k]), ylim = c(0, 40), width = 0.8, space = 0.2,
          col = c("blue", "green4", "red"), main = coln[k],
          cex.main = 2, cex.lab = 1.5, cex.axis = 1.5)
}
# Deviation of temperature (column 9) from its mean.
dx <- x[, 9] - mean(x[, 9])
round(dx, 2)
## [1] -1.18 -5.48 -3.88 -4.28 -3.98 -5.08 -3.98 -5.18 -5.78 -3.38 -2.08 -2.88
## [13] -3.28 -4.58 -1.88 -2.28 0.52 2.32 3.62 -2.38 -5.08 -2.38 -0.18 0.42
## [25] 1.52 3.92 3.62 3.82 6.02 4.02 2.22 -0.68 0.12 0.22 -0.48 1.12
## [37] 1.82 0.22 2.42 2.32 3.22 3.32 4.82 5.22 1.02 3.12 3.72 5.52
round(dx/sd(x[,9]),2) # standard scores (z-scores) of temperature
## [1] -0.34 -1.60 -1.13 -1.25 -1.16 -1.48 -1.16 -1.51 -1.69 -0.99 -0.61 -0.84
## [13] -0.96 -1.34 -0.55 -0.67 0.15 0.68 1.06 -0.70 -1.48 -0.70 -0.05 0.12
## [25] 0.45 1.15 1.06 1.12 1.76 1.18 0.65 -0.20 0.04 0.07 -0.14 0.33
## [37] 0.53 0.07 0.71 0.68 0.94 0.97 1.41 1.53 0.30 0.91 1.09 1.62
# Deviation of column 3 from its mean. Column 3 is 盤數 (plate count); the
# original comment called it revenue, which is column 4.
dx <- x[, 3] - mean(x[, 3])
round(dx, 3)
## [1] -304.167 -315.167 -1293.167 -1270.167 -1322.167 -1293.167 -876.167
## [8] -721.167 120.833 -982.167 -1126.167 -1188.167 357.833 -112.167
## [15] 21.833 621.833 -225.167 -432.167 -868.167 -898.167 -738.167
## [22] 111.833 688.833 -425.167 -594.167 -914.167 -811.167 -101.167
## [29] 295.833 834.833 -308.167 -954.167 -86.167 -304.167 1187.833
## [36] 1377.833 1860.833 -75.167 64.833 -244.167 710.833 1413.833
## [43] 3062.833 3244.833 543.833 675.833 846.833 739.833
round(dx/sd(x[,3]),2) # standard scores of column 3 (plate count, not revenue)
## [1] -0.29 -0.30 -1.25 -1.23 -1.28 -1.25 -0.85 -0.70 0.12 -0.95 -1.09 -1.15
## [13] 0.35 -0.11 0.02 0.60 -0.22 -0.42 -0.84 -0.87 -0.71 0.11 0.67 -0.41
## [25] -0.57 -0.88 -0.78 -0.10 0.29 0.81 -0.30 -0.92 -0.08 -0.29 1.15 1.33
## [37] 1.80 -0.07 0.06 -0.24 0.69 1.37 2.96 3.13 0.53 0.65 0.82 0.71
# Row indices of sunny ("晴") days and the column-3 values on those days.
idx1 <- which(x[, 6] == "晴")
x[idx1, 3]
## [1] 1703 1837 2437 1590 1383 1927 2504 1390 1221 3003 3193 2526 3229 4878 5060
## [16] 2359 2491 2662 2555
# Row indices of cloudy ("陰") days and the column-3 values on those days.
idx2 <- which(x[, 6] == "陰")
x[idx2, 3]
## [1] 1511 833 947 1077 901 1004 1714 2650 861 1729 3676 1571
# Row indices of rainy ("雨") days and the column-3 values on those days.
idx3 <- which(x[, 6] == "雨")
x[idx3, 3]
## [1] 1500 522 545 493 522 939 1094 1936 689 627 2173 917 2111 1507 1511
## [16] 1740 1880
# Column-3 values (plate count) grouped by weather.
revenueW <- list(sunny = x[idx1, 3], cloud = x[idx2, 3], rainy = x[idx3, 3])
revenueW
## $sunny
## [1] 1703 1837 2437 1590 1383 1927 2504 1390 1221 3003 3193 2526 3229 4878 5060
## [16] 2359 2491 2662 2555
##
## $cloud
## [1] 1511 833 947 1077 901 1004 1714 2650 861 1729 3676 1571
##
## $rainy
## [1] 1500 522 545 493 522 939 1094 1936 689 627 2173 917 2111 1507 1511
## [16] 1740 1880
# Horizontal box plots, one per weather group.
boxplot(revenueW,col=rainbow(3),horizontal = TRUE)
# Mean per weather group.
meanW <- lapply(revenueW, mean)
meanW
## $sunny
## [1] 2523.579
##
## $cloud
## [1] 1539.5
##
## $rainy
## [1] 1218
# Median per weather group.
medianW <- lapply(revenueW, median)
medianW
## $sunny
## [1] 2491
##
## $cloud
## [1] 1294
##
## $rainy
## [1] 1094
# Standard deviation per weather group.
sdW <- lapply(revenueW, sd)
sdW
## $sunny
## [1] 1050.485
##
## $cloud
## [1] 857.317
##
## $rainy
## [1] 609.5565
# Quantiles (0%, 25%, 50%, 75%, 100%; type 2) per weather group.
# The original comment said "standard deviation", which this is not.
aa <- lapply(revenueW, quantile, type = 2)
aa
## $sunny
## 0% 25% 50% 75% 100%
## 1221 1703 2491 3003 5060
##
## $cloud
## 0% 25% 50% 75% 100%
## 833.0 924.0 1294.0 1721.5 3676.0
##
## $rainy
## 0% 25% 50% 75% 100%
## 493 627 1094 1740 2173
# Interquartile range (Q3 - Q1) per weather group. `$sunny` is spelled out
# in full instead of relying on partial matching of `$sun`.
aa$sunny[4] - aa$sunny[2]
aa$cloud[4] - aa$cloud[2]
aa$rainy[4] - aa$rainy[2]
## 75%
## 1300
## 75%
## 797.5
## 75%
## 1113
# Column-3 values split by the promotion flag (column 7): 1 = yes, 0 = no.
revenueP <- list(yes = x[which(x[, 7] == 1), 3], no = x[which(x[, 7] == 0), 3])
revenueP
## $yes
## [1] 917 1077 1927 2504 1390 1221 901 1004 1714 2111 2650 1507 861 1729 1511
## [16] 3003 3193 3676 1740 1880 1571 2526 3229 4878 5060 2359 2491 2662 2555
##
## $no
## [1] 1511 1500 522 545 493 522 939 1094 1936 833 689 627 2173 1703 1837
## [16] 2437 1590 1383 947
# Horizontal box plots for promotion vs. no promotion.
boxplot(revenueP,col=rainbow(2),horizontal = TRUE)
# Mean per promotion group.
meanP <- lapply(revenueP, mean)
meanP
## $yes
## [1] 2201.621
##
## $no
## [1] 1225.316
# Median per promotion group.
medianP <- lapply(revenueP, median)
medianP
## $yes
## [1] 1927
##
## $no
## [1] 1094
# Standard deviation per promotion group.
sdP <- lapply(revenueP, sd)
sdP
## $yes
## [1] 1080.802
##
## $no
## [1] 612.5199
# Quantiles (type 2) per promotion group.
ps <- lapply(revenueP, quantile, type = 2)
ps
## $yes
## 0% 25% 50% 75% 100%
## 861 1507 1927 2650 5060
##
## $no
## 0% 25% 50% 75% 100%
## 493 627 1094 1703 2437
# Interquartile range (Q3 - Q1) per promotion group.
ps$yes[4] - ps$yes[2]
ps$no[4] - ps$no[2]
## 75%
## 1143
## 75%
## 1076
# Column-3 values split by the holiday flag (column 8): 1 = yes, 0 = no.
revenueH <- list(yes = x[which(x[, 8] == 1), 3], no = x[which(x[, 8] == 0), 3])
revenueH
## $yes
## [1] 1511 1500 1094 1936 1837 2437 1927 2504 2111 2650 3003 3193 3676 4878 5060
##
## $no
## [1] 522 545 493 522 939 833 689 627 2173 1703 1590 1383 947 917 1077
## [16] 1390 1221 901 1004 1714 1507 861 1729 1511 1740 1880 1571 2526 3229 2359
## [31] 2491 2662 2555
# Horizontal box plots for holiday vs. non-holiday.
boxplot(revenueH,col=rainbow(2),horizontal = TRUE)
# Mean per holiday group.
meanH <- lapply(revenueH, mean)
meanH
## $yes
## [1] 2621.133
##
## $no
## [1] 1448.818
# Median per holiday group.
medianH <- lapply(revenueH, median)
medianH
## $yes
## [1] 2437
##
## $no
## [1] 1390
# Standard deviation per holiday group.
sdH <- lapply(revenueH, sd)
sdH
## $yes
## [1] 1174.837
##
## $no
## [1] 726.8615
# Quantiles (type 2) per holiday group.
# Fixed: the original took the quantiles of revenueP (promotion groups), so
# the printed result merely repeated the promotion quantiles.
hs <- lapply(revenueH, quantile, type = 2)
hs
## $yes
## 0% 25% 50% 75% 100%
## 861 1507 1927 2650 5060
##
## $no
## 0% 25% 50% 75% 100%
## 493 627 1094 1703 2437
# Interquartile range (Q3 - Q1) of each group in hs.
# Fixed: the second expression lacked the [2] index and therefore subtracted
# the whole quantile vector, printing five numbers instead of one.
hs$yes[4] - hs$yes[2]
hs$no[4] - hs$no[2]
## 75%
## 1143
## 0% 25% 50% 75% 100%
## 1210 1076 609 0 -734
# All group means in one named vector: weather (3), promotion (2), holiday (2).
mean_all <- c(unlist(meanW), unlist(meanP), unlist(meanH))
names(mean_all) <- c("晴", "陰", "雨", "促銷", "無促銷", "假日", "非假日")
mean_all
## 晴 陰 雨 促銷 無促銷 假日 非假日
## 2523.579 1539.500 1218.000 2201.621 1225.316 2621.133 1448.818
# Bar chart of the group means. barplot() returns the bar midpoints (outx),
# used to place dashed separators between the weather / promotion / holiday
# groups and to print each mean above its bar.
outx <- barplot(height = mean_all, names.arg = names(mean_all),
                col = rainbow(7), ylim = c(0, 3000))
br1 <- (outx[3] + outx[4]) / 2 # boundary between weather and promotion bars
br2 <- (outx[5] + outx[6]) / 2 # boundary between promotion and holiday bars
lines(c(br1, br1), c(0, 3000), lty = 2, lwd = 2)
lines(c(br2, br2), c(0, 3000), lty = 2, lwd = 2)
text(outx, mean_all, round(mean_all, 1), pos = 3)
母體參數的假設檢定 (hypothesis testing) 是研究者將母體參數空間 (parameter space) 劃分成兩類, 並陳述他認為真正的參數屬於這兩類的其中之一, 做出假設性的陳述; 並利用隨機抽樣的資料驗證陳述是否為真的過程。
例如:
製程管制問題: 一些生產因素可能隨時間推移而改變, 例如, 原物料品質, 機器老化等。品管人員擔心某一台機器, 或某一條生產線的生產狀況可能會偶而發生不穩定的情形, 所以, 定期檢測產品的品質特性變數 (quality characteristic variable) 以監控 (monitor) 製程的穩定性。(參數空間為穩定和不穩定, 或品質特性變數的範圍)
產品保證: 為了確保供應商的元件品質, 品管部門訂定抽樣計畫用以判定供應商所供應的元件批量是否可以允收。(參數空間為合格和不合格, \(0\leq p\leq 1\))
兩個假設: 當母體參數空間劃分成兩類, 一為虛無假設, 另一個稱為對立假設。
兩種錯誤決策:(a) \(H_0\) 為真, 但錯誤拒絕 \(H_0\) (b) \(H_1\) 為真, 但無法拒絕 \(H_0\)。
p-value: 在 \(H_0\) 為真的前提下, 得到與樣本結果一樣極端或更極端之檢定統計量的機率; p-value 越小, 表示拒絕 \(H_0\) 時犯錯 (錯誤拒絕\(H_0\)) 的風險越低。
顯著水準 significance level: 一個檢定可以忍受最大的型I錯誤, 通常用\(\alpha\)表示。
拒絕域: 滿足拒絕 \(H_0\) 的條件。\(C=\{p\_value<\alpha\}\)
## Warning in kable_pipe(x = structure(c("決策", "拒絕HO", "不拒絕H0", "H0為真", :
## The table should have a header (column names)
決策 | H0為真 | H1為真 |
拒絕H0 | 型I錯誤 | 正確 |
不拒絕H0 | 正確 | 型II錯誤 |
當\(H_0\) (虛無假設 null hypothesis) 為真, 但做出拒絕\(H_0\)的決策, 稱為型I錯誤, 又稱生產者風險。
當\(H_1\) (對立假設 alternative hypothesis) 為真, 但無法拒絕\(H_0\), 稱為型II錯誤, 又稱消費者風險。
影響有兩個部分, 一個是變異(銷售穩定性或波動情形), 另一個平均值。 + 標準差(變異數)檢定 + 平均值檢定: 假日的平均銷售量是否比較高
假設假日的平均盤數為\(\mu_{h0}\), 標準差為\(\sigma_{h0}\), 非假日的平均盤數為\(\mu_{h1}\), 標準差為\(\sigma_{h1}.\) \[H_0:假日收入的標準差與平日相同 (\sigma_{h0}=\sigma_{h1}), H_1: 假日收入的標準差與平日不相同(\sigma_{h0}\neq\sigma_{h1})\]
# Two-sided F test for equal variances, holiday vs. non-holiday, with a 90%
# confidence interval for the variance ratio. The stray `var.equal = FALSE`
# was dropped: it is not a var.test() argument and was silently ignored.
var.test(revenueH$yes, revenueH$no, alternative = "two.sided", conf.level = 0.9)
##
## F test to compare two variances
##
## data: revenueH$yes and revenueH$no
## F = 2.6125, num df = 14, denom df = 32, p-value = 0.02431
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 1.296680 6.003143
## sample estimates:
## ratio of variances
## 2.612472
\[H_0:假日收入與平日相同 (\mu_{h0}=\mu_{h1}), H_1: 假日收入大於平日(\mu_{h0}>\mu_{h1})\] - 因為變異數檢定var.test()證實兩者變異數不相同,故檢定平均時要設定“var.equal=FALSE”
# One-sided Welch t-test (unequal variances): is the holiday mean greater?
t.test(revenueH$yes,revenueH$no,alternative = "greater",var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: revenueH$yes and revenueH$no
## t = 3.5668, df = 19.043, p-value = 0.001026
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 604.0619 Inf
## sample estimates:
## mean of x mean of y
## 2621.133 1448.818
假設有促銷活動的平均盤數為\(\mu_{p0}\), 標準差為\(\sigma_{p0}\), 沒有促銷活動的平均盤數為\(\mu_{p1}\), 標準差為\(\sigma_{p1}.\)
\[H_0:有無促銷活動的標準差相同 (\sigma_{p0}=\sigma_{p1}), H_1: 有無促銷活動的標準差不相同(\sigma_{p0}\neq\sigma_{p1})\]
# Two-sided F test for equal variances, promotion vs. no promotion (90% CI).
var.test(revenueP$yes,revenueP$no,alternative = "two.sided",conf.level=0.9)
##
## F test to compare two variances
##
## data: revenueP$yes and revenueP$no
## F = 3.1135, num df = 28, denom df = 18, p-value = 0.01466
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 1.469015 6.185892
## sample estimates:
## ratio of variances
## 3.113519
\[H_0:促銷活動沒有效益, H_1: 促銷活動有效益\] - 因為變異數檢定var.test()證實兩者變異數不相同,故檢定平均時要設定“var.equal=FALSE”
# One-sided Welch t-test: is the mean with promotion greater than without?
t.test(revenueP$yes,revenueP$no,alternative = "greater",var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: revenueP$yes and revenueP$no
## t = 3.9849, df = 45.261, p-value = 0.0001215
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 564.8891 Inf
## sample estimates:
## mean of x mean of y
## 2201.621 1225.316
\[H_0:促銷活動效益\leq 500, H_1: 促銷活動效益>500\]
# One-sided Welch t-test with mu = 500: does the mean difference exceed 500?
t.test(revenueP$yes,revenueP$no,mu=500,alternative = "greater",var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: revenueP$yes and revenueP$no
## t = 1.9441, df = 45.261, p-value = 0.02906
## alternative hypothesis: true difference in means is greater than 500
## 95 percent confidence interval:
## 564.8891 Inf
## sample estimates:
## mean of x mean of y
## 2201.621 1225.316
\[H_0:晴天與陰天銷售量的變異相同, H_1: 晴天與陰天銷售量的變異不相同\]
# Two-sided F test for equal variances, sunny vs. cloudy days (90% CI).
var.test(revenueW$sunny,revenueW$cloud,alternative = "two.sided",conf.level=0.9)
##
## F test to compare two variances
##
## data: revenueW$sunny and revenueW$cloud
## F = 1.5014, num df = 18, denom df = 11, p-value = 0.4966
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 0.5621333 3.5645627
## sample estimates:
## ratio of variances
## 1.501402
\[H_0:晴天與陰天銷售量相同, H_1: 晴天優於陰天\]
# One-sided pooled-variance t-test (var.equal = TRUE): sunny mean > cloudy mean?
t.test(revenueW$sunny,revenueW$cloud,alternative = "greater",var.equal =TRUE)
##
## Two Sample t-test
##
## data: revenueW$sunny and revenueW$cloud
## t = 2.7186, df = 29, p-value = 0.005476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 369.0186 Inf
## sample estimates:
## mean of x mean of y
## 2523.579 1539.500
# Two-sided F test for equal variances, sunny vs. rainy days (90% CI).
var.test(revenueW$sunny,revenueW$rainy,alternative = "two.sided",conf.level=0.9)
##
## F test to compare two variances
##
## data: revenueW$sunny and revenueW$rainy
## F = 2.97, num df = 18, denom df = 16, p-value = 0.03347
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 1.290373 6.681204
## sample estimates:
## ratio of variances
## 2.96997
# One-sided Welch t-test (unequal variances): sunny mean > rainy mean?
t.test(revenueW$sunny,revenueW$rainy,alternative = "greater",var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: revenueW$sunny and revenueW$rainy
## t = 4.6178, df = 29.411, p-value = 3.561e-05
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 825.4056 Inf
## sample estimates:
## mean of x mean of y
## 2523.579 1218.000
# Two-sided F test for equal variances, cloudy vs. rainy days (90% CI).
var.test(revenueW$cloud,revenueW$rainy,alternative = "two.sided",conf.level=0.9)
##
## F test to compare two variances
##
## data: revenueW$cloud and revenueW$rainy
## F = 1.9781, num df = 11, denom df = 16, p-value = 0.2087
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 0.8053066 5.3427613
## sample estimates:
## ratio of variances
## 1.978131
# One-sided pooled-variance t-test (var.equal = TRUE): cloudy mean > rainy mean?
t.test(revenueW$cloud,revenueW$rainy,alternative = "greater",var.equal = TRUE)
##
## Two Sample t-test
##
## data: revenueW$cloud and revenueW$rainy
## t = 1.1829, df = 27, p-value = 0.1236
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -141.4317 Inf
## sample estimates:
## mean of x mean of y
## 1539.5 1218.0
# Column-3 values for sunny days versus all other days (negative indexing
# with idx1 selects every row that is not sunny).
revenueW2 <- list(sunny = x[idx1, 3], others = x[-idx1, 3])
revenueW2
## $sunny
## [1] 1703 1837 2437 1590 1383 1927 2504 1390 1221 3003 3193 2526 3229 4878 5060
## [16] 2359 2491 2662 2555
##
## $others
## [1] 1511 1500 522 545 493 522 939 1094 1936 833 689 627 2173 947 917
## [16] 1077 901 1004 1714 2111 2650 1507 861 1729 1511 3676 1740 1880 1571
# Horizontal box plots for sunny vs. other days.
boxplot(revenueW2,col=rainbow(2),horizontal = TRUE)
\[H_0:晴天與非晴天銷售量的變異相同, H_1: 晴天與非晴天銷售量的變異不相同\]
# Two-sided F test for equal variances, sunny vs. other days (90% CI).
var.test(revenueW2$sunny,revenueW2$others,alternative = "two.sided",conf.level=0.9)
##
## F test to compare two variances
##
## data: revenueW2$sunny and revenueW2$others
## F = 2.0938, num df = 18, denom df = 28, p-value = 0.07692
## alternative hypothesis: true ratio of variances is not equal to 1
## 90 percent confidence interval:
## 1.053878 4.437785
## sample estimates:
## ratio of variances
## 2.093828
# One-sided Welch t-test (unequal variances): sunny mean > mean of other days?
t.test(revenueW2$sunny,revenueW2$others,alternative = "greater",var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: revenueW2$sunny and revenueW2$others
## t = 4.2462, df = 29.19, p-value = 0.0001011
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 703.4473 Inf
## sample estimates:
## mean of x mean of y
## 2523.579 1351.034
# Scatter plot of column 3 against temperature (column 9), with the points
# styled per observation by pchx / colx, plus the least-squares line.
# NOTE(review): the labels say 營收 (revenue) but x[, 3] is 盤數 (plate
# count); confirm whether column 4 or a different label was intended.
plot(x[, 9], x[, 3], xlab = "氣溫", ylab = "營收", main = "氣溫vs營收",
     pch = pchx, col = colx)
out <- lm(x[, 3] ~ x[, 9])
abline(out, lwd = 2, col = "blue")