这是我尝试制作可重现问题的第一个镜头.努力成为SO的更好成员.
这是数据集
reach.dat <- structure(list(stream = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Brooks Creek", "Buncombe Hollow Creek",
"Bypass Channel", "Cape Horn Creek", "Cougar Creek", "Dog Creek",
"Indian George Creek", "Jim Creek", "Lower Speelyai", "North Siouxon Creek",
"Ole Creek", "Panamaker Creek", "Siouxon Creek", "Speelyai Creek",
"West Fork Speelyai Creek", "West Tributary Speelyai Creek"), class = "factor"),
reach = structure(c(21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 46L, 47L, 38L, 39L, 40L, 41L, 42L, 43L,
44L, 45L), .Label = c("BNH_0001", "BNH_0002", "BNH_0003",
"BNH_0004", "BNH_0005", "BNH_0006", "BNH_0007", "BNH_0008",
"BNH_0009", "BPC_0001", "BPC_0002", "BPC_0003", "BPC_0004",
"BPC_0005", "BPC_0006", "BPC_0007", "BPC_0008", "BPC_0009",
"BPC_0010", "BPC_0011", "BRK_001", "BRK_002", "BRK_003",
"BRK_004", "BRK_005", "BRK_006", "BRK_007", "BRK_008", "BRK_009",
"BRK_010", "BRK_011", "BRK_012", "BRK_013", "BRK_014", "BRK_015",
"BRK_016", "BRK_017", "CGR_0001", "CGR_0002", "CGR_0003",
"CGR_0004", "CGR_0005", "CGR_0006", "CGR_0007", "CGR_0008",
"CHC_0001", "CHC_0002", "DOG_0001", "ING_0001", "ING_0002",
"ING_0003", "ING_0004", "ING_0005", "ING_0006", "ING_0007",
"JMC_0001", "JMC_0002", "LSPL_0001", "NSX_0001", "NSX_0002",
"OLE_0001", "OLE_0002", "OLE_0003", "OLE_0004", "OLE_0005",
"OLE_0006", "PMR_0001", "PMR_0002", "SPL_0001", "SPL_0002",
"SPL_0003", "SPL_0004", "SPL_0005", "SPL_0006", "SPL_0007",
"SPL_0008", "SPL_0009", "SPL_0010", "SPL_0011", "SPL_0012",
"SPL_0013", "SPL_0014", "SPL_0015", "SPL_0016", "SPL_0017",
"SPL_0018", "SPL_0019", "SPL_0020", "SPL_0021", "SPL_0022",
"SPL_0023", "SXN_0001", "SXN_0002", "SXN_0003", "SXN_0004",
"SXN_0005", "SXN_0006", "SXN_0007", "SXN_0008", "WFSPL_0001",
"WFSPL_0002", "WFSPL_0003", "WFSPL_0004", "WFSPL_0005", "WTSPL_0001",
"WTSPL_0002", "WTSPL_0003", "WTSPL_0004", "WTSPL_0005"), class = "factor"),
length = c(178.9, 170.1, 185.8, 199.7, 190.3, 190, 172.3,
196.4, 179, 183.4, 182.4, 196.7, 188.4, 181.5, 178.9, 196.8,
181.2, 116.2, 121.4, 124.2, 180, 111.7, 130.3, 127.5, 143.4,
75.7, 591.1, 640.9, 582.2, 659, 534.9, 621.9, 636, 564.2,
547.1, 537.2, 589.6, 329.5, 192.7, 454.4, 464.5, 477.6, 544.8,
500.1, 473.1, 481.7, 506.4), SUM_LWD = c(0.288093907, 1.324221046,
1.763186222, 0.774161242, 0.391907514, 0.889842105, 0, 1.5200611,
1.097765363, 0.573173391, 0, 0.622267412, 0.427070064, 1.43338843,
1.151928452, 1.935365854, 1.856622517, 0.326333907, 0.07199341,
0.037520129, 0, 0, 0, 0.169647059, 0.138493724, 0.098678996,
0, 0, 0.217279285, 0.179044006, 0.097868761, 0.210644798,
0, 0.708259482, 0.145567538, 0.03877513, 0.026611262, 2.302579666,
2.125583809, 0.092033451, 0.176533907, 0.955904523, 0.175624082,
1.434553089, 1.038575354, 0.198048578, 1.042061611), Avg_RPD = c(0.352,
0.343333333, 0.325, 0.34, 0.225, 0.2475, 0.254, 0.2125, 0.276666667,
0.22, 0.32875, 0.23125, 0.215, 0.268, 0.305, 0.243636, 0.335714286,
0.2, 0.216666667, 0.24, 0.243333333, 0.56, 0.2575, 0.208,
0.335833333, 0.376666667, 0.175, 0.695, 0.75, 0.546666667,
1.188333333, 1.58, 0.885, 0.448571429, NA, NA, NA, 0.611818182,
0.50875, NA, 0.77, 0.49875, NA, 0.544, 1.017777778, 0.9175,
0.623333333), Avg_Substrate_Pools = c(86.10955531, 104.0366873,
83.94224019, 88.24737809, 88.93432719, 82.59257502, 84.02947757,
81.71253918, 82.76023619, 82.88298227, 84.91750332, 81.21887219,
85.05625823, 82.04273063, 84.01489539, 92.41003006, 117.3416424,
88.61396078, 88.86511047, 83.38603629, 71.73828707, 76.57563745,
86.2069123, 83.62789949, 81.19546417, 80.89002676, 182.5586,
160.63761, 134.82532, 123.1769494, 112.0805653, 93.59420959,
180.5731068, 82.91509529, NA, 61.9283, NA, 111.7153167, 83.79134743,
61.9283, 89.51477005, NA, NA, 86.08708892, 85.3472397, 110.8504333,
91.00371559)), .Names = c("stream", "reach", "length", "SUM_LWD",
"Avg_RPD", "Avg_Substrate_Pools"), row.names = c(NA, 47L), class = "data.frame")
这是数据框的一小部分,我已经删除了一个因子变量的几个级别(在这种情况下是流).因此,您将在脚本的开头看到,我添加了一些列,删除了NA的行(这是我认为的问题的根源,因为此代码适用于没有任何NA单元格的变量).然后我放弃水平,因为原始数据集有更多的流,正如我刚才提到的.当我运行ht.means.xx时,错误发生在脚本的末尾我得到一个错误错误:结果大小(5),预期4或1.我认为5是正确的数字,因为有5个不同的流名称.我没有通过删除具有NA值的行来删除整个流.每行实际上是特定流的范围,每个流都有超过1个范围.因此,即使为了具有NA值而移除了到达(行),由于该流的其他到达(行)没有NA值,该流仍然应该存在.
这是脚本.这有点令人困惑,我正在努力.我想如果你阅读我的描述并查看创建的data.frame(其结构),它将是有意义的.
# Getting Started ---------------------------------------------------------
require(dplyr)
require(sampling)
require(xtable)
require(car)
require(lattice)
## Read in Data on reaches for generating 1st order probabilities
#add a column for stream ID
reach.dat$streamID <- as.numeric(reach.dat$stream)
#add a column for reach ID
reach.dat$reachID <- 1:47
#add probabilities of selecting each reach, within each stream (prop to length)
length.totals <- reach.dat %>%
group_by(stream) %>%
summarise(totals = sum(length))
reach.dat <- merge(reach.dat, length.totals, by = "stream")
reach.dat$length.probs <- with(reach.dat, length/totals)
#reorder columns for organization
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length",
"length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")]
reach.dat <- reach.dat[,1:8]
# Remove the NA values in specific columns. Specify Column
completeFun <- function(reach.dat, desiredCols) {
completeVec <- complete.cases(reach.dat[, desiredCols])
return(reach.dat[completeVec, ])
}
reach.dat <- completeFun(reach.dat, "Avg_RPD")
droplevels(reach.dat)
nsim <- 100
# Running Simulations -----------------------------------------------------
#this function draws a unequal probability sample within each stream
sample.fun.pi <- function(stream.no, n.vec){
sample <-
#ifelse statement to deal with the streams that have only 1 reach
ifelse(length(reach.dat[reach.dat$streamID==stream.no, "reachID"]) == 1,
#this line says to sample that one reach, if the stream only has one reach
reach.dat[which(reach.dat$streamID==stream.no), "reachID"],
#if more than one reach, draw a uneq prob sample of size n.vec
list(sample(subset(reach.dat, streamID == stream.no)$reachID,
size = n.vec[stream.no],
#probabilities proportional to length
prob = reach.dat[reach.dat$streamID == stream.no, "length.probs"],
#without replacement
replace = FALSE)))
return(unlist(sample))
}
strata.uneq.pi <- function(n.vec, nsim){
#store nsim samples in a matrix called sample.all
sample.all <- matrix(NA, nrow = sum(n.vec), ncol = nsim)
for(i in 1:nsim){
#for each sample, apply the above function to all the streams
sample.all[, i] <- unlist(apply(cbind(1:length(unique(reach.dat$streamID))), 1,
sample.fun.pi, n.vec))
}
return(sample.all)
}
#define sample sizes
n1.6<-c(1,1,1,1,1)
n7.8<-c(1,1,1,1,1)
n9.10<-c(2,1,1,1,1)
#Set seed
set.seed(303)
#build a matrix of samples for every sample size
sample.1.6 <- strata.uneq.pi(n1.6, nsim)
sample.7.8 <- strata.uneq.pi(n7.8, nsim)
sample.9.10 <- strata.uneq.pi(n9.10, nsim)
#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is
# selected out of the total number of simulations
pi.1.6 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.1.6[i] <- sum(sample.1.6==i)/nsim
}
pi.7.8 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.7.8[i] <- sum(sample.7.8==i)/nsim
}
pi.9.10 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
#total number of times the reach is sampled out of the total num of sims
pi.9.10[i] <- sum(sample.9.10==i)/nsim
}
# Use this to enter the variable values of choice
reach.dat$y<-(reach.dat$Avg_RPD)
#find number of reaches in each stream for calculating ht estimator
reach.dat$stream <- as.factor(as.character(reach.dat$stream))
num.reaches <- unlist(with(reach.dat, tapply(reach, stream, length)))
#find true mean of responses within each stream
true_y <- reach.dat %>%
group_by(stream) %>%
summarise(true_y=mean(y)) %>%
ungroup()
#this function calculates the ht estimator for each simulated sample from above
#uses the estimated inclusion probabilities
ht.fun <- function(simnum, sample.n, pi.n){
#reaches that were selected in the sample
reach <- sample.n[, simnum]
#estimated inclusion probabilities for those reaches
pi <- pi.n[sample.n[, simnum]]
#y-values that were selected
y <- reach.dat[sample.n[, simnum], "y"]
#the streams that the selected reaches belong to
stream <- reach.dat[sample.n[, simnum], "stream"]
#organize into a dataframe so we can use dplyr on it
data <- cbind.data.frame(reach, pi, y, stream)
#use dplyr to calculate the ht estimates of the total and the mean
ht.strata <- data %>%
tbl_df %>%
group_by(stream) %>%
summarise(total = sum(y/pi)) %>%
mutate(size = num.reaches) %>%
mutate(mean = total/size)
return(ht.strata[, "mean"])
}
#apply to every one of the nsim samples for each sampling rate
ht.means.1.6 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.1.6,
pi.n = pi.1.6))
ht.means.7.8 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.7.8,
pi.n = pi.7.8))
ht.means.9.10 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.9.10,
pi.n = pi.9.10))
最佳答案 您可以进行以下一些更改:
#add a column for stream ID
reach.dat$streamID <- as.numeric(reach.dat$stream)
#add a column for reach ID
reach.dat$reachID <- 1:nrow(reach.dat)
#add probabilities of selecting each reach, within each stream (prop to length)
reach.dat <- reach.dat %>%
group_by(stream) %>%
mutate(totals = sum(length),
length.probs=length/totals)
#reorder columns for organization
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length",
"length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")]
reach.dat <- reach.dat[,1:8]
reach.dat <- reach.dat[!is.na(reach.dat$Avg_RPD),]
reach.dat <- droplevels(reach.dat)
#add a column for reach ID (Again)
reach.dat$reachID <- 1:nrow(reach.dat)
nsim <- 10
# Running Simulations -----------------------------------------------------
#define sample sizes
sizes <- list(
n1.6=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
n7.8=c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1),
n9.10=c(2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1),
n11.13=c(2,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1),
n14=c(2,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1),
n15=c(3,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1),
n16=c(3,1,2,1,1,1,1,1,1,1,1,1,1,4,1,1),
n17.18=c(3,2,2,1,1,1,1,1,1,1,1,1,1,4,1,1),
n19=c(3,2,2,1,2,1,1,1,1,1,1,1,2,4,1,1),
n20=c(3,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1),
n21=c(4,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1),
n22=c(4,2,2,1,2,1,2,1,1,1,1,1,2,5,1,1),
n23=c(4,2,3,1,2,1,2,1,1,1,1,1,2,5,1,1),
n24=c(4,2,3,1,2,1,2,1,1,1,1,1,2,6,1,1),
n25.26=c(4,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
n27=c(5,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
n28=c(5,3,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
n29=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,1,1),
n30.31=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,2,2),
n32=c(5,3,4,1,3,1,2,1,1,1,2,1,3,7,2,2),
n33.35=c(6,3,4,1,3,1,2,1,1,1,2,1,3,8,2,2),
n36=c(6,3,4,1,3,1,3,1,1,1,2,1,3,8,2,2),
n37.38=c(6,3,4,1,3,1,3,1,1,1,2,1,3,9,2,2),
n39.40=c(7,4,4,1,3,1,3,1,1,1,2,1,3,9,2,2),
n41=c(7,4,5,1,3,1,3,1,1,1,2,1,3,9,2,2),
n42.43=c(7,4,5,1,3,1,3,1,1,1,3,1,3,10,2,2),
n44=c(7,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2),
n45=c(8,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2),
n46.49=c(8,4,5,1,4,1,3,1,1,1,3,1,4,11,2,2),
n50=c(9,5,6,1,4,1,4,1,1,1,3,1,4,12,3,3))
#Set seed
set.seed(303)
s <- split(reach.dat, reach.dat[,"streamID"])
s.reach <- lapply(s, '[[', "reachID")
s.probs <- lapply(s, '[[', "length.probs")
ind1 <- lengths(s.probs) == 1
strata.uneq.pi <- function(n.vec, nsim) {
replicate(nsim, {out <- vector("list", length(n.vec))
out[ind1] <- s.reach[ind1]
out[!ind1] <- Map(sample, x=s.reach[!ind1],
size=n.vec[!ind1],
prob=s.probs[!ind1],
replace=FALSE)
unlist(out)
})
}
#build a matrix of samples for every sample size
samples <- lapply(sizes, function(x) strata.uneq.pi(x, nsim))
#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is
# selected out of the total number of simulations
getpi <- function(smple, nsim) {
len <- length(reach.dat$reach)
pi <- numeric(len)
for(i in 1:len) pi[i] <- sum(smple == i)/nsim
pi
}
pi.lst <- lapply(samples, getpi, nsim=nsim)
# Use this to enter the variable values of choice
reach.dat$y <- reach.dat$Avg_RPD
#find number of reaches in each stream for calculating ht estimator
num.reaches <- table(reach.dat$stream)
#find true mean of responses within each stream
true_y <- reach.dat %>%
group_by(stream) %>%
summarise(true_y=mean(y))
reach.dat$reachID
as.data.frame(reach.dat)
#this function calculates the ht estimator for each simulated sample from above
#uses the estimated inclusion probabilities
ht.fun <- function(sample.n, pi.n) {
apply(sample.n, 2, function(ind) {
df <- data.frame(pi=pi.n[ind],
reach.dat[ind, "y"],
reach.dat[ind, "stream"], stringsAsFactors = FALSE)
#dim(df)
unique(df$stream)
df$size <- num.reaches[df$stream]
df <- na.omit(df %>% group_by(stream) %>%
summarise(total=sum(y/pi),
mean=total/size[1]))
df$mean}
)
}
#apply to every one of the nsim samples for each sampling rate
ht.lst <- Map(ht.fun, samples, pi.lst)
n <- rep(names(sizes), each=16)
stream <- as.character(rep(sort(unique(reach.dat$stream)), 30))
true_mean <- rep(true_y$true_y,30)
sim_dat <- data.frame(cbind(stream, n, true_mean, do.call(rbind, ht.lst)))