我正在寻找更快的方法来计算这类东西. (真正的数据库有109 964 694行).
DT<-data.table(id=c(1,2,1,4,2,1,7,8,8,10),
effect=c("A","A","B","B","B","C","C","C","A","D"),
value=1)
我希望id具有效果==“A”AND效果==“C”
intersect(DT[effect=="A",(id)], DT[effect=="C",(id)])
还有其他更快的方法吗?
我也希望id具有效果==“A”AND效果==“C”..但不具有效果==“B”
setdiff(
intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),
DT[effect=="B",(id)]
)
任何更好(更快)的想法?
问候
PS:我试过这种事:
DT[,effect=="diag_998" | effect=="diag_1900",by=id][,sum(V1)==2,by=id][V1==TRUE,(id)]
但它非常非常慢……
编辑:谢谢大家的提议.
似乎没有什么比V1和A1更快:
system.time(V1<-intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]))# 0.014 sec
system.time(V2<-DT[effect=="A" | effect=="C", unique(id[duplicated(id)])]) #17sec
system.time(V3<-DT[,list(cond=all(c("A","C") %in% effect)),by=id][cond==TRUE,id] ) #more than 1 min
system.time(V4<-DT[effect=="A" | effect=="C", .N[.N > 1], by = id]$id) # 17 sec
system.time(V5<-DT[, .GRP[sum(c("A", "C") %chin% effect)==2], id]$id) # 103 sec
system.time(V6<-DT[, .GRP[sum(c("A", "C") %in% effect)==2], id]$id)#more than 1 min
setkey(DT, effect)
system.time(V7<-DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)#0.19 sec
system.time(A1<-setdiff(intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),DT[effect=="D",(id)])) # 0.014 sec
system.time(A2<-DT[,list(cond=all(c("A","C") %in% effect) & (!"D" %in% effect)),by=id][cond==TRUE,id])#more than 1 min
system.time(A3<-DT[,list(cond=all(c("A","C") %chin% effect) & (!"D" %chin% effect)),by=id][cond==TRUE,id])
system.time(A4<-DT[.(c("A", "C", "D")), if (.N == 2 & !("D" %in% effect)) TRUE, by = id]$id)# 1sec
编辑:另一个基准,感谢@MichaelChirico
microbenchmark(times=50L,
+ bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
+ bakal.keyed=intersect(DT["A", id], DT["C", id]),
+ rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
+ rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: milliseconds
expr min lq mean median uq max neval
bakal 10.963171 11.003494 11.072844 11.019909 11.060331 12.641751 50
bakal.keyed 10.738537 10.794715 10.878960 10.838630 10.892020 12.416713 50
rscr1 9.504886 9.572026 9.662024 9.598426 9.645478 11.127047 50
rscr1.keyed 9.013076 9.037370 9.065215 9.065669 9.083492 9.206366 50
最佳答案 看来你走的是正确的道路;我只能建议加快速度.
这是一个小型的样本数据集,用于清除最糟糕的尝试:
set.seed(10239)
NN<-1e6
DT<-data.table(id=sample(8e5,NN,T),
effect=sample(LETTERS[1:4],NN,T),
val=rnorm(NN),key="effect")
我最初的建议是使用键控,但不是最好的方式,基本上:setkey(DT,效果); DT [.(c(“A”,“C”)),if(.N> 1)TRUE,by = id] $id.
下面是这种方法的TRUE基准,你原来的做法,和我在评论中提到(最优化的一点点其他的,如更换COND ==用(在@尼古拉的做法COND)和效果==“A” |效果==“C”在@ RichardScriven中的%c(“A”,C“)中有效%%:
library(microbenchmark)
microbenchmark(times=50L,
bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
nicol=DT[,.(cond=all(c("A","C") %in% effect)), by=id][(cond), id],
rscr2=DT[effect %in% c("A","C"), .N[.N > 1], by = id]$id,
akrun=DT[, .GRP[sum(c("A", "C") %chin% effect) == 2], id]$id,
mikec=DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)
结果在我的机器上:
Unit: milliseconds
expr min lq mean median uq max neval cld
bakal 14.82926 15.18540 17.86200 15.56453 16.70924 64.99443 50 a
rscr1 13.41102 13.98252 20.11127 14.93054 18.02248 66.14476 50 a
nicol 1329.82013 1377.03884 1436.45650 1404.48956 1483.47944 1758.00831 50 e
rscr2 260.54888 269.86605 294.05219 276.66802 310.76356 479.50419 50 c
akrun 997.43300 1075.17322 1103.06220 1095.08246 1118.16848 1360.80793 50 d
mikec 154.39418 158.90884 180.01096 163.32763 206.59246 235.45523 50 b
让我们看看这些如何在您提到的数据表上执行:
set.seed(12039)
NN<-1e8
DT<-data.table(id=sample(8e5,NN,T),
effect=sample(LETTERS[1:4],NN,T),
val=rnorm(NN),key="effect")
microbenchmark(times=50L,
bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
bakal.keyed=intersect(DT["A", id], DT["C", id]),
rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: seconds
expr min lq mean median uq max neval cld
bakal 3.772309 3.991414 4.395669 4.408319 4.681609 5.170224 50 a
bakal.keyed 3.708500 3.807447 4.289518 4.384870 4.653427 5.085429 50 a
rscr1 4.962940 5.210845 5.721636 5.707369 6.162103 6.779900 50 b
rscr1.keyed 4.904702 5.117411 5.727848 5.807186 6.194990 6.975508 50 b