/ 猿问

# do.call（rbind，list）列数不均

2019-11-19 10:44:11

x <- list()

x[[1]] <- letters[seq(2,20,by=2)]

names(x[[1]]) <- LETTERS[c(1:length(x[[1]]))]

x[[2]] <- letters[seq(3,20, by=3)]

names(x[[2]]) <- LETTERS[seq(3,20, by=3)]

x[[3]] <- letters[seq(4,20, by=4)]

names(x[[3]]) <- LETTERS[seq(4,20, by=4)]

do.call(rbind,x)

## 3 回答

rbind.fill是一个很棒的功能，在data.frames列表上确实表现出色。但是恕我直言，在这种情况下，当列表仅包含（命名）向量时，可以更快地完成。

require(plyr)

rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))

rbind.named.fill <- function(x) {

nam <- sapply(x, names)

unam <- unique(unlist(nam))

len <- sapply(x, length)

out <- vector("list", length(len))

for (i in seq_along(len)) {

out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]

}

setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)

}

# generate some huge random data:

set.seed(45)

sample.fun <- function() {

nam <- sample(LETTERS, sample(5:15))

val <- sample(letters, length(nam))

setNames(val, nam)

}

ll <- replicate(1e4, sample.fun())

# plyr's rbind.fill version:

rbind.fill.plyr <- function(x) {

rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))

}

rbind.named.fill <- function(x) {

nam <- sapply(x, names)

unam <- unique(unlist(nam))

len <- sapply(x, length)

out <- vector("list", length(len))

for (i in seq_along(len)) {

out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]

}

setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)

}

foo <- function (...)

{

dargs <- list(...)

all.names <- unique(names(unlist(dargs)))

out <- do.call(rbind, lapply(dargs, `[`, all.names))

colnames(out) <- all.names

as.data.frame(out, stringsAsFactors=FALSE)

}

require(microbenchmark)

microbenchmark(t1 <- rbind.named.fill(ll),

t2 <- rbind.fill.plyr(ll),

t3 <- do.call(foo, ll), times=10)

identical(t1, t2) # TRUE

identical(t1, t3) # TRUE

Unit: milliseconds

expr        min         lq     median         uq        max neval

t1 <- rbind.named.fill(ll)   243.0754   258.4653   307.2575   359.4332   385.6287    10

t2 <- rbind.fill.plyr(ll) 16808.3334 17139.3068 17648.1882 17890.9384 18220.2534    10

t3 <- do.call(foo, ll)   188.5139   204.2514   229.0074   339.6309   359.4995    10

foo <- function (...)

{

dargs <- list(...)

if (!all(vapply(dargs, is.vector, TRUE)))

stop("all inputs must be vectors")

if (!all(vapply(dargs, function(x) !is.null(names(x)), TRUE)))

stop("all input vectors must be named.")

all.names <- unique(names(unlist(dargs)))

out <- do.call(rbind, lapply(dargs, `[`, all.names))

colnames(out) <- all.names

out

}

R > do.call(foo, x)

A   B   C   D   E   F   G   H   I   J   L   O   R   P   T

[1,] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" NA  NA  NA  NA  NA

[2,] NA  NA  "c" NA  NA  "f" NA  NA  "i" NA  "l" "o" "r" NA  NA

[3,] NA  NA  NA  "d" NA  NA  NA  "h" NA  NA  "l" NA  NA  "p" "t"

rbindlist(l2, fill=TRUE)

# generate some huge random data:

set.seed(45)

sample.fun <- function() {

nam <- sample(LETTERS, sample(5:15))

val <- sample(letters, length(nam))

setNames(val, nam)

}

l1 <- replicate(1e6, sample.fun()) # Arun's data, just bigger

l2 <- lapply(l1, as.list) # same data converted with as.list

library(microbenchmark)

library(data.table)

# Arun's function

rbind.named.fill <- function(x) {

nam <- sapply(x, names)

unam <- unique(unlist(nam))

len <- sapply(x, length)

out <- vector("list", length(len))

for (i in seq_along(len)) {

out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]

}

setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)

}

# GSee's function

foo <- function (...)

{

dargs <- list(...)

all.names <- unique(names(unlist(dargs)))

out <- do.call(rbind, lapply(dargs, `[`, all.names))

colnames(out) <- all.names

as.data.frame(out, stringsAsFactors=FALSE)

}

microbenchmark(t1 <- rbind.named.fill(l1),

t2 <- rbindlist(l2, fill=TRUE),

t3 <- do.call(foo, l1),

times=10)

#> Unit: seconds

#>                                 expr      min        lq        mean    median        uq      max neval

#> t1 <- rbind.named.fill(l1)      6.536782  7.545538   9.118771  9.304844 10.505814 11.28260    10

#> t2 <- rbindlist(l2, fill=TRUE)  5.250387  5.787712   6.910340  6.226065  7.579503 10.40524    10

#> t3 <- do.call(foo, l1)          9.590615 11.043557  13.504694 12.550535 15.364464 19.95877    10

identical(t1, data.frame(t2))

#> [1] TRUE

identical(t3, data.frame(t2))

#> [1] TRUE

• 3 回答
• 0 关注
• 40 浏览

0/150