Related
I have a large list that stored measurements (a product of other lapply() runs). I now want to gather these measurements and calculate median/mean/sd etc but I don't know how to access them. The structure of this list is like this:
foo[[i]][[j]][[k]][[1]]
foo[[i]][[j]][[k]][[2]]$bar
I can't figure out a function that would return e.g. mean of $bar (but not of $x) and keep relation the values of the indices i,j,k.
A sample list can be generated with the following R code:
library(purrr)
metrics <- function(y){
tt10r <- median(y)
list(y, flatten(list(bar = tt10r)))
}
example_list <- list()
for (i in 1:10)
{
v <- list()
for (j in 1:10)
{
w <- 1:10
v[j] <- list(w)
}
example_list[[i]] <- v
}
foo <- list()
for (i in 1:length(example_list))
{
u <- list()
values <- list()
for (j in 1:length(example_list[[i]]))
{
u[[j]] <- lapply(example_list[[i]][[j]], function(x) mean(x))
values[[j]] <- lapply(u[[j]], function(x) metrics(x))
}
foo[[i]] <- values
}
The following code works nicely, but I am not sure if it is efficient (loops!). Gives the anticipated result:
final <- matrix(nrow = tail(cumsum(unlist(lapply(foo, function(x) lengths(x) -2))), n=1), ncol = 3)
final <- data.frame(final)
j=1
i=1
all_js <- c(0, cumsum(lengths(foo)))
starts <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2)))) + 1
ends <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2))))
for (i in 1:length(foo))
{
a <- foo[[i]]
for (j in 1:length(a))
{
b <- a[[j]]
data <- unlist(lapply(lapply(b[1], '[', 2), '[[', 1))
for (k in 2:c(length(b)-2))
{
data <- rbind(data,unlist(lapply(lapply(b[k], '[', 2), '[[', 1)))
}
row.names(data) <- NULL
colnames(final) <- c("i", "j", colnames(data))
first <- starts[all_js[i] + j]
last <- ends[all_js[i] + j+1]
final[first:last,] <- data.frame(cbind(i = i, j = j, data))
}
}
I am looking for the r equivalent of this simple code in python
mylist = []
for this in that:
df = 1
mylist.append(df)
basically just creating an empty list, and then adding the objects created within the loop to it.
I only saw R solutions where one has to specify the index of the new element (say mylist[[i]] <- df), thus requiring to create an index i in the loop.
Is there any simpler way than that to just append after the last element.
There is a function called append:
ans <- list()
for (i in 1992:1994){
n <- 1 #whatever the function is
ans <- append(ans, n)
}
ans
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1
##
## [[3]]
## [1] 1
##
Note: Using apply functions instead of a for loop is better (not necessarily faster) but it depends on the actual purpose of your loop.
Answering OP's comment: About using ggplot2 and saving plots to a list, something like this would be more efficient:
plotlist <- lapply(seq(2,4), function(i) {
require(ggplot2)
dat <- mtcars[mtcars$cyl == 2 * i,]
ggplot() + geom_point(data = dat ,aes(x=cyl,y=mpg))
})
Thanks to #Wen for sharing Comparison of c() and append() functions:
Concatenation (c) is pretty fast, but append is even faster and therefor preferable when concatenating just two vectors.
There is: mylist <- c(mylist, df) but that's usually not the recommended way in R. Depending on what you're trying to achieve, lapply() is often a better option.
mylist <- list()
for (i in 1:100){
n <- 1
mylist[[(length(mylist) +1)]] <- n
}
This seems to me the faster solution.
x <- 1:1000
aa <- microbenchmark({xx <- list(); for(i in x) {xx <- append(xx, values = i)} })
bb <- microbenchmark({xx <- list(); for(i in x) {xx <- c(xx, i)} } )
cc <- microbenchmark({xx <- list(); for(i in x) {xx[(length(xx) + 1)] <- i} } )
sapply(list(aa, bb, cc), (function(i){ median(i[["time"]]) / 10e5 }))
#{append}=4.466634 #{c}=3.185096 #{this.one}=2.925718
mylist <- list()
for (i in 1:100) {
df <- 1
mylist <- c(mylist, df)
}
Use
first_list = list(a=0,b=1)
newlist = c(first_list,list(c=2,d=3))
print(newlist)
$a
[1] 0
$b
[1] 1
$c
[1] 2
$d
[1] 3
Here's an example:
glmnet_params = list(family="binomial", alpha = 1,
type.measure = "auc",nfolds = 3, thresh = 1e-4, maxit = 1e3)
Now:
glmnet_classifier = do.call("cv.glmnet",
c(list(x = dtm_train, y = train$target), glmnet_params))
I try to do a spelling checker with R that correct a spelling mistake of a word or a document.
I try with this R code to do a correction for a word, which it works very well:
> Correct("speling", dtm = counts)
$l4
[1] "spelling"
but when I try to do the correction of a document, I get this error :
> CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts)
Error in strsplit(word, NULL) : non-character argument
# This is a text processing function, which I
# borrowed from a CMU Data mining course professor.
strip.text <- function(txt) {
# remove apostrophes (so "don't" -> "dont", "Jane's" -> "Janes", etc.)
txt <- gsub("'","",txt)
# convert to lowercase
txt <- tolower(txt)
# change other non-alphanumeric characters to spaces
txt <- gsub("[^a-z0-9]"," ",txt)
# change digits to #
txt <- gsub("[0-9]+"," ",txt)
# split and make one vector
txt <- unlist(strsplit(txt," "))
# remove empty words
txt <- txt[txt != ""]
return(txt)
}
# Words within 1 transposition.
Transpositions <- function(word = FALSE) {
N <- nchar(word)
if (N > 2) {
out <- rep(word, N - 1)
word <- unlist(strsplit(word, NULL))
# Permutations of the letters
perms <- matrix(c(1:(N - 1), 2:N), ncol = 2)
reversed <- perms[, 2:1]
trans.words <- matrix(rep(word, N - 1), byrow = TRUE, nrow = N - 1)
for(i in 1:(N - 1)) {
trans.words[i, perms[i, ]] <- trans.words[i, reversed[i, ]]
out[i] <- paste(trans.words[i, ], collapse = "")
}
}
else if (N == 2) {
out <- paste(word[2:1], collapse = "")
}
else {
out <- paste(word, collapse = "")
}
return(out)
}
# Single letter deletions.
Deletes <- function(word = FALSE) {
N <- nchar(word)
word <- unlist(strsplit(word, NULL))
out <- list()
for(i in 1:N) {
out[i] <- paste(word[-i], collapse = "")
}
return(out)
}
# Single-letter insertions.
Insertions <- function(word = FALSE) {
N <- nchar(word)
out <- list()
for (letter in letters) {
out[[letter]] <- rep(word, N + 1)
for (i in 1:(N + 1)) {
out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter,
substr(word, i, N), sep = "")
}
}
out <- unlist(out)
return(out)
}
# Single-letter replacements.
Replaces <- function(word = FALSE) {
N <- nchar(word)
out <- list()
for (letter in letters) {
out[[letter]] <- rep(word, N)
for (i in 1:N) {
out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter,
substr(word, i + 1, N + 1), sep = "")
}
}
out <- unlist(out)
return(out)
}
# All Neighbors with distance "1"
Neighbors <- function(word) {
neighbors <- c(word, Replaces(word), Deletes(word),
Insertions(word), Transpositions(word))
return(neighbors)
}
# Probability as determined by our corpus.
Probability <- function(word, dtm) {
# Number of words, total
N <- length(dtm)
word.number <- which(names(dtm) == word)
count <- dtm[word.number]
pval <- count/N
return(pval)
}
# Correct a single word.
Correct <- function(word, dtm) {
neighbors <- Neighbors(word)
# If it is a word, just return it.
if (word %in% names(dtm)) {
out <- word
}
# Otherwise, check for neighbors.
else {
# Which of the neighbors are known words?
known <- which(neighbors %in% names(dtm))
N.known <- length(known)
# If there are no known neighbors, including the word,
# look farther away.
if (N.known == 0) {
print(paste("Having a hard time matching '", word, "'...", sep = ""))
neighbors <- unlist(lapply(neighbors, Neighbors))
}
# Then out non-words.
neighbors <- neighbors[which(neighbors %in% names(dtm))]
N <- length(neighbors)
# If we found some neighbors, find the one with the highest
# p-value.
if (N > 1) {
P <- 0*(1:N)
for (i in 1:N) {
P[i] <- Probability(neighbors[i], dtm)
}
out <- neighbors[which.max(P)]
}
# If no neighbors still, return the word.
else {
out <- word
}
}
return(out)
}
# Correct an entire document.
CorrectDocument <- function(document, dtm) {
by.word <- unlist(strsplit(document, " "))
N <- length(by.word)
for (i in 1:N) {
by.word[i] <- Correct(by.word[i], dtm = dtm)
}
corrected <- paste(by.word, collapse = " ")
return(corrected)
}
words <- scan("http://norvig.com/big.txt", what = character())
words <- strip.text(words)
counts <- table(words)
Correct("speling", dtm = counts)
#---correct a document
CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts)
Any idea please?
Thank you
The function Correct has a bug, you should add an unlist, i.e. the line :
Correct <- function(word, dtm) {
neighbors <- Neighbors(word)
should be changed as :
Correct <- function(word, dtm) {
neighbors <- unlist(Neighbors(word))
EDIT :
Here's a function which correct the lines of a document file (overwriting it) :
CorrectDocumentFile <- function(file,dtm){
# read the file lines
textLines <- unlist(readLines(file))
# for each line not empty or blank, correct the text
for(i in which(!grepl("^\\s*$",textLines))){
line <- textLines[[i]]
textLines[i] <- CorrectDocument(line,dtm)
}
# overwrite the file with the correction
writeLines(textLines, file)
}
Usage:
CorrectDocumentFile(file="fileToBeCorrected.txt", dtm=counts)
Consider the following comma-separated string of numbers:
s <- "1,2,3,4,8,9,14,15,16,19"
s
# [1] "1,2,3,4,8,9,14,15,16,19"
Is it possible to collapse runs of consecutive numbers to its corresponding ranges, e.g. the run 1,2,3,4 above would be collapsed to the range 1-4. The desired result looks like the following string:
s
# [1] "1-4,8,9,14-16,19"
I took some heavy inspiration from the answers in this question.
findIntRuns <- function(run){
rundiff <- c(1, diff(run))
difflist <- split(run, cumsum(rundiff!=1))
unlist(lapply(difflist, function(x){
if(length(x) %in% 1:2) as.character(x) else paste0(x[1], "-", x[length(x)])
}), use.names=FALSE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s2 <- as.numeric(unlist(strsplit(s, ",")))
paste0(findIntRuns(s2), collapse=",")
[1] "1-4,8,9,14-16,19"
EDIT: Multiple solutions: benchmarking time!
Unit: microseconds
expr min lq median uq max neval
spee() 277.708 295.517 301.5540 311.5150 1612.207 1000
seb() 294.611 313.025 321.1750 332.6450 1709.103 1000
marc() 672.835 707.549 722.0375 744.5255 2154.942 1000
#speendo's solution is the fastest at the moment, but none of these have been optimised yet.
I was too slow... but here's another solution.
It uses less R-specific functions so it could be ported to other languages (on the other hand maybe it's less elegant)
s <- "1,2,3,4,8,9,14,15,16,19"
collapseConsecutive <- function(s){
x <- as.numeric(unlist(strsplit(s, ",")))
x_0 <- x[1]
out <- toString(x[1])
hasDash <- FALSE
for(i in 2:length(x)) {
x_1 <- x[i]
x_2 <- x[i+1]
if((x_0 + 1) == x_1 && !is.na(x_2) && (x_1 + 1) == x_2) {
if(!hasDash) {
out <- c(out, "-")
hasDash <- TRUE
}
} else {
if(hasDash) {
hasDash <- FALSE
} else {
out <- c(out, ",")
}
out <- c(out, x_1)
hasDash <- FALSE
}
x_0 <- x_1
}
outString <- paste(out, collapse="")
outString
}
collapseConsecutive(s)
# [1] "1-4,8,9,14-16,19"
Another fairly compact option
in.seq <- function(x) {
# returns TRUE for elments within ascending sequences
(c(diff(x, 1), NA) == 1 & c(NA, diff(x,2), NA) == 2)
}
contractSeqs <- function(x) {
# returns string formatted with contracted sequences
x[in.seq(x)] <- ""
gsub(",{2,}", "-", paste(x, collapse=","), perl=TRUE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s1 <- as.numeric(unlist(strsplit(s, ","))) # as earlier answers
# assumes: numeric vector, length > 2, positive integers, ascending sequences
contractSeqs(s1)
# [1] "1-4,8,9,14-16,19"
I also wrote a bells & whistles version that can handle both numeric and string input including named objects, descending sequences and alternative punctuation, as well as performing error checking and reporting. If anyone is interested, I can add this to my answer.
Here's a function that should do what you want:
conseq <- function(s){
s <- as.numeric(unlist(strsplit(s, ",")))
dif <- s[seq(length(s))][-1] - s[seq(length(s)-1)]
new <- !c(0, dif == 1)
cs <- cumsum(new)
res <- vector(mode="list", max(cs))
for(i in seq(res)){
s.i <- s[which(cs == i)]
if(length(s.i) > 2){
res[[i]] <- paste(min(s.i), max(s.i), sep="-")
} else {
res[[i]] <- as.character(s.i)
}
}
paste(unlist(res), collapse=",")
}
Example
> s <- "1,2,3,4,8,9,14,15,16,19"
> conseq(s)
[1] "1-4,8,9,14-16,19"
I am wondering whether a proper framework for interval manipulation and comparison does exist in R.
After some search, I was only able to find the following:
- function findInterval in base Package. (but I hardly understand it)
- some answers here and there about union and intersection (notably: http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html)
Would you know of an initiative to implement a comprehensive set of tools to easily handles frequent tasks in interval manipulation, like inclusion/setdiff/union/intersection/etc. (eg see here for a list of functionalities)?
or would you have advice in developing such an approach?
below are some drafts on my side for doing so. it is surely awkward and still has some bugs but it might illustrate what I am looking for.
preliminary aspects about the options taken
- should deal seamlessly with intervals or intervals set
- intervals are represented as 2 columns data.frames (lower boundary, higher boundary), on one row
- intervals sets are represented as 2 columns with several rows
- a third column might be needed for identification of intervals sets
UNION
interval_union <- function(df){ # for data frame
df <- interval_clean(df)
if(is.empty(df)){
return(as.data.frame(NULL))
} else {
if(is.POSIXct(df[,1])) {
dated <- TRUE
df <- colwise(as.numeric)(df)
} else {
dated <- FALSE
}
M <- as.matrix(df)
o <- order(c(M[, 1], M[, 2]))
n <- cumsum( rep(c(1, -1), each=nrow(M))[o])
startPos <- c(TRUE, n[-1]==1 & n[-length(n)]==0)
endPos <- c(FALSE, n[-1]==0 & n[-length(n)]==1)
M <- M[o]
if(dated == TRUE) {
df2 <- colwise(mkDateTime)(as.data.frame(cbind(M[startPos], M[endPos])), from.s = TRUE)
} else {
df2 <- as.data.frame(cbind(M[startPos], M[endPos]))
}
colnames(df2) <- colnames(df)
# print(df2)
return(df2)
}
}
union_1_1 <- function(test, ref){
names(ref) <- names(test)
tmp <- interval_union(as.data.frame(rbind(test, ref)))
return(tmp)
}
union_1_n <- function(test, ref){
return(union_1_1(test, ref))
}
union_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, union_1_n, ref, .expand = FALSE)
return(testnn)
}
ref_interval_union <- function(df, ref){
tmp0 <- adply(df, 1, union_1_1, ref, .expand = FALSE) # set to FALSE to keep ID
return(tmp0)
}
INTERSECTION
interval_intersect <- function(df){
# adapted from : http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html
M <- as.matrix(df)
L <- max(M[, 1])
R <- min(M[, 2])
Inew <- if (L <= R) c(L, R) else c()
if (!is.empty(Inew)){
df2 <- t(as.data.frame(Inew))
colnames(df2) <- colnames(df)
rownames(df2) <- NULL
} else {
df2 <- NULL
}
return(as.data.frame(df2))
}
ref_interval_intersect <- function(df, ref){
tmpfun <- function(a, b){
names(b) <- names(a)
tmp <- interval_intersect(as.data.frame(rbind(a, b)))
return(tmp)
}
tmp0 <- adply(df, 1, tmpfun, ref, .expand = FALSE) # [,3:4]
#if(!is.empty(tmp0)) colnames(tmp0) <- colnames(df)
return(tmp0)
}
int_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
tmp0 <- c(max(te[1, 1], re[1, 1]), min(te[1, 2], re[1, 2]))
if(tmp0[1]>tmp0[2]) tmp0 <- NULL # inverse of a correct interval --> VOID
if(!is.empty(tmp0)){
tmp1 <- colwise(mkDateTime)(as.data.frame(t(as.data.frame(tmp0))))
colnames(tmp1) <- colnames(test)
} else {
tmp1 <- data.frame(NULL)
}
return(tmp1)
}
int_1_n <- function(test, ref){
test1 <- adply(.data = ref, 1, int_1_1, test = test, .expand = FALSE)
if(is.empty(test1)){
return(data.frame(NULL))
} else {
testn <- interval_union(test1[,2:3])
return(testn)
}
}
int_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, int_1_n, ref, .expand = FALSE)
# return(testnn[,2:3]) # return interval set without index (1st column)
return(testnn) # return interval set with index (1st column) --> usefull to go with merge to keep metadata going alon g with interval description
}
int_intersect <- function(df, ref){
mycols <- colnames(df)
df$X1 <- 1:nrow(df)
test <- df[, 1:2]
tmp <- int_n_n(test, ref)
intersection <- merge(tmp, df, by = "X1", suffixes = c("", "init"))
return(intersection[,mycols])
}
EXCLUSION
excl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
if(te[1] < re[1]){ # Lower Bound
if(te[2] > re[1]){ # overlap
x <- unlist(c(te[1], re[1]))
} else { # no overlap
x <- unlist(c(te[1], te[2]))
}
} else { # test > ref on lower bound side
x <- NULL
}
if(te[2] > re[2]){ # Upper Bound
if(te[1] < re[2]){ # overlap
y <- unlist(c(re[2], te[2]))
} else { # no overlap
y <- unlist(c(te[1], te[2]))
}
} else { # test < ref on upper bound side
y <- NULL
}
if(is.empty(x) & is.empty(y)){
tmp0 <- NULL
tmp1 <- tmp0
} else {
tmp0 <- as.data.frame(rbind(x, y))
colnames(tmp0) <- colnames(test)
tmp1 <- interval_union(tmp0)
}
return(tmp1)
}
excl_1_n <- function(test, ref){
testn0 <- adply(.data = ref, 1, excl_1_1, test = test, .expand=FALSE)
# boucle pour intersecter successivement les intervalles sets, pour gérer les intervalles disjoints (identifiés par X1, col1)
tmp <- range(testn0)
names(tmp) <- colnames(testn0)[2:3]
tmp <- as.data.frame(t(tmp))
for(i in unique(testn0[,1])){
tmp <- int_n_n(tmp, testn0[testn0[,1]==i, 2:3])
}
return(tmp)
}
INCLUSION
incl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
if(te[1] >= re[1] & te[2] <= re[2]){ return(TRUE) } else { return(FALSE) }
}
incl_1_n <- function(test, ref){
testn <- adply(.data = ref, 1, incl_1_1, test = test)
return(any(testn[,ncol(testn)]))
}
incl_n_n <- function(test, ref){
testnn <- aaply(.data = test, 1, incl_1_n, ref, .expand = FALSE)
names(testnn) <- NULL
return(testnn)
}
flat_incl_n_n <- function(test, ref){
ref <- interval_union(ref)
return(incl_n_n(test, ref))
}
# testing for a vector, instead of an interval set
incl_x_1 <- function(x, ref){
test <- (x>=ref[1,1] & x<ref[1,2])
return(test)
}
incl_x_n <- function(x, ref){
test <- any(x>=ref[,1] & x<ref[,2])
return(test)
}
I think you might be able to make good use of the many interval-related functions in the sets package.
Here's a small example illustrating the package's support for interval construction, intersection, set difference, union, and complementation, as well as its test for inclusion in an interval. These and many other related functions are documented on the help page for ?interval.
library(sets)
i1 <- interval(1,6)
i2 <- interval(5,10)
i3 <- interval(200,400)
i4 <- interval(202,402)
i5 <- interval_union(interval_intersection(i1,i2),
interval_symdiff(i3,i4))
i5
# [5, 6] U [200, 202) U (400, 402]
interval_complement(i5)
# [-Inf, 5) U (6, 200) U [202, 400] U (402, Inf]
interval_contains_element(i5, 5.5)
# [1] TRUE
interval_contains_element(i5, 201)
# [1] TRUE
If your intervals are currently encoded in a two-column data.frame, you could use something like mapply() to convert them to intervals of the type used by the sets package:
df <- data.frame(lBound = c(1,5,100), uBound = c(10, 6, 200))
Ints <- with(df, mapply("interval", l=lBound, r=uBound, SIMPLIFY=FALSE))
Ints
# [[1]]
# [1, 10]
# [[2]]
# [5, 6]
# [[3]]
# [100, 200]