I am trying to break my habit of writing for loops by using apply instead, but I've gotten stumped on this one. I have a for loop that collapses every two rows into one row for an object, obj.tmp (366 by 34343), but it is slow.
Here's a much shortened example:
df <- data.frame(X1 = letters[1:10], X2 = letters[11:20], stringsAsFactors = FALSE)
Thus:
> df
X1 X2
a k
b l
c m
d n
e o
f p
g q
h r
i s
j t
# Collapse every two consecutive rows of `df` into one row of `df2`.
# `df2` must exist before rows can be assigned into it, so preallocate it
# with one row per pair and the same column names as `df`.
n_pairs <- nrow(df) %/% 2
df2 <- data.frame(
  matrix(NA_character_, nrow = n_pairs, ncol = ncol(df),
         dimnames = list(NULL, names(df))),
  stringsAsFactors = FALSE
)
# seq_len() is safe when there are no pairs (1:0 would iterate over 1, 0)
for (i in seq_len(n_pairs)) {
  # Paste rows 2i-1 and 2i together, column by column
  df2[i, ] <- apply(df[(i * 2 - 1):(i * 2), ], 2, paste, collapse = "")
}
Output:
> df2
X1 X2
ab kl
cd mn
ef op
gh qr
ij st
Suggestions on a better method?
Based on your sample data, here is one possibility:
# Sample data
df <- data.frame(X1 = letters[1:10], X2 = letters[11:20], stringsAsFactors = FALSE);
do.call(rbind, lapply(split(df, gl(nrow(df) / 2, 2, nrow(df))), function(x) sapply(x, paste0, collapse = "")))
# X1 X2
#1 "ab" "kl"
#2 "cd" "mn"
#3 "ef" "op"
#4 "gh" "qr"
#5 "ij" "st"
Explanation: Split df every two rows and store in list, paste entries by column, and rbind into final object.
If you want to avoid rbinding the list element, you can also do:
t(sapply(split(df, gl(nrow(df) / 2, 2, nrow(df))), function(x) sapply(x, paste0, collapse = "")));
# X1 X2
#1 "ab" "kl"
#2 "cd" "mn"
#3 "ef" "op"
#4 "gh" "qr"
#5 "ij" "st"
We can use the aggregate function:
# Create a new df with an id that shows the rows to be combined:
# rows 1-2 get id 1, rows 3-4 get id 2, and so on.
df1 <- cbind(df, id = rep(seq_len(nrow(df) %/% 2), each = 2))
# Combine the rows that share an id, then drop the id column
aggregate(. ~ id, df1, paste0, collapse = "")[-1]
X1 X2
1 ab kl
2 cd mn
3 ef op
4 gh qr
5 ij st
You can do all this in one line:
aggregate(.~id,cbind(df,id=rep(1:(nrow(df)/2),each=2)),paste0,collapse="")[-1]
You can also try:
# Lay all values out two per row (so each row holds one pair to be joined),
# paste the two columns together, then reshape the result into two columns.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
pairs <- matrix(unlist(df), ncol = 2, byrow = TRUE)
matrix(do.call(paste0, data.frame(pairs)), ncol = 2)
[,1] [,2]
[1,] "ab" "kl"
[2,] "cd" "mn"
[3,] "ef" "op"
[4,] "gh" "qr"
[5,] "ij" "st"
Something like this? If it isn't, can you be more clear and post the code needed to replicate what you are doing? I hope this solves your problem.
# Collapse every two consecutive rows of a one-column data frame into one row.
df <- data.frame(X1 = letters[1:10], stringsAsFactors = FALSE)
# Rows 1-2 form pair 1, rows 3-4 form pair 2, and so on. A trailing unpaired
# row (odd row count) keeps its own group instead of being dropped.
pair_id <- (seq_len(nrow(df)) + 1L) %/% 2L
# Build the result in one step rather than growing `df2` through `<<-`
# side effects inside sapply().
df2 <- data.frame(
  X1 = vapply(split(df$X1, pair_id), paste, character(1),
              collapse = "", USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)
df2
Related
How can I get the names of the leaves of a nested list (containing a data frame)
p <- list(a=1,b=list(b1=2,b2=3),c=list(c1=list(c11='a',c12='x'),c.2=data.frame("t"=1)))
into a vector format:
[[1]]
[1] "a"
[[2]]
[1] "b" "b1"
[[3]]
[1] "b" "b2"
[[4]]
[1] "c" "c1" "c11"
[[5]]
[1] "c" "c1" "c12"
[[6]]
[1] "c" "c.2"
The problem is that my list contains names with a dot (e.g. "c.2"). By using unlist, one gets "c.c.2" and I (or possibly strsplit) can't tell if the point is a delimiter of unlist or part of the name. That is the difference to this question.
It should ignore data.frames. My approach so far is adapted from here, but struggles with the points created by unlist:
# Return the dot-joined name path of every leaf in a nested list.
#
# Arguments:
#   l         A (possibly nested) list.
#   maxDepth  Number of nesting levels to descend; anything reached at this
#             depth is treated as a leaf even if it is itself a list.
#
# Returns a character vector of paths as built by names(unlist(...)), e.g.
# "b.b1", or NULL when `l` is not a list. Data frames are treated as leaves
# and are not descended into.
#
# Limitation: unlist() joins nesting levels with ".", so a name that itself
# contains a dot (such as "c.2") cannot be told apart from a level separator
# in the result.
listNames <- function(l, maxDepth = 2) {
  # Replace every leaf by TRUE so that unlist() yields exactly one named
  # entry per leaf, whatever the leaf originally contained.
  listNames_rec <- function(l, n) {
    # All three conditions are scalar, so use short-circuit `||`
    if (!is.list(l) || is.data.frame(l) || n >= maxDepth) {
      TRUE
    } else {
      lapply(l, listNames_rec, n + 1)
    }
  }
  names(unlist(listNames_rec(l, 0)))
}
listNames(p, maxDepth = 3)
[1] "a" "b.b1" "b.b2" "c.c1.c11" "c.c1.c12" "c.c.2"
Like this?
# Recursively replace every literal "." in the names of a nested list by `s`.
#
# Arguments:
#   L  A (possibly nested) list. Non-lists and data frames are returned
#      unchanged, so column names of data frames are left alone.
#   s  Replacement string for ".".
#
# Returns a list with the same nesting as `L` and the rewritten names.
subnames <- function(L, s) {
  if (!is.list(L) || is.data.frame(L)) {
    return(L)
  }
  # Only rewrite names that exist: for an unnamed list gsub() returns a
  # zero-length vector, and assigning that would give the list NA names.
  if (!is.null(names(L))) {
    names(L) <- gsub(".", s, names(L), fixed = TRUE)
  }
  lapply(L, subnames, s)
}
res <- listNames(subnames(p, ":"), maxDepth = 3)
gsub(":", ".",
gsub(".", "$", res, fixed = TRUE),
fixed = TRUE
)
#[1] "a" "b$b1" "b$b2" "c$c1$c11" "c$c1$c12" "c$c.2"
Not a full answer but I imagine rrapply package could help you here?
One option could be to extract all names:
library(rrapply)
library(dplyr)
rrapply(p, how = "melt") %>%
select(-value)
# L1 L2 L3
# 1 a <NA> <NA>
# 2 b b1 <NA>
# 3 b b2 <NA>
# 4 c c1 c11
# 5 c c1 c12
# 6 c c.2 t
The problem here is that data.frame names are included above too so you could extract them separately:
#extract data frame name
rrapply(p, classes = "data.frame", how = "melt") %>%
select(-value)
# L1 L2
# 1 c c.2
Then you could play around with these two datasets and perhaps extract duplicates but keep dataframe names
rrapply(p, how = "melt") %>%
bind_rows(rrapply(p, classes = "data.frame", how = "melt"))
#then filter etc...
A way might be:
# Collect the name path of every leaf of a nested list as separate vectors.
#
# Arguments:
#   l  A (possibly nested) list.
#   n  Number of nesting levels still allowed; once it drops below 1 the
#      current element is treated as a leaf.
#   N  Path accumulated so far, innermost name first. Pass NULL at the
#      top-level call.
#
# Returns an unnamed list with one character vector per leaf, each giving
# the names from the outermost level down to the leaf. Data frames count as
# leaves. Because each level is kept as its own element, names containing
# "." stay intact.
listNames <- function(l, n, N) {
  # All three conditions are scalar, so use short-circuit `||`
  if (!is.list(l) || is.data.frame(l) || n < 1) {
    # N was built innermost-first; reverse it to read outermost-first
    list(rev(N))
  } else {
    # Prepend each child's name to the path accumulated so far
    child_paths <- lapply(names(l), c, N)
    # Flatten exactly one level so the result stays a flat list of paths
    unlist(
      Map(listNames, l, n = n - 1, N = child_paths),
      recursive = FALSE, use.names = FALSE
    )
  }
}
listNames(p, 3, NULL)
#[[1]]
#[1] "a"
#
#[[2]]
#[1] "b" "b1"
#
#[[3]]
#[1] "b" "b2"
#
#[[4]]
#[1] "c" "c1" "c11"
#
#[[5]]
#[1] "c" "c1" "c12"
#
#[[6]]
#[1] "c" "c.2"
Having the vector,
v1 <- LETTERS[1:4]
I would like to get a two-columns data.frame of the type:
A B
A C
A D
B A
B C
B D
D A
D B
D C
I have tried with combn, outer and expand.grid but I didn’t get what I wanted.
outer(v1, v1, paste, sep="")
[,1] [,2] [,3] [,4]
[1,] "AA" "AB" "AC" "AD"
[2,] "BA" "BB" "BC" "BD"
[3,] "CA" "CB" "CC" "CD"
[4,] "DA" "DB" "DC" "DD"
do.call(rbind, lapply(seq_along(v1), function(i) data.frame(x = v1[i], y = v1[-i])))
OR
do.call(rbind, lapply(seq_along(v1), function(i){
do.call(rbind, lapply(seq_along(v1)[-i], function(j){
c(v1[i], v1[j])
}))
}))
OR
d = expand.grid(replicate(2, v1, simplify = FALSE))
d[d$Var1 != d$Var2,]
If you want all possible combinations of AA - DD in two columns, you can use the tidyr crossing function.
This would be my approach:
library(tidyverse)
v1 <- LETTERS[1:4]
v2 <- v1
final_df <- crossing(v1, v2)
All solutions are perfectly fine. And with one extra line of code you can filter out the rows that contain A A, B B, C C, and D D.
Using the example by Matt,
final_df <- final_df %>%
filter(v1 != v2)
Thanks!
I'm trying to reorganize my dataframes by column order.
For example:
x <- data.frame("A" = c(1,1), "B" = c(2,2), "C" = c(3,3))
y <- data.frame("A" = c(2,2), "B" = c(3,3), "C" = c(4,4))
z <- data.frame("A" = c(3,3), "B" = c(4,4), "C" = c(5,5))
Say I have dataframes as above.
What I want to do is make new dataframes from the columns of the dataframes above. (Simply put, I want to put all the "A"s, "B"s and "C"s into 3 new dataframes.)
the below dataframes are my wanted results
a <- data.frame("A" = c(1,1), "A" = c(2,2), "A" = c(3,3))
b <- data.frame("B" = c(2,2), "B" = c(3,3), "B" = c(4,4))
c <- data.frame("C" = c(3,3), "C" = c(4,4), "C" = c(5,5))
We can do this with tidyverse
library(tidyverse)
list(x, y, z) %>%
transpose %>%
map(~ do.call(cbind, .x))
Or with base R
lapply(names(x), function(nm) cbind(x[, nm], y[, nm], z[, nm]))
Assuming you have equal number of columns in all the dataframes, one way is to use lapply over list of dataframes and subset them sequentially.
lst1 <- list(x, y, z)
lapply(seq_len(ncol(x)), function(i) cbind.data.frame(lapply(lst1, `[`, i)))
#[[1]]
# A A A
#1 1 2 3
#2 1 2 3
#[[2]]
# B B B
#1 2 3 4
#2 2 3 4
#[[3]]
# C C C
#1 3 4 5
#2 3 4 5
If your dataframes are not already sorted by names you might want to do that first.
lst1 <- lapply(list(x, y, z), function(i) i[order(names(i))])
We can also use purrr using the same logic
library(purrr)
map(seq_len(ncol(x)), ~cbind.data.frame(map(lst1, `[`, .)))
I have a data set with the structure shown below.
# example data set
a <- "a"
b <- "b"
d <- "d"
id1 <- c(a,a,a,a,b,b,d,d,a,a,d)
id2 <- c(b,d,d,d,a,a,a,a,b,b,d)
id3 <- c(b,d,d,a,a,a,a,d,b,d,d)
dat <- rbind(id1,id2,id3)
dat <- data.frame(dat)
I need to find across each row the first sequence with repeated elements "a" and identify the element following the sequence immediately.
# desired results
dat$s3 <- c("b","b","d")
dat
I was able to break the problem in 3 steps and solve the first one but as my programming skills are quite limited, I would appreciate any advice on how to approach steps 2 and 3. If you have an idea that solves the problem in another way that would be extremely helpful as well.
Here is what I have so far:
# Step 1: find the first occurrence of "a" in the first sequence
dat$s1 <- apply(dat, 1, function(x) match(a,x))
# Step 2: find the last occurrence in the first sequence
# Step 3: find the element following the last occurrence in the first sequence
Thanks in advance!
I'd use filter:
# Return the first element of `x` that directly follows two consecutive "a"
# values (NA if there is none).
fun <- function(x) {
  values <- as.character(x)
  is_a <- values == "a"
  # Trailing window of width 3: how many "a"s among the current element and
  # the two before it (NA for the first two positions)
  window_sum <- stats::filter(is_a, rep(1, 3), sides = 1)
  # Current element is not "a" while both of its predecessors are
  follows_run <- !is_a & window_sum == 2L
  candidates <- values[follows_run]
  # Drop the NAs produced by the incomplete windows and keep the first hit
  candidates[!is.na(candidates)][1]
}
apply(dat, 1, fun)
#id1 id2 id3
#"b" "b" "d"
Try this (assuming that you have repeated a at each row):
library(stringr)
dat$s3 <-apply(dat, 1, function(x) str_match(paste(x, collapse=''),'aa([^a])')[,2])
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 s3
id1 a a a a b b d d a a d b
id2 b d d d a a a a b b d b
id3 b d d a a a a d b d d d
Well, here is one attempt which is a bit messy,
l1 <- lapply(apply(dat, 1, function(i) as.integer(which(i == a))),
function(j) j[cumsum(c(1, diff(j) != 1)) == 1])
ind <- unname(sapply(l1, function(i) tail(i, 1) + 1))
dat$s3 <- diag(as.matrix(dat[ind]))
dat$s3
#[1] "b" "b" "d"
or wrap it in a function,
# For each row, return the element that immediately follows the first run of
# consecutive `target` values.
#
# Arguments:
#   df      A data frame (or matrix) whose rows are the sequences to scan.
#   target  The value whose first run is looked for. Defaults to "a", so the
#           function no longer depends on a global variable `a`.
#
# Returns an unnamed vector with one element per row. A row yields NA when it
# contains no `target` at all or when the first run extends to the last
# column, so nothing follows it.
fun1 <- function(df, target = "a") {
  m <- as.matrix(df)
  # Work row by row with vapply(): apply() would simplify its result to a
  # matrix whenever all rows have the same number of matches, which would
  # break the per-row processing.
  next_col <- vapply(seq_len(nrow(m)), function(r) {
    hits <- which(m[r, ] == target)
    if (length(hits) == 0L) {
      return(NA_integer_)
    }
    # Label runs of consecutive positions; run 1 is the first sequence
    run_id <- cumsum(c(1L, diff(hits) != 1L))
    pos <- max(hits[run_id == 1L]) + 1L
    if (pos > ncol(m)) NA_integer_ else pos
  }, integer(1))
  # Pick element (row i, next_col[i]) directly via matrix indexing instead of
  # building a full nrow x nrow matrix and taking its diagonal.
  m[cbind(seq_len(nrow(m)), next_col)]
}
fun1(dat)
#[1] "b" "b" "d"
I have a dataframe similar to the below:
v1 v2 v3 v4 v5
a a1 a2 a3 a4
b b1 b2
c c1 c2 c3
I want to convert this to lists as below.
# Desired result: one list per row, holding the first column as `base` and
# the remaining non-empty columns as `alts`.
lista <- list(base = "a", alts = c("a1", "a2", "a3", "a4"))
listb <- list(base = "b", alts = c("b1", "b2"))
listc <- list(base = "c", alts = c("c1", "c2", "c3"))
I have looked at solutions posted on here and tried some suggestions, but nothing works.
Any help will be great! I am still new to R - Cheers
If df contains your data frame, you could try like this:
l <- lapply(as.data.frame(t(df), stringsAsFactors = FALSE),
function(x) {
x <- unname(x)
list(base = x[1], alts = x[-1])
})
names(l) <- paste0("list", df[, 1])
list2env(l, envir = .GlobalEnv)
lista
# $base
# [1] "a"
#
# $alts
# [1] "a1" "a2" "a3" "a4"