Move several chunks of columns dynamically to another position - r

My data is:
df <- data.frame(a = 1:2,
                 x = 1:2,
                 b = 1:2,
                 y = 3:4,
                 x_2 = 1:2,
                 y_2 = 3:4,
                 c = 1:2,
                 x_3 = 5:6,
                 y_3 = 1:2)
I now want to put the x vars and the y vars together, so that the order of columns would be:
a, x, x_2, x_3, b, y, y_2, y_3, c
I thought I could use tidyverse's relocate() function in combination with lapply(), map(), or reduce(), but it doesn't work out.
E.g. if I do:
move_names <- c("x", "y")
library(tidyverse)
moved_data <- lapply(as.list(move_names), function(x) {
  df <- df |>
    relocate(!!!syms(paste0(x, "_", 2:3)),
             .after = all_of(x))
})
It does the moving for x and y separately, but it returns a separate list element for each, whereas I just want my original df with the columns relocated.
Update:
I should have been clear that my real data frame has ~500 columns where the to-be-moved columns are all over the place. So providing the full vector of desired column name order won't be feasible.
What I instead have: I have the names of my original columns, i.e. x and y, and I have the names of the to-be-moved columns, i.e. x_2, x_3, y_2, y_3.
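For reference, a minimal sketch of the reduce() route mentioned above, assuming the to-be-moved columns are exactly the ones sharing a base name plus an underscore prefix: reduce() threads the data frame through one relocate() per base name, so no list is created.
library(tidyverse)
move_names <- c("x", "y")
moved_data <- reduce(move_names, function(d, nm) {
  # move every "<nm>_..." column directly after its base column
  d |> relocate(starts_with(paste0(nm, "_")), .after = all_of(nm))
}, .init = df)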

In base R:
df[match(c('a', 'x', 'x_2', 'x_3', 'b', 'y', 'y_2', 'y_3', 'c'), names(df))]
#> a x x_2 x_3 b y y_2 y_3 c
#> 1 1 1 1 5 1 3 3 1 1
#> 2 2 2 2 6 2 4 4 2 2

Not sure if it's what you want.
Vector with order of column names
Let's say you have a vector relocate_name that contains the order of your columns:
library(tidyverse)
relocate_name <- c("a", "x", "x_2", "x_3", "b", "y", "y_2", "y_3", "c")
df %>% relocate(any_of(relocate_name))
Vector with prefix of column names
Or if you only have the prefix of the order, let's call it relocate_name2:
relocate_name2 <- c("a", "x", "b", "y", "c")
df %>% relocate(starts_with(relocate_name2))
Group x and y together
Or if you only want to "group" x and y together:
df %>%
  relocate(starts_with("x"), .after = "x") %>%
  relocate(starts_with("y"), .after = "y")
Output
All of the above produce the same output.
a x x_2 x_3 b y y_2 y_3 c
1 1 1 1 5 1 3 3 1 1
2 2 2 2 6 2 4 4 2 2

library(rlist)
# split based on the colname part before _
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# remove names with an underscore
# this is the new order, it should match the names of list L !!
neworder <- names(df)[!grepl("_", names(df))]
# [1] "a" "x" "b" "y" "c"
# cbind list elements together
ans <- rlist::list.cbind(L[neworder])
# a x.x x.x_2 x.x_3 b y.y y.y_2 y.y_3 c
# 1 1 1 1 5 1 3 3 1 1
# 2 2 2 2 6 2 4 4 2 2
# create tidy names again
names(ans) <- gsub(".*\\.(.*)", "\\1", names(ans))
# a x x_2 x_3 b y y_2 y_3 c
# 1 1 1 1 5 1 3 3 1 1
# 2 2 2 2 6 2 4 4 2 2
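As a small follow-up, the same cbind step should work without rlist, since list.cbind is essentially a wrapper around do.call(cbind, ...) (an assumption worth double-checking on your own data):
ans2 <- do.call(cbind, L[neworder])
# clean up the "<group>.<column>" names the same way as above
names(ans2) <- gsub(".*\\.(.*)", "\\1", names(ans2))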

Ok, this is probably the worst workaround ever and I don't really understand what exactly I'm doing (especially with the <<-), but it does the trick.
My general idea, after understanding the problem a bit better with the help of you guys here, was to "loop" through both of my x and y names, remove the new _2 and _3 columns from the vector of column names, and re-append them after their "base" x and y columns.
search_names <- c("x", "y")
df_names <- names(df)
new_names <- lapply(search_names, function(x) {
  start <- which(df_names == x)
  without_new_names <- setdiff(df_names, paste0(x, "_", 2:3))
  df_names <<- append(without_new_names, values = paste0(x, "_", 2:3), after = start)
})[[length(search_names)]]
df |>
relocate(any_of(new_names))
a x x_2 x_3 b y y_2 y_3 c
1 1 1 1 5 1 3 3 1 1
2 2 2 2 6 2 4 4 2 2
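A sketch of the same loop without <<-, assuming the _2/_3 suffix convention from the question: base R's Reduce() threads the running name vector through each step, so no assignment into the enclosing scope is needed.
search_names <- c("x", "y")
new_names <- Reduce(function(nms, x) {
  moved <- paste0(x, "_", 2:3)
  kept  <- setdiff(nms, moved)   # drop the to-be-moved names
  append(kept, values = moved, after = which(kept == x))  # re-insert after the base column
}, search_names, init = names(df))
df |>
  relocate(any_of(new_names))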

Related

Recursively sum data frames for matching rows

I would like to combine a set of data frames into a single data frame by summing columns that have matching variables (instead of appending columns).
For example, given
df1 <- data.frame(A = c(0,0,1,1,1,2,2), B = c(1,2,1,2,3,1,5), x = c(2,3,1,5,3,7,0))
df2 <- data.frame(A = c(0,1,1,2,2,2), B = c(1,1,3,2,4,5), x = c(4,8,4,1,0,3))
df3 <- data.frame(A = c(0,1,2), B = c(5,4,2), x = c(5,3,1))
I want to match by "A" and "B" and sum the values of "x". For this example, I can get the desired result as follows:
library(plyr)
library(dplyr)
# rename columns so that join_all preserves them all:
colnames(df1)[3] <- "x1"
colnames(df2)[3] <- "x2"
colnames(df3)[3] <- "x3"
# join the data frames by matching "A" and "B" values:
res <- join_all(list(df1, df2, df3), by = c("A", "B"), type = "full")
# get the sums and drop superfluous columns:
arrange(res, A, B) %>%
  rowwise() %>%
  mutate(x = sum(x1, x2, x3, na.rm = TRUE)) %>%
  select(A, B, x)
Result:
A B x
<dbl> <dbl> <dbl>
1 0 1 6
2 0 2 3
3 0 5 5
4 1 1 9
5 1 2 5
6 1 3 7
7 1 4 3
8 2 1 7
9 2 2 2
10 2 4 0
11 2 5 3
A more general solution is
library(dplyr)
# function to get the desired result for two data frames:
my_merge <- function(df1, df2) {
  m1 <- merge(df1, df2, by = c("A", "B"), all = TRUE)
  m1 <- rowwise(m1) %>%
    mutate(x = sum(x.x, x.y, na.rm = TRUE)) %>%
    select(A, B, x)
  return(m1)
}
l1 <- list(df2, df3) # omit the first data frame
res <- df1 # initial value of the result
for(df in l1) res <- my_merge(res, df) # call the function repeatedly
Is there a more efficient option for combining a large set of data frames? Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
An easier option is to bind the rows of the datasets, then group by the columns of interest and get the summarised output as the sum of 'x':
library(tidyverse)
bind_rows(df1, df2, df3) %>%
  group_by(A, B) %>%
  summarise(x = sum(x))
# A tibble: 11 x 3
# Groups: A [?]
# A B x
# <dbl> <dbl> <dbl>
# 1 0 1 6
# 2 0 2 3
# 3 0 5 5
# 4 1 1 9
# 5 1 2 5
# 6 1 3 7
# 7 1 4 3
# 8 2 1 7
# 9 2 2 2
#10 2 4 0
#11 2 5 3
If there are many objects in the global environment with the pattern "df" followed by some digits
mget(ls(pattern = "^df\\d+")) %>%
  bind_rows %>%
  group_by(A, B) %>%
  summarise(x = sum(x))
As the OP mentioned memory constraints: if we do the join first and then use rowSums or + with reduce, it is more efficient:
mget(ls(pattern = "^df\\d+")) %>%
  reduce(full_join, by = c("A", "B")) %>%
  transmute(A, B, x = rowSums(.[3:5], na.rm = TRUE)) %>%
  arrange(A, B)
# A B x
#1 0 1 6
#2 0 2 3
#3 0 5 5
#4 1 1 9
#5 1 2 5
#6 1 3 7
#7 1 4 3
#8 2 1 7
#9 2 2 2
#10 2 4 0
#11 2 5 3
This could also be done with data.table
library(data.table)
rbindlist(mget(ls(pattern= "^df\\d+")))[, .(x = sum(x)), by = .(A, B)]
Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
If you're memory constrained and willing to sacrifice speed (vs #akrun's data.table approach), use one table at a time in a loop:
library(data.table)
tabs = c("df1", "df2", "df3")
# enumerate all combos for the results table
# initializing sum to 0
res = CJ(A = 0:2, B = 1:5, x = 0)
# loop over tabs, adding on
for (i in seq_along(tabs)) {
  tab = get(tabs[[i]])
  res[tab, on = .(A, B), x := x + i.x][]
  rm(tab)
}
If you need to read tables from disk, change tabs to file names and get to fread (or whatever read function applies).
I am skeptical that you can fit all the tables in memory yet be unable to also fit an rbind-ed copy of them.
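For concreteness, a hedged sketch of the read-from-disk variant mentioned above, assuming hypothetical files df1.csv, df2.csv, df3.csv that each hold columns A, B, x:
library(data.table)
files <- c("df1.csv", "df2.csv", "df3.csv")  # hypothetical file names
res <- CJ(A = 0:2, B = 1:5, x = 0)
for (f in files) {
  tab <- fread(f)                       # read one table at a time
  res[tab, on = .(A, B), x := x + i.x]  # add its x onto the running total
  rm(tab)
}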
Similarly (thanks to #akrun's comment), use his approach pairwise:
res = data.table(get(tabs[[1]]))[0L]  # zero-row table with the right columns
for (i in seq_along(tabs)) {
  tab = get(tabs[[i]])
  res = rbind(res, tab)[, .(x = sum(x)), by = .(A, B)]
  rm(tab)
}

Using dplyr mutate_at when a function takes multiple arguments which are different columns

I have a data.frame with a large number of columns whose names follow a pattern. Such as:
df <- data.frame(
  x_1 = c(1, NA, 3),
  x_2 = c(1, 2, 4),
  y_1 = c(NA, 2, 1),
  y_2 = c(5, 6, 7)
)
I would like to apply mutate_at to perform the same operation on each pair of columns. As in:
df %>%
  mutate(
    x = ifelse(is.na(x_1), x_2, x_1),
    y = ifelse(is.na(y_1), y_2, y_1)
  )
Is there a way I can do that with mutate_at/mutate_each?
This:
df %>%
  mutate_each(vars(x_1, y_1), funs(ifelse(is.na(.), vars(x_2, y_2), .)))
and various variations I've tried all fail.
The question is similar to Using functions of multiple columns in a dplyr mutate_at call, but different in that the second argument to the function call is not a single column, but a different column for each column in vars.
Thanks in advance.
I don't know if you can get it that way, but here's a different perspective on the problem. If you find yourself with really wide data (e.g., tons of columns with similar names) and you want to do something with them, it might help to tidy the data into long format (what Stata calls "long") with tidyr::gather (see the docs at http://tidyr.tidyverse.org/).
> df %>% gather()
key value
1 x_1 1
2 x_1 NA
3 x_1 3
4 x_2 1
5 x_2 2
6 x_2 4
7 y_1 NA
8 y_1 2
9 y_1 1
10 y_2 5
11 y_2 6
12 y_2 7
After converting the data to this format, it's easier to combine and rearrange values using group_by instead of trying to mutate_at things. E.g., you can get the variable prefixes with df %>% gather() %>% mutate(var = substr(key, 1, 1)) and manipulate the xs and ys differently using group_by(var).
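To make that concrete, a minimal end-to-end sketch of the gather route, assuming exactly two columns per prefix and that the _1 value should win whenever it isn't NA (note it returns only the combined x and y columns):
library(dplyr)
library(tidyr)
df %>%
  mutate(row = row_number()) %>%                      # remember the original row
  gather(key, value, -row) %>%
  separate(key, into = c("var", "num"), sep = "_") %>%
  group_by(row, var) %>%
  summarise(value = first(na.omit(value))) %>%        # _1 if present, else _2
  spread(var, value)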
Old question, but I agree with Jesse that you need to tidy your data a bit. gather would be the way to go, but it somehow lacks the ability of stats::reshape to specify groups of columns to gather. So here's a solution with reshape:
df %>%
  reshape(varying = list(c("x_1", "y_1"), c("x_2", "y_2")),
          times = c("x", "y"),
          direction = "long") %>%
  mutate(x = ifelse(is.na(x_1), x_2, x_1)) %>%
  reshape(idvar = "id",
          timevar = "time",
          direction = "wide") %>%
  rename_all(funs(gsub("[a-zA-Z]+(_*)([0-9]*)\\.([a-zA-Z]+)", "\\3\\1\\2", .)))
# id x_1 x_2 x y_1 y_2 y
# 1 1 1 1 1 NA 5 5
# 2 2 NA 2 2 2 6 2
# 3 3 3 4 3 1 7 1
In order to do that with any number of column pairs, you could do something like:
df2 <- setNames(cbind(df, df), c(t(outer(letters[23:26], 1:2, paste, sep = "_"))))
v <- split(names(df2), purrr::map_chr(names(df2), ~ gsub(".*_(.*)", "\\1", .)))
n <- unique(purrr::map_chr(names(df2), ~ gsub("_[0-9]+", "", .) ))
df2 %>%
  reshape(varying = v,
          times = n,
          direction = "long") %>%
  mutate(x = ifelse(is.na(!!sym(v[[1]][1])), !!sym(v[[2]][1]), !!sym(v[[1]][1]))) %>%
  reshape(idvar = "id",
          timevar = "time",
          direction = "wide") %>%
  rename_all(funs(gsub("[a-zA-Z]+(_*)([0-9]*)\\.([a-zA-Z]+)", "\\3\\1\\2", .)))
# id w_1 w_2 w x_1 x_2 x y_1 y_2 y z_1 z_2 z
# 1 1 1 1 1 NA 5 5 1 1 1 NA 5 5
# 2 2 NA 2 2 2 6 2 NA 2 2 2 6 2
# 3 3 3 4 3 1 7 1 3 4 3 1 7 1
This assumes that the columns to be compared are next to each other, that all columns with possible NA values are suffixed by _1, and that the replacement-value columns are suffixed by _2.
When I asked this question, the answer was "you can't!" That's no longer the answer, since tidyr now supports pivot_wider and pivot_longer.
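For completeness, a sketch of that newer approach, assuming the "<var>_<1|2>" naming pattern from the question; the .value sentinel pairs the suffix columns up, and the *_filled columns hold the coalesced result:
library(dplyr)
library(tidyr)
df %>%
  mutate(row = row_number()) %>%
  pivot_longer(-row, names_to = c("var", ".value"), names_sep = "_") %>%
  mutate(filled = coalesce(`1`, `2`)) %>%             # take _1 unless it is NA
  pivot_wider(names_from = var,
              values_from = c(`1`, `2`, filled),
              names_glue = "{var}_{.value}")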

Filter by group max combination of values in a given order

Within groups, I would like to filter for the maximal combination of values based on a given order of columns.
A vector of column names should specify the order in which to look for maximal values.
For example :
x <- data.frame(id = c("a", "a", "b", "b"),
                x = c(1, 1, 1, 2),
                y = c(1, 2, 2, 1),
                z = c(1, 1, 2, 1))
> x
id x y z
1 a 1 1 1
2 a 1 2 1
3 b 1 2 2
4 b 2 1 1
In this example I would like to group by id and set the 'priority' to x, y, z, which means that I want to look at the maximal x value, then its associated maximal y value, and then the maximal z value for that maximal x, y couple.
I'm not aware of a vectorized function for this, so I recursively group to find the maximum of each following column. The expected output is:
> x
id x y z
1 a 1 2 1
2 b 2 1 1
I can do it in base R with a loop:
group <- "id"
cols <- c("x", "y", "z")
for (i in seq_along(cols)) {
  tmp <- aggregate(setNames(list(x[[cols[i]]]), cols[i]), by = as.list(x[group]), FUN = max)
  x <- merge(x, tmp, by = c(group, cols[i]))
  group <- c(group, cols[i])
}
x <- x[!duplicated(x), ]
> x
id x y z
1 a 1 2 1
2 b 2 1 1
I would like to apply this to a larger amount of data, so this code will struggle at some point. Do you have any ideas to improve it?
Thank you for any help !
We can try with dplyr
library(dplyr)
x %>%
  group_by(id) %>%
  arrange(desc(y), desc(z)) %>%
  slice(which.max(x))
# id x y z
# <fctr> <dbl> <dbl> <dbl>
#1 a 1 2 1
#2 b 2 1 1
Here is a base R solution using the split-apply-combine methodology.
dfNew <- do.call(rbind, lapply(split(x, x$id),
                               function(x) x[with(x, order(x, y, z, decreasing = TRUE))[1], ]))
which returns
dfNew
id x y z
a a 1 2 1
b b 2 1 1
split splits the data frame by id and returns a list. This list is fed to lapply, which applies an anonymous function returning the row with the maximum values according to order. Finally, the single-row data frames are appended with rbind and do.call.
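As a follow-up sketch: since the priority is just a lexicographic sort, the recursion can be avoided entirely while keeping the column vector dynamic (assuming dplyr >= 1.0 for across()):
library(dplyr)
cols <- c("x", "y", "z")   # priority order
x %>%
  group_by(id) %>%
  arrange(across(all_of(cols), desc), .by_group = TRUE) %>%  # sort by x, then y, then z
  slice(1) %>%   # first row per group is the maximal combination
  ungroup()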

Add (not merge!) two data frames with unequal rows and columns

I want to efficiently sum the entries of two data frames, though the data frames are not guaranteed to have the same dimensions or column names. Merge isn't really what I'm after here. Instead I want to create an output object with all of the row and column names that belong to either of the added data frames. In each position of that output, I want to use the following logic for the computed value:
If a row/column pairing belongs to both input data frames I want the output to include their sum
If a row/column pairing belongs to just one input data frame I want to include that value in the output
If a row/column pairing does not belong to either input data frame, I want 0 in that position in the output.
As an example, consider the following input data frames:
df1 = data.frame(x = c(1,2,3), y = c(4,5,6))
rownames(df1) = c("a", "b", "c")
df2 = data.frame(x = c(7,8), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
> df1
x y
a 1 4
b 2 5
c 3 6
> df2
x z w
a 7 9 2
d 8 10 3
I want the final result to be
> df2
x y z w
a 8 4 9 2
b 2 5 0 0
c 3 6 0 0
d 8 0 10 3
What I've done so far -
bind_rows / bind_cols in dplyr can throw the following:
"Error: incompatible number of rows (3, expecting 2)"
I have duplicated column names, so merge isn't working for my purposes either; it returns an empty df for some reason.
Seems like you could merge on the rownames, then take care of the sums and conversion of NA to zero with some additional munging:
library(dplyr)
df.new = df1 %>% add_rownames %>%
  full_join(df2 %>% add_rownames, by = "rowname") %>%
  mutate_each(funs(replace(., which(is.na(.)), 0))) %>%
  mutate(x = x.x + x.y) %>%
  select(rowname, x, y, z, w)
Or, with #DavidArenburg's much more elegant and extensible solution:
df.new = df1 %>% add_rownames %>%
  full_join(df2 %>% add_rownames) %>%
  group_by(rowname) %>%
  summarise_each(funs(sum(., na.rm = TRUE)))
df.new
rowname x y z w
1 a 8 4 9 2
2 b 2 5 0 0
3 c 3 6 0 0
4 d 8 0 10 3
This seems like a simple merge on the common column names (plus row names) followed by a simple aggregation; this is how I would tackle it:
library(data.table)
merge(setDT(df1, keep.rownames = TRUE),       # convert to data.table + keep rownames
      setDT(df2, keep.rownames = TRUE),       # convert to data.table + keep rownames
      by = intersect(names(df1), names(df2)), # merge on common column names
      all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn] # sum all columns by group
# rn x y z w
# 1: a 8 4 9 2
# 2: b 2 5 0 0
# 3: c 3 6 0 0
# 4: d 8 0 10 3
And a pretty straightforward base R solution:
df1$rn <- row.names(df1)
df2$rn <- row.names(df2)
res <- merge(df1, df2, all = TRUE)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
First, I would grab the names of all the rows and columns of the new entity:
(all.rows <- unique(c(row.names(df1), row.names(df2))))
# [1] "a" "b" "c" "d"
(all.cols <- unique(c(names(df1), names(df2))))
# [1] "x" "y" "z" "w"
Then I would construct an output matrix with those rows and column names (with matrix data initialized to all 0s), adding df1 and df2 to the relevant parts of that matrix.
out <- matrix(0, nrow=length(all.rows), ncol=length(all.cols))
rownames(out) <- all.rows
colnames(out) <- all.cols
out[row.names(df1),names(df1)] <- unlist(df1)
out[row.names(df2),names(df2)] <- out[row.names(df2),names(df2)] + unlist(df2)
out
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
Using xtabs on melted / stacked data frames:
out <- rbind(cbind(rn=rownames(df1),stack(df1)), cbind(rn=rownames(df2),stack(df2)))
as.data.frame.matrix(xtabs(values ~ rn + ind, data=out))
# x y w z
#a 8 4 2 9
#b 2 5 0 0
#c 3 6 0 0
#d 8 0 3 10
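One caveat: xtabs returns the columns in its own level order (x y w z above), so here is a small follow-up sketch to restore the intended order, assuming df1 and df2 still have their original column names:
res <- as.data.frame.matrix(xtabs(values ~ rn + ind, data = out))
res[, union(names(df1), names(df2))]   # x y z w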
I'm not convinced the accepted (or alternative merge) method is the best. It will give incorrect results if you have common rows: they'll get joined, not summed.
This can be shown trivially by changing df2 to:
df2 = data.frame(x = c(1,2), y = c(4,5), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
Expected results:
rn x y z w
1: a 2 8 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
Actual results:
merge(setDT(df1, keep.rownames = TRUE),
      setDT(df2, keep.rownames = TRUE),
      by = intersect(names(df1), names(df2)),
      all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn]
rn x y z w
1: a 1 4 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
You would need to combine an outer join with an inner join (or left/right joins, merge with all = TRUE / all = FALSE). Or alternatively use plyr's rbind.fill:
Base R (plus plyr) solution
library(plyr)
res <- rbind.fill(df1, df2)  # df1/df2 already carry the rn column from the setDT calls above
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
data.table solution
as.data.table(rbind.fill(
  setDT(df1, keep.rownames = TRUE),
  setDT(df2, keep.rownames = TRUE)
))[, lapply(.SD, sum, na.rm = TRUE), by = rn]
I prefer the rbind.fill method as you can "merge" > 2 data frames using the same syntax.
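A short sketch of the >2-frame case, assuming each frame first carries its rownames in an rn column (df3 here is a hypothetical third frame):
library(plyr)
dfs <- lapply(list(df1, df2, df3), function(d) {
  d$rn <- row.names(d)   # keep rownames as a regular column
  d
})
res <- do.call(rbind.fill, dfs)
rowsum(res[setdiff(names(res), "rn")], res$rn, na.rm = TRUE)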

Bind data frames on longer identifiers R

I've got two data frames that share unique identifiers, but the number of observations per identifier differs between them. I would like to create a data frame that, for each common identifier, takes the observations from whichever frame has more of them. For example:
f1 <- data.frame(x = c("a", "a", "b", "c", "c", "c"), y = c(1,1,2,3,3,3))
f2 <- data.frame(x = c("a","b", "b", "c", "c"), y = c(4,5,5,6,6))
I would like this to generate a merge based on the longer x such that it produces:
x y
a 1
a 1
b 5
b 5
c 3
c 3
c 3
Any and all thoughts would be great.
Here's a solution using split
dd <- rbind(cbind(f1, s = "f1"), cbind(f2, s = "f2"))
keep <- unsplit(lapply(split(dd$s, dd$x), FUN = function(x) {
  y <- table(x)
  x == names(y[which.max(y)])
}), dd$x)
dd <- dd[keep, ]
Normally I'd prefer to use the ave function here, but because I'm changing data types from a factor to a logical it wasn't as appropriate, so I basically copied the idea that ave uses and used split.
dplyr solution
library(dplyr)
First we combine the data:
with rbind() and introduce a new variable called ref to know where each observation came from:
both <- rbind( f1, f2 )
both$ref <- rep( c( "f1", "f2" ) , c( nrow(f1), nrow(f2) ) )
then count the observations:
make another new variable that contains how many observations for each ref and x combination:
both_with_counts <- both %>%
  group_by(ref, x) %>%
  mutate(counts = n())
then filter for the largest count:
both_with_counts %>% group_by(x) %>% filter(counts == max(counts))
note: you could also select only the x and y cols with select(x,y)...
this gives:
## Source: local data frame [7 x 4]
## Groups: x
##
## x y ref counts
## 1 a 1 f1 2
## 2 a 1 f1 2
## 3 c 3 f1 3
## 4 c 3 f1 3
## 5 c 3 f1 3
## 6 b 5 f2 2
## 7 b 5 f2 2
Altogether now...
what_I_want <-
  rbind(cbind(f1, ref = "f1"), cbind(f2, ref = "f2")) %>%
  group_by(ref, x) %>%
  mutate(counts = n()) %>%
  group_by(x) %>%
  filter(counts == max(counts)) %>%
  select(x, y)
and thus:
> what_I_want
# Source: local data frame [7 x 2]
# Groups: x
#
# x y
# 1 a 1
# 2 a 1
# 3 c 3
# 4 c 3
# 5 c 3
# 6 b 5
# 7 b 5
Not an elegant answer, but it still gives the desired result. Hope this helps.
f1table <- data.frame(table(f1$x))
colnames(f1table) <- c("x","freq")
f1new <- merge(f1,f1table)
f2table <- data.frame(table(f2$x))
colnames(f2table) <- c("x","freq")
f2new <- merge(f2,f2table)
table <- rbind(f1table, f2table)
table <- table[with(table, order(x,-freq)), ]
table <- table[!duplicated(table$x), ]
data <-rbind(f1new, f2new)
merge(data, table, by=c("x","freq"))[,c(1,3)]
x y
1 a 1
2 a 1
3 b 5
4 b 5
5 c 3
6 c 3
7 c 3
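And a hedged, more compact variant of the same counting idea using dplyr's add_count(), assuming a tie between sources may keep rows from both:
library(dplyr)
bind_rows(f1 = f1, f2 = f2, .id = "ref") %>%
  add_count(ref, x) %>%      # observations per source/identifier pair
  group_by(x) %>%
  filter(n == max(n)) %>%    # keep the source with more observations
  ungroup() %>%
  select(x, y)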
