I have a data.table in R that looks like this:
DT = data.table(a = c(1,2,3,4,5), a_mean = c(1,1,2,2,2), b = c(6,7,8,9,10), b_mean = c(3,2,1,1,2))
I want to create two more columns a_final and b_final defined as a_final = (a - a_mean) and b_final = (b - b_mean). In my real life use case, there can be a large number of such column pairs and I want a scalable solution in the spirit of R's data tables.
I tried something along the lines of
DT[,paste0(c('a','b'),'_final') := lapply(.SD, function(x) ((x-get(paste0(colnames(.SD),'_mean'))))), .SDcols = c('a','b')]
but this doesn't quite work. Any idea how I can access the name of the column being processed within the lapply statement?
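One direct way to get at the name of the column being processed is to lapply over the vector of names rather than over .SD, so the name itself is in scope; a minimal sketch of that idea (using the DT above):
cols <- c('a', 'b')
# loop over the names; each name can then build its '_mean' counterpart
DT[, paste0(cols, '_final') := lapply(cols, function(nm) DT[[nm]] - DT[[paste0(nm, '_mean')]])]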
We can create a character vector of column names, subset them from the original data.table, get the corresponding "mean" columns, subtract, and add the results as new columns.
library(data.table)
cols <- unique(sub('_.*', '', names(DT))) # Thanks to @Sotos
#OR just
#cols <- c('a', 'b')
DT[,paste0(cols, '_final')] <- DT[,cols, with = FALSE] -
DT[,paste0(cols, "_mean"), with = FALSE]
DT
# a a_mean b b_mean a_final b_final
#1: 1 1 6 3 0 3
#2: 2 1 7 2 1 5
#3: 3 2 8 1 1 7
#4: 4 2 9 1 2 8
#5: 5 2 10 2 3 8
Another option is using mget with Map:
cols <- c('a', 'b')
DT[, paste0(cols,'_final') := Map(`-`, mget(cols), mget(paste0(cols,"_mean")))]
Relying on the .SD construct you could do something along the lines of:
cols <- c('a', 'b')
DT[, paste0(cols, "_final") :=
DT[, .SD, .SDcols = cols] -
DT[, .SD, .SDcols = paste0(cols, "_mean")]]
I've got a data.table that I'd like to dcast based on three columns (V1, V2, V3). There are, however, some duplicates in V3, and I need an aggregate function that looks at a fourth column V4 and picks the value of V3 corresponding to the maximum of V4. I'd like to do this without having to aggregate DT separately prior to dcasting. Can this aggregation be done in the aggregate function of dcast, or do I need to aggregate the table separately first?
Here is my data.table DT:
> DT <- data.table(V1 = c('a','a','a','b','b','c')
, V2 = c(1,2,1,1,2,1)
, V3 = c('st', 'cc', 'B', 'st','st','cc')
, V4 = c(0,0,1,0,1,1))
> DT
V1 V2 V3 V4
1: a 1 st 0
2: a 2 cc 0
3: a 1 B 1 ## --> I want this row to be picked in dcast when V1 = a and V2 = 1 because V4 is largest
4: b 1 st 0
5: b 2 st 1
6: c 1 cc 1
and the dcast function could look something like this:
> dcast(DT
, V1 ~ V2
, value.var = "V3"
#, fun.aggregate = V3[which.max(V4)] ## ?!?!?!??!
)
My desired output is:
> desired
V1 1 2
1: a B cc
2: b st st
3: c cc <NA>
Please note that aggregating DT before dcasting to get rid of the duplicates will solve the issue. I'm just wondering if dcasting can be done with the duplicates.
Here is one option where you take the relevant subset before dcasting:
DT[order(V4, decreasing = TRUE)
][, dcast(unique(.SD, by = c("V1", "V2")), V1 ~ V2, value.var = "V3")]
# V1 1 2
# 1: a B cc
# 2: b st st
# 3: c cc <NA>
Alternatively, order first and use a custom aggregate function in dcast():
dcast(
DT[order(V4, decreasing = TRUE)],
V1 ~ V2,
value.var = "V3",
fun.aggregate = function(x) x[1]
)
A dplyr/tidyr option would be to group_by V1 and V2, keep the row with the maximum V4 in each group, and then spread to wide format.
library(dplyr)
library(tidyr)
DT %>%
group_by(V1, V2) %>%
slice(which.max(V4)) %>%
select(-V4) %>%
spread(V2, V3)
# V1 `1` `2`
# <chr> <chr> <chr>
#1 a B cc
#2 b st st
#3 c cc NA
I'm looking for (1) the name of the following operation and (2) a (cleaner) method for it in R (base and data.table preferred).
Input
> d1
id x y
1 1 1 NA
2 2 NA 3
3 3 4 NA
> d2
id x y z
1 4 NA 30 a
2 3 20 2 b
3 2 14 NA c
4 1 15 97 d
(note that the actual data.frames have hundreds of columns)
Expected output:
> d1
id x y z
1 1 1 97 d
2 2 14 3 c
3 3 4 2 b
Data and current solution:
d1 <- data.frame(id = 1:3, x = c(1, NA, 4), y = c(NA, 3, NA))
d2 <- data.frame(id = 4:1, x = c(NA, 20, 14, 15), y = c(30, 2, NA, 97), z = letters[1:4])
for (col in setdiff(names(d1), "id")) {
# If missing look in d2
missing <- is.na(d1[[col]])
d1[missing, col] <- d2[match(d1$id[missing], d2$id), col]
}
for (col in setdiff(names(d2), names(d1))) {
# If column missing then add
d1[[col]] <- d2[match(d1$id, d2$id), col]
}
PS:
Likely this question has been asked before, but I'm lacking the vocabulary to search for it.
Assuming you are working with two data.frames, here is a base R solution:
#expand d1 to have the same columns as d2
d <- merge(d1, d2[, c("id", setdiff(names(d2), names(d1))), drop=FALSE],
by="id", all.x=TRUE, all.y=FALSE)
#make sure that d2 also has all the columns of d1
d2 <- merge(d2, d1[, c("id", setdiff(names(d1), names(d2))), drop=FALSE],
by="id", all.x=TRUE, all.y=FALSE)
#align rows and columns to match those in d1
mask <- d2[match(d1$id, d2$id), names(d)]
#replace NAs in d with the corresponding values from mask
replace(d, is.na(d), mask[is.na(d)])
If you don't mind, we can rewrite your question into a general matrix-coalesce question (i.e. any number of matrices, columns, rows), which seems like it has not been asked before.
Edit:
Another base R solution is a hack of coalesce1a from "How to implement coalesce efficiently in R":
coalesce.mat <- function(...) {
ans <- ..1
for (elt in list(...)[-1]) {
rn <- match(ans$id, elt$id)
ans[is.na(ans)] <- elt[rn, names(ans)][is.na(ans)]
}
ans
}
allcols <- Reduce(union, lapply(list(d1, d2), names))
do.call(coalesce.mat,
lapply(list(d1, d2), function(x) {
x[, setdiff(allcols, names(x))] <- NA
x
}))
Edit:
A possible data.table solution using coalesce1a from "How to implement coalesce efficiently in R" by Martin Morgan:
coalesce1a <- function(...) {
ans <- ..1
for (elt in list(...)[-1]) {
i <- which(is.na(ans))
ans[i] <- elt[i]
}
ans
}
setDT(d1)
setDT(d2)
#melt into long formats and full outer join the 2
mdt <- merge(melt(d1, id.vars="id"), melt(d2, id.vars="id"), by=c("id","variable"), all=TRUE)
#perform a coalesce on vectors
mdt[, value := do.call(coalesce1a, .SD), .SDcols=grep("value", names(mdt), value=TRUE)]
#pivot into original format and subset to those in d1
dcast.data.table(mdt, id ~ variable, value.var="value")[
d1, .SD, on=.(id)]
Here is a possibility using dplyr::left_join:
left_join(d1, d2, by = "id") %>%
mutate(
x = ifelse(!is.na(x.x), x.x, x.y),
y = ifelse(!is.na(y.x), y.x, y.y)) %>%
select(id, x, y, z)
# id x y z
#1 1 1 97 d
#2 2 14 3 c
#3 3 4 2 b
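As a side note, dplyr::coalesce (available since dplyr 0.5.0) expresses the same pick-the-first-non-missing logic a little more compactly than the ifelse calls; a sketch of that variant:
library(dplyr)
left_join(d1, d2, by = "id") %>%
  mutate(
    x = coalesce(x.x, x.y),
    y = coalesce(y.x, y.y)) %>%
  select(id, x, y, z)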
We can use data.table with coalesce from dplyr. Create a vector of the column names common to both datasets ('nm1') and a vector of those present only in the second ('nm2'). Convert the first dataset to a data.table (setDT(d1)), join on the 'id' column, and assign (:=) the coalesced values of the shared columns (the i. prefix refers to the second dataset's columns) together with the new columns, updating the values in the first dataset.
library(data.table)
nm1 <- setdiff(intersect(names(d1), names(d2)), 'id')
nm2 <- setdiff(names(d2), names(d1))
setDT(d1)[d2, c(nm1, nm2) := c(Map(dplyr::coalesce, mget(nm1),
mget(paste0("i.", nm1))), mget(nm2)), on = .(id)]
d1
# id x y z
#1: 1 1 97 d
#2: 2 14 3 c
#3: 3 4 2 b
Hi, I have two data frames as follows:
df1:
ID x y z
1 a b c
2 a b c
3 a b c
4 a b c
and df2:
ID x y
2 d NA
3 NA e
and I am after a result like this:
df1:
ID x y z
1 a b c
2 d b c
3 a e c
4 a b c
I have been trying to use the match function as suggested by some other posts, but I keep running into the issue of values in df1 being replaced with NA values from df2.
This is the code I have been using, without luck:
for (i in names(df2)[2:length(names(df2))]) {
df1[i] <- df2[match(df1$ID, df2$ID)]
}
Thanks
Your code didn't work for me, so I changed it a little and now it works. If you are reading data from an external file, use stringsAsFactors = FALSE when you read it so you don't run into problems.
df1 = data.frame("ID" = 1:4,"x" = rep("a",4), "y" =rep("b",4),"z" = rep("c",4),
stringsAsFactors=FALSE)
df2 = data.frame("ID" = 2:3,"x" = c("d",NA), "y" = c(NA,"e"),stringsAsFactors=FALSE)
for(i in 1:nrow(df2)){
  # drop the NA entries from this row of df2
  new_data = df2[i,-which(apply(df2[i,],2,is.na))]
  # the first remaining element is the ID, used as the row index into df1
  pos = as.numeric(new_data[1])
  # columns shared between the row and df1
  col_replace = intersect(colnames(new_data),colnames(df1))
  # overwrite just those cells in df1
  df1[pos,col_replace] = new_data
}
A solution using dplyr. The idea is to convert both data frames to long format, conduct a join and replace the values, and then convert back to wide format. df5 is the final output.
library(dplyr)
library(tidyr)
df3 <- df1 %>% gather(Col, Value, -ID)
df4 <- df2 %>% gather(Col, Value, -ID, na.rm = TRUE)
df5 <- df3 %>%
left_join(df4, by = c("ID", "Col")) %>%
mutate(Value.x = ifelse(!is.na(Value.y), Value.y, Value.x)) %>%
select(ID, Col, Value.x) %>%
spread(Col, Value.x)
df5
# ID x y z
# 1 1 a b c
# 2 2 d b c
# 3 3 a e c
# 4 4 a b c
DATA
df1 <- read.table(text = "ID x y z
1 a b c
2 a b c
3 a b c
4 a b c",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "ID x y
2 d NA
3 NA e",
header = TRUE, stringsAsFactors = FALSE)
As mentioned by alistaire, this is an update join. It is available with the data.table package:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = "ID", x := ifelse(is.na(i.x), x, i.x)]
df1[df2, on = "ID", y := ifelse(is.na(i.y), y, i.y)]
df1
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
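The same update join can be generated in a loop when many columns need replacement; a hypothetical generalization using data.table::fcoalesce (available in data.table >= 1.12.4):
cols <- setdiff(intersect(names(df1), names(df2)), "ID")
for (col in cols) {
  # prefer df2's value (the i. prefix), falling back to df1's where it is NA
  df1[df2, on = "ID", (col) := fcoalesce(get(paste0("i.", col)), get(col))]
}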
If there are many columns with replacement values, it might be worthwhile to follow www's suggestion to do the replacement after reshaping to long format, where column names are treated as data:
library(data.table)
melt(setDT(df1), "ID")[
melt(setDT(df2), "ID", na.rm = TRUE), on = .(ID, variable), value := i.value][
, dcast(.SD, ID ~ variable)]
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
Data
df1 <- fread(
"ID x y z
1 a b c
2 a b c
3 a b c
4 a b c")
df2 <- fread(
"ID x y
2 d NA
3 NA e")
I often find myself in a situation where I have a table that contains multiple groups of wide columns, like so:
replicate groupA VA1 VA2 groupB VB1 VB2
1 1 a 0.3429166 -2.30336406 f 0.05363582 1.6454078
2 2 b -1.3183732 -0.13516849 g -0.42586417 0.1541541
3 3 c -0.7908358 -0.10746447 h 1.05134242 1.4297350
4 4 d -0.9963677 -1.82557058 i -1.14532536 1.0815733
5 5 e -1.3634609 0.04385812 j -0.65643595 -0.1452877
And I'd like to turn the columns into one long table, like so:
replicate group key value
1 1 a V1 0.34291665
2 2 b V1 -1.31837322
3 3 c V1 -0.79083580
4 4 d V1 -0.99636772
5 5 e V1 -1.36346088
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540784
17 2 g V2 0.15415408
18 3 h V2 1.42973499
19 4 i V2 1.08157329
20 5 j V2 -0.14528774
I can do this by selecting the two groups of columns individually, tidying, and then rbinding together (code below). However, this approach doesn't seem particularly elegant, and it becomes cumbersome if there are more than two groups of columns. I'm wondering whether there's a more elegant approach, using a single pipe chain of data transformations.
The fundamental question here is: How do we automate the process of breaking the table into groups of columns, tidying those, and then combining back together.
My current code:
library(dplyr)
library(tidyr)
# generate example data
df_wide <- data.frame(replicate = 1:5,
groupA = letters[1:5],
VA1 = rnorm(5),
VA2 = rnorm(5),
groupB = letters[6:10],
VB1 = rnorm(5),
VB2 = rnorm(5))
# tidy columns with A in the name
dfA <- select(df_wide, replicate, groupA, VA1, VA2) %>%
gather(key, value, VA1, VA2) %>%
mutate(key = case_when(key == "VA1" ~ "V1",
key == "VA2" ~ "V2")) %>%
select(replicate, group = groupA, key, value)
# tidy columns with B in the name
dfB <- select(df_wide, replicate, groupB, VB1, VB2) %>%
gather(key, value, VB1, VB2) %>%
mutate(key = case_when(key == "VB1" ~ "V1",
key == "VB2" ~ "V2")) %>%
select(replicate, group = groupB, key, value)
# combine
df_long <- rbind(dfA, dfB)
Note: Similar questions have been asked here and here, but I think the accepted answer shows that this is a subtly different problem.
1.
Although the question asked for a tidyverse solution, there is a convenient option with melt from data.table, which can take multiple patterns in the measure argument:
library(data.table)
setnames(melt(melt(setDT(df1), measure = patterns('group', '1$', '2$')),
   id.var = 1:3, variable.name = 'key')[, -2, with = FALSE], 'value1', 'group')[]
2.a
With tidyverse, we can subset the dataset into a list, then loop through the list with map_df, converting each element to 'long' format with gather, to get a single data.frame:
library(tidyverse)
list(df1[1:4], df1[c(1,5:7)]) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
# replicate group key value
#1 1 a V1 0.34291660
#2 2 b V1 -1.31837320
#3 3 c V1 -0.79083580
#4 4 d V1 -0.99636770
#5 5 e V1 -1.36346090
#6 1 a V2 -2.30336406
#7 2 b V2 -0.13516849
#8 3 c V2 -0.10746447
#9 4 d V2 -1.82557058
#10 5 e V2 0.04385812
#11 1 f V1 0.05363582
#12 2 g V1 -0.42586417
#13 3 h V1 1.05134242
#14 4 i V1 -1.14532536
#15 5 j V1 -0.65643595
#16 1 f V2 1.64540780
#17 2 g V2 0.15415410
#18 3 h V2 1.42973500
#19 4 i V2 1.08157330
#20 5 j V2 -0.14528770
2.b
If we need to split based on the occurrence of 'group':
split.default(df1[-1], cumsum(grepl('group', names(df1)[-1]))) %>%
map(~bind_cols(df1[1], .)) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
2.c
This uses rename_at instead of a names assignment, in the spirit of the other tidyverse options:
df1[-1] %>%
split.default(cumsum(grepl('group', names(df1)[-1]))) %>%
map_df(~bind_cols(df1[1], .) %>%
gather(., key, value, 3:4) %>%
rename_at(2, funs(substring(.,1, 5))))
NOTE:
1) Parts 2.a, 2.b and 2.c all use tidyverse functions.
2) The approach doesn't depend on the substring 'A' or 'B' in the column names.
3) It assumes that the pattern in the OP's dataset is a 'group' column followed by its value columns.
1) This solution consists of:
a gather which generates the desired number of rows,
a mutate which combines the groupA and groupB columns and changes the key column to that requested, and
a select which picks out the columns wanted.
First gather the columns whose names start with V, and then create a new group column from groupA and groupB, choosing groupA if the key has an A in it and groupB if it has a B. (We used mapply(switch, ...) here for easy extension to the 3+ group case, but we could have used an ifelse, viz. ifelse(grepl("A", key), as.character(groupA), as.character(groupB)), given that we have only two groups.) The mutate also reduces the key names from VA1 to V1, etc., and finally we select out the columns desired.
DF %>%
gather(key, value, starts_with("V")) %>%
mutate(group = mapply(switch, gsub("[^AB]", "", key), A = groupA, B = groupB),
key = sub("[AB]", "", key)) %>%
select(replicate, group, key, value)
giving:
replicate group key value
1 1 a V1 0.34291660
2 2 b V1 -1.31837320
3 3 c V1 -0.79083580
4 4 d V1 -0.99636770
5 5 e V1 -1.36346090
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540780
17 2 g V2 0.15415410
18 3 h V2 1.42973500
19 4 i V2 1.08157330
20 5 j V2 -0.14528770
2) Another approach would be to split the columns into groups such that all columns in a group have the same name after removing A and B from their names. Perform unlist on each such group to reduce the list to a list of plain vectors, and convert that list to a data.frame. Finally, gather the V columns and rearrange. Note that rownames_to_column is from the tibble package.
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
rownames_to_column %>%
gather(key, value, starts_with("V")) %>%
arrange(gsub("[^AB]", "", rowname), key) %>%
select(replicate, group, key, value)
2a) If the row order is not important, then the rownames_to_column, arrange and select lines could be omitted, shortening it to this:
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
gather(key, value, starts_with("V"))
Solutions (2) and (2a) could easily be converted to base-only solutions by replacing the gather with the appropriate reshape from base, as in the second reshape, i.e. the one producing d2, in (3) -- a sketch follows.
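A minimal base-only sketch of (2a) along those lines, assuming the DF defined in the Note at the end:
# split columns into groups whose names match after stripping A/B
g <- lapply(split(as.list(DF), sub("[AB]", "", names(DF))), unlist)
d <- as.data.frame(g)
# stack the V columns, as in the second reshape of (3) below
d2 <- reshape(d, dir = "long", varying = list(grep("V", names(d))),
  v.names = "value", timevar = "key")
d2 <- subset(d2, select = c(replicate, group, key, value))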
3) Although the question asked for a tidyverse solution, there is a fairly convenient base solution consisting of two reshape calls. The varying produced by the split is list(group = c("groupA", "groupB"), V1 = c("VA1", "VB1"), V2 = c("VA2", "VB2")) -- that is, it matches up the ith column in each set of columns.
varying <- split(names(DF)[-1], gsub("[AB]", "", names(DF))[-1])
d <- reshape(DF, dir = "long", varying = varying, v.names = names(varying))
d <- subset(d, select = -c(time, id))
d2 <- reshape(d, dir = "long", varying = list(grep("V", names(d))), v.names = "value",
timevar = "key")
d2 <- subset(d2, select = c(replicate, group, key, value))
d2
Note: The input in reproducible form is:
DF <- structure(list(replicate = 1:5, groupA = structure(1:5, .Label = c("a",
"b", "c", "d", "e"), class = "factor"), VA1 = c(0.3429166, -1.3183732,
-0.7908358, -0.9963677, -1.3634609), VA2 = c(-2.30336406, -0.13516849,
-0.10746447, -1.82557058, 0.04385812), groupB = structure(1:5, .Label = c("f",
"g", "h", "i", "j"), class = "factor"), VB1 = c(0.05363582, -0.42586417,
1.05134242, -1.14532536, -0.65643595), VB2 = c(1.6454078, 0.1541541,
1.429735, 1.0815733, -0.1452877)), .Names = c("replicate", "groupA",
"VA1", "VA2", "groupB", "VB1", "VB2"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))