a <- data.frame(x = c(1, 2))
b <- data.frame(x = c(3, 4))
for (df in list(a, b)) {
  print(df)
  df$y <- c(5, 6)
}
Each of the data frames gets printed correctly, but adding the extra column has no effect on a or b.
Extensive web search suggested something like
lapply(list(a, b), function(df){
  df$y <- c(5, 6)
})
but this didn't help me.
What I'd also be very interested in is why the print statement in the for loop works, but the addition of the y column does not.
This is surprising to me.
You have to return the df's with that additional column.
Try:
lapply(list(a, b), function(df){
  df$y <- c(5, 6); return(df)
})
The output is:
[[1]]
x y
1 1 5
2 2 6
[[2]]
x y
1 3 5
2 4 6
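As to the "why" (a small illustration, not from the answer above): print(df) only reads the loop variable, while df$y <- ... modifies df, which is a copy of the list element, so a and b are never touched.
for (df in list(a, b)) {
  df$y <- c(5, 6)  # modifies only the loop variable, a copy of the list element
}
a                  # still has just the x column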
As @dash2 suggests, you may want to assign those changed data frames back to your list of data frames. So the full code could look like:
a <- data.frame(x = c(1, 2))
b <- data.frame(x = c(3, 4))
l <- list(a, b)
l <- lapply(l, function(df){ df$y <- c(5, 6); return(df) })
If you want a and b themselves updated in the global environment, you can use list2env with a named list:
> a <- data.frame(x = c(1, 2))
> b <- data.frame(x = c(3, 4))
> l <- list(a = a, b = b)
> list2env(lapply(l, function(x) {x$y <- c(5, 6);x}), envir = .GlobalEnv)
<environment: R_GlobalEnv>
> a
x y
1 1 5
2 2 6
> b
x y
1 3 5
2 4 6
Suppose I have the following data:
dat1 <- data.frame(id = c("a", "b", "c", "d"),
x = c(1, 2, 3, 4),
y = rep(NA, 4))
dat2 <- data.frame(id = c("a", "b", "c"),
y = c(9, 8, 7))
dat3 <- data.frame(id = c("d"),
y = c(6))
Now, I want to merge/join the data from dat2 and dat3 into dat1, one after the other, so that the dat1$y values are replaced by the dat2$y or dat3$y values instead of being added as new columns.
The problem is that merge or left_join would not add the values to the existing y column, but would add a y.y column and rename the one from dat1 to y.x.
I also thought I could use the rows_update function from the tidyverse, but in my real-life case I'm not matching by just one column (here: id) but by several id columns together, and rows_update only allows the by variable to be one vector.
NOTE: in my real-life use case I have
~50 data frames to merge
the uniqueness of my rows can only be determined through multiple id columns
the id columns have different names in my dat1 and all other dat2 to dat50 data frames.
The expected output after merging dat2 and dat3 to dat1 would be:
id x y
"a" 1 9
"b" 2 8
"c" 3 7
"d" 4 6
Try indexing with %in% to match on the id variable:
#Data
dat1 <- data.frame(id = c("a", "b", "c", "d"),
x = c(1, 2, 3, 4),
y = rep(NA, 4))
dat2 <- data.frame(id = c("a", "b", "c"),
y = c(9, 8, 7))
dat3 <- data.frame(id = c("d"),
y = c(6))
#Code
dat1$y[dat1$id %in% dat2$id] <- dat2$y[dat2$id %in% dat1$id]
dat1$y[dat1$id %in% dat3$id] <- dat3$y[dat3$id %in% dat1$id]
Output:
id x y
1 a 1 9
2 b 2 8
3 c 3 7
4 d 4 6
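If, as noted in the question, uniqueness depends on several id columns, the same idea can be applied after collapsing those columns into a single key. A sketch with hypothetical column names (idA/idB in dat1, keyA/keyB in dat2); like the code above, it assumes the matching rows appear in the same order in both data frames:
#Build one key per row from the multiple id columns
key1 <- do.call(paste, dat1[c("idA", "idB")])
key2 <- do.call(paste, dat2[c("keyA", "keyB")])
#Assign as before, matching on the combined key
dat1$y[key1 %in% key2] <- dat2$y[key2 %in% key1]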
You can use a loop over a list storing the objects dat2 to datn and then assign the values:
#Data
dat1 <- data.frame(id = c("a", "b", "c", "d"),
x = c(1, 2, 3, 4),
y = rep(NA, 4))
dat2 <- data.frame(id = c("a", "b", "c"),
y = c(9, 8, 7))
dat3 <- data.frame(id = c("d"),
y = c(6))
#Store Objects in a list
List <- list(dat2,dat3)
#Loop
for(i in seq_along(List)) {
  #Data
  df <- List[[i]]
  #Assign
  dat1$y[dat1$id %in% df$id] <- df$y[df$id %in% dat1$id]
}
Output:
dat1
id x y
1 a 1 9
2 b 2 8
3 c 3 7
4 d 4 6
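For the ~50 data frames mentioned in the question, the list does not have to be typed out by hand. A sketch, assuming the objects follow the dat2 ... dat50 naming pattern:
#Collect dat2 ... dat50 into a named list
List <- mget(paste0("dat", 2:50))
The loop above then works unchanged.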
You can get the data frames in a list and left_join them using reduce. If every row has only one non-NA y value across the joined columns, we can use rowSums/rowMeans with na.rm = TRUE.
library(dplyr)
mget(paste0('dat', 1:3)) %>%
purrr::reduce(left_join, by = 'id') %>%
mutate(y = rowSums(select(., starts_with('y')), na.rm = TRUE)) %>%
select(id, x, y)
# id x y
#1 a 1 9
#2 b 2 8
#3 c 3 7
#4 d 4 6
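An alternative sketch (not from the answer above), replacing the rowSums step with coalesce, again assuming at most one non-NA y per row after the joins:
library(dplyr)
mget(paste0('dat', 1:3)) %>%
  purrr::reduce(left_join, by = 'id') %>%
  mutate(y = do.call(coalesce, select(., starts_with('y')))) %>%
  select(id, x, y)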
A very simple answer - but maybe not too generalizable - would be:
dat1$y = c(dat2$y, dat3$y)
With a loop, to do this to several data frames:
newy = numeric()
for (i in 2:ndf) { # Where "ndf" is the number of data frames you have
  newy = c(newy, eval(parse(text = paste0("dat", i, "$y"))))
}
dat1$y = newy
OBS: evaluating objects by strings with eval(parse(text=...)) normally isn't the best way to do it in R. It is probably best if the data frames were created together in a list (as listing them now would be very manual, at least with my knowledge), and the loop would be:
newy = numeric()
for (i in 2:ndf) {
  newy = c(newy, df.list[[i]]$y)
}
dat1$y = newy
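Another option that avoids parsing strings, while the objects still live as separate data frames, is get() (a sketch, not from the original answer):
newy = numeric()
for (i in 2:ndf) {
  newy = c(newy, get(paste0("dat", i))$y)  # look the object up by name
}
dat1$y = newy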
In the latest DataFramesMeta.jl package for Julia one can do
x = [:x1, :x2]
n = [:n1, :n2]
@transform(df, cols(x) = :y .+ cols(n))
and it will output the columns x1 = y + n1 and x2 = y + n2.
The question is: how best to do this elegantly in R? I can do it like this
library(dplyr)
df = data.frame(y = 1:3, n1 = 1:3, n2 = 1:3)
x = c("m1", "m2")
n = c("n1", "n2")
code = glue::glue_data(list(x=x, n=n), "{x}=y+{n}")
code = glue::glue("vars({paste(code, collapse=',')})")
mutate(df, !!!eval(parse(text=code)))
or in Base
res = lapply(n, function(n) {
df$y + df[, n]
}) %>% data.frame
names(res) <- x
df = cbind(df, res)
But it feels hacky. data.table and Base solutions welcome.
You can do:
df[x] <- lapply(df[n], function(p) df$y + p)
df
# y n1 n2 m1 m2
#1 1 1 1 2 2
#2 2 2 2 4 4
#3 3 3 3 6 6
Similarly with purrr::map:
df[x] <- purrr::map(df[n], ~df$y + .x)
and data.table:
library(data.table)
setDT(df)[, (x):= lapply(.SD, function(p) p + y), .SDcols = n]
A late realisation: we don't need any apply-family function here; we can do it directly.
df[x] <- df[n] + df$y
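For context, a small sketch (not from the answers above) of why the direct form works: when a vector is added to a data frame, it is recycled across the cells in column-major order, so a vector of length nrow(df) is added to every column element-wise:
data.frame(n1 = 1:3, n2 = 4:6) + c(10, 20, 30)
#   n1 n2
# 1 11 14
# 2 22 25
# 3 33 36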
We can use sweep
df[x] <- sweep(df[n], 1, df$y, FUN = `+`)
Is it possible to do something like this in R (assuming both df1 and df2 have the same number of rows)?
if (df1$var1 == 8) df2$var1 = 1
if (df1$var2 == 9) df2$var2 = 1
A simple two-line solution can be done with the base R ifelse statement:
df1 <- data.frame(var1 = c(1:10), var2 = c(1:10))
df2 <- data.frame(var1 = c(1:10), var2 = c(1:10))
df2$var1 <- ifelse(df1$var1 == 8, 1, df2$var1)
df2$var2 <- ifelse(df1$var2 == 9, 1, df2$var2)
Here is one simple option in base R: we replicate the values 8 and 9 to make the lengths the same and compare with the subset of columns of 'df1', resulting in a logical matrix. Then subset 'df2' with that matrix and assign 1 to those elements.
nm1 <- c('var1', 'var2')
df2[nm1][df1[nm1] == c(8, 9)[col(df1[nm1])]] <- 1
df2
# var1 var2 var3
#1 5 1 1
#2 3 1 2
#3 1 3 3
#4 1 4 4
#5 4 2 5
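To unpack the one-liner (a sketch, not part of the original answer): col() gives the column index of every cell, indexing c(8, 9) with it supplies the comparison value for each cell, and the == produces the logical matrix used for the assignment:
col(df1[nm1])                        # 1 for the var1 column, 2 for var2
c(8, 9)[col(df1[nm1])]               # 8 for every var1 cell, 9 for every var2 cell
df1[nm1] == c(8, 9)[col(df1[nm1])]   # TRUE where a value should be set to 1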
Or this can be done in two steps
df2$var1[df1$var1 == 8] <- 1
df2$var2[df1$var2 == 9] <- 1
Or using Map
df2[nm1] <- Map(function(x, y, z) replace(x, y == z, 1),
df2[nm1], df1[nm1], c(8, 9))
An if/else approach can also be used, but if() is not vectorized, i.e. it expects input of length 1. With an explicit double loop it can be done (though this would be inefficient in R):
vals <- c(8, 9)
for(i in seq_len(nrow(df1))) {
  for(j in seq_along(nm1)) {
    if(df1[[nm1[j]]][i] == vals[j]) df2[[nm1[j]]][i] <- 1
  }
}
data
df1 <- data.frame(var1 = c(1, 3, 8, 5, 2), var2 = c(9, 3, 1, 8, 4),
var3 = 1:5)
df2 <- data.frame(var1 = c(5, 3, 2, 1, 4), var2 = c(3, 1, 3, 4, 2),
var3 = 1:5)
Using BASE R, I wonder how to answer the following question:
Are there any values in X or Y that occur in only one row and not in others? If yes, produce my desired output below.
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
Desired output:
list(BB = c(X = 4, Y = 6), AA = c(Y = c(99, 8)))
# $BB
# X Y
# 4 6
# $AA
# Y1 Y2 # Would be a plus if it showed `Y Y` instead of `Y1 Y2`
# 99 8
There are two big ideas with this base approach:
Since we need to compare all the values, we should just recombine everything into one data.frame.
Making the unsplit data.frame long will save us some extra steps.
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
m <- split(f, f$id) # Here is `m`
unsplit <- do.call(rbind, c(m, make.row.names = F))
molten <- data.frame(unsplit[, 1, drop = F], stack(unsplit[, -1]))
# res <- subset(molten, !duplicated(values) & !duplicated(values, fromLast = T))
res <- molten[as.logical(ave(molten[['values']], molten[['ind']],
                             FUN = function(x) !duplicated(x) & !duplicated(x, fromLast = T))), ]
#I would stop here
res
#> id values ind
#> 6 BB 4 X
#> 9 AA 99 Y
#> 11 AA 8 Y
#> 13 BB 6 Y
#to get exact output
res_vector <- res$values
names(res_vector) <- res$ind
split(res_vector, as.character(res$id))
#> $AA
#> Y Y
#> 99 8
#>
#> $BB
#> X Y
#> 4 6
Here's another base approach that may be less complicated:
#### Way 1 with rapply
mods <- c("X", "Y")   # the non-id columns
vec <- rapply(lapply(m, '[', mods), I)
unique_vec <- vec[!duplicated(vec) & !duplicated(vec, fromLast = T)]
vec_names <- do.call(rbind, strsplit(names(unique_vec), '.', fixed = T))
names(unique_vec) <- substr(vec_names[, 2], 1, 1) #turns Y1 into Y
split(unique_vec, vec_names[, 1])
### Way 2, with the data already recombined into one data frame (here f, i.e. do.call(rbind, m))
vec <- unlist(
  lapply(f[, -1],
         function(x){
           ind <- !duplicated(x) & !duplicated(x, fromLast = T)
           ret <- x[ind]
           names(ret) <- f[ind, 1]
           ret
         }
  )
)
#this is likely overly simplified:
split(vec, sub('.*\\.', '', names(vec)))
#this leads to exact result
vec_names <- do.call(rbind, strsplit(names(vec), '.', fixed = T))
names(vec) <- vec_names[, 1]
split(vec, vec_names[, 2])
$AA
Y Y
99 8
$BB
X Y
4 6
The OP brings up using table() in a hint, but duplicated() is more performant:
unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))
# X Y1 Y2 Y3
# "4" "6" "8" "99"
vec
#X.BB Y.AA Y.AA Y.BB
# 4 99 8 6
bench::mark(
  table_meth = {unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))},
  dup_meth = {
    #could get slight performance boost with
    #f_id <- f[['id']]
    unlist(
      lapply(f[, -1],
             function(x){
               ind <- !duplicated(x) & !duplicated(x, fromLast = T)
               ret <- x[ind]
               names(ret) <- f[ind, 1]
               #names(ret) <- f_id[ind]
               ret
             }
      )
    )}
  , check = F
)
# A tibble: 2 x 13
  expression   min median `itr/sec` mem_alloc
  <bch:expr> <bch> <bch:>     <dbl> <bch:byt>
1 table_meth 321us  336us     2794.    10.3KB
2 dup_meth   132us  136us     7105.    31.7KB
And a similar idea in data.table:
library(data.table)
molten_dt <- melt(rbindlist(m), id.vars = 'id')
molten_dt[, .SD[!duplicated(value) & !duplicated(value, fromLast = TRUE)], by = variable]
And a similar idea in dplyr:
library(dplyr)
library(tidyr)
m %>%
  bind_rows() %>%
  pivot_longer(cols = -id) %>%
  group_by(name) %>%
  filter(!duplicated(value) & !duplicated(value, fromLast = T)) %>%
  group_by(id) %>%
  group_split()
It's not pure functional programming but it is base R:
lapply(split(df, df$id), function(z){
X <- z$X[which(!(z$X %in% df$X[duplicated(df$X)]))]
Y <- z$Y[which(!(z$Y %in% df$Y[duplicated(df$Y)]))]
cbind(X, Y)
}
)
Data:
df <- structure(list(
  id = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
                 .Label = c("AA", "BB", "CC"),
                 class = "factor"),
  X = c(1, 2, 2, 3, 1, 4, 3, 3),
  Y = c(99, 7, 8, 7, 6, 7, 7, 7)
), class = "data.frame", row.names = c(NA, -8L))
I am struggling to remove rows from a data frame in R, where values from different columns match two values from different columns in a second data frame.
For example, given the following pseudo-data:
ID1 <- c(5,10,6)
ID2 <- c(3,5,4)
Value <- rnorm(3)
DF1 <- data.frame(ID1, ID2, Value)
x <- c()
y <- c()
z <- c()
for (i in 1:10){
  a <- rep(i, 10)
  b <- c(1:10)
  c <- rnorm(10)
  x <- c(x, a)
  y <- c(y, b)
  z <- c(z, c)
}
DF2 <- data.frame(x, y, z)
I would like to remove the rows from DF2 where the combination of x and y matches ID1 and ID2 from DF1 (ie x = 5 and y = 3, x = 10 and y = 5, x = 6 and y = 4, but also x = 3 and y = 5, x = 5 and y = 10, x = 4 and y = 6).
Make an exclude list:
excl <- data.frame(
  x = c(DF1$ID1, DF1$ID2),
  y = c(DF1$ID2, DF1$ID1))
Then use anti join:
library(dplyr)
anti_join(DF2, excl, by = c("x", "y"))
Or using paste as suggested in the comments:
DF2[! paste(DF2$x, DF2$y) %in%
c(paste(DF1$ID1, DF1$ID2),
paste(DF1$ID2, DF1$ID1)), ]
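As a quick sanity check (a sketch, not part of the original answers): DF2 holds all 100 combinations of x and y from 1 to 10, and the six (x, y) pairs derived from DF1 are all distinct, so either approach should leave 94 rows:
nrow(anti_join(DF2, excl, by = c("x", "y")))   # 94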
Another option is to use @zx8754's excl with the match_df function from the plyr package:
library(plyr)
DF2[-as.numeric(rownames(match_df(DF2, excl))), ]