lapply: vector names as list names

lapply: vector names as list names - r

When using lapply() over a vector, each element of the resulting list doesn't have a name, but only an index:
library(dplyr)
vector = c("df1", "df2")
df1 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1))
df2 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1))
lapply(vector, function(x){
x = get(x) %>%
mutate(c = a+b)
})
#> [[1]]
#> a b c
#> 1 -0.4098768 1.6712810 1.2614041
#> 2 0.7101722 -0.1025184 0.6076538
#> 3 -0.6696859 0.5690932 -0.1005928
#> 4 1.1642214 -0.4660378 0.6981836
#> 5 -0.5158280 1.4574039 0.9415759
#>
#> [[2]]
#> a b c
#> 1 0.91047848 -1.308707 -0.3982281
#> 2 1.87336493 -1.429567 0.4437977
#> 3 0.54171333 1.849589 2.3913028
#> 4 -0.02978158 2.500763 2.4709817
#> 5 1.49926589 1.602463 3.1017284
It does has names when applying over list:
list = list(
df1 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1)),
df2 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1))
)
lapply(list, function(x){
x = x %>%
mutate(c = a+b)
})
#> $df1
#> a b c
#> 1 0.8228400 -2.5232496 -1.70040963
#> 2 0.3890213 -0.4349408 -0.04591949
#> 3 0.5084719 1.4089123 1.91738415
#> 4 0.2533964 -0.7379615 -0.48456516
#> 5 -0.2474338 1.0520906 0.80465685
#>
#> $df2
#> a b c
#> 1 0.1376350 -1.32304077 -1.1854058
#> 2 0.1314702 1.14775210 1.2792223
#> 3 0.9757047 -1.24806193 -0.2723573
#> 4 -0.5118045 0.09277009 -0.4190344
#> 5 -0.1631715 -0.47573087 -0.6389024
Is there a simple way to use the vector elements as the resulting list names?

Instead of using get on each list separately, you can use mget to get all the dataframes in a list with their names :
lapply(mget(vector), function(x) transform(x, c = a + b))
#$df1
# a b c
#1 -0.60421251 2.4792735 1.8750610
#2 0.06163947 2.0295196 2.0911590
#3 -0.56318825 2.1496891 1.5865009
#4 -2.46292843 1.1641211 -1.2988073
#5 -1.05692446 0.4365812 -0.6203432
#$df2
# a b c
#1 -0.33388039 0.6690467 0.3351663
#2 0.83637236 1.3321715 2.1685439
#3 0.05839826 0.1017032 0.1601015
#4 -0.20686296 0.8667050 0.6598420
#5 0.52682053 0.4629632 0.9897837

You could name the vector, before lapply
vector = c("df1", "df2")
names(vector) <- vector # <-- here
df1 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1))
df2 = data.frame(a = rnorm(5), b = rnorm(5, sd = 1.1))
lapply(vector, function(x){
x = get(x) %>%
mutate(c = a+b)
})
# $df1
# a b c
# 1 -0.36671838 0.8733203 0.5066019
# 2 -0.05029296 -0.8823471 -0.9326401
# 3 0.54252815 -0.2087211 0.3338071
# 4 -1.16789527 0.2598863 -0.9080090
# 5 -0.80664672 -0.4968422 -1.3034889
#
# $df2
# a b c
# 1 0.9042845 1.2978663 2.2021508
# 2 -0.3848533 -0.4563623 -0.8412156
# 3 -1.1681873 1.3224087 0.1542215
# 4 1.4872564 -2.0281272 -0.5408708
# 5 -0.2717229 -0.3780464 -0.6497694

We can also use map
library(purrr)
library(dplyr)
map(mget(vector), ~ .x %>%
mutate(c = a + b))

Related

Change the column names of a list of dataframes in R [duplicate]

This question already has answers here:
Changing Column Names in a List of Data Frames in R
(6 answers)
Rename Columns of dataframe based on names of list in R
(2 answers)
Closed 2 years ago.
I have a list of dataframes in this form.
d1 <- data.frame(i = c("a","b","c"), var = 1:3, stringsAsFactors=FALSE)
d2 <- data.frame(i = c("b","c","d"), var = 5:8, stringsAsFactors=FALSE)
d3 <- data.frame(i = c("c","d","a"), var = 2:4, stringsAsFactors=FALSE)
dfList <- list(d1,d2,d3)
I want to change the var variables to var_d1, var_d2, var_d3 respectively to do a full-join later. How do I implement this? How do I retrive the name of the data frames and make them into strings?

Start with naming the list
names(dfList) <- paste0('d', seq_along(dfList))
Once you do that you can use Map to rename columns :
Map(function(x, y) {names(x)[-1] <- paste(names(x)[-1], y, sep = "_");x},
dfList, names(dfList))
#$d1
# i var_d1
#1 a 1
#2 b 2
#3 c 3
#$d2
# i var_d2
#1 b 5
#2 c 6
#3 d 7
#$d3
# i var_d3
#1 c 2
#2 d 3
#3 a 4
Or in tidyverse :
library(dplyr)
library(purrr)
imap(dfList, function(x, y) x %>% rename_with(~paste(., y, sep = "_"), -1))

dfList <- mget(paste0("d", 1:3))
mapply(function(df, name) {
names(df)[names(df) == "var"] <- paste0("var_", name)
df
}, dfList, names(dfList), SIMPLIFY = FALSE)
#> $d1
#> i var_d1
#> 1 a 1
#> 2 b 2
#> 3 c 3
#>
#> $d2
#> i var_d2
#> 1 b 5
#> 2 c 6
#> 3 d 7
#>
#> $d3
#> i var_d3
#> 1 c 2
#> 2 d 3
#> 3 a 4

To changes the variables and then save them in a list of strings you can do something like this.
(I think you made a mistake in d2 so I changed it)
d1 <- data.frame(i = c("a","b","c"), var = 1:3, stringsAsFactors=FALSE)
d2 <- data.frame(i = c("b","c","d"), var = 5:7, stringsAsFactors=FALSE)
d3 <- data.frame(i = c("c","d","a"), var = 2:4, stringsAsFactors=FALSE)
dfList <- list(d1,d2,d3)
column_names <- list()
for (i in 1:length(dfList)){
colnames(dfList[[i]]) <- c("i",paste0("var_d",i))
column_names[[i]] <- names(dfList[[i]])
}
# they are stored here
column_names
[[1]]
[1] "i" "var_d1"
[[2]]
[1] "i" "var_d2"
[[3]]
[1] "i" "var_d3"

Maybe we can try the code below
> Map(function(k) setNames(dfList[[k]],c("i",paste0("var_d",k))),seq_along(dfList))
[[1]]
i var_d1
1 a 1
2 b 2
3 c 3
[[2]]
i var_d2
1 b 6
2 c 7
3 d 8
[[3]]
i var_d3
1 c 2
2 d 3
3 a 4

An approach quite similar to the ones proposed using Map, that uses lapply instead:
dfList <- lapply(
1:length(dfList),
function(x) setNames(dfList[[x]],
c('i', paste0('var_d', x))
)
)

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
foo <- function(df, name) {
a <- paste0(name, "_a")
b <- paste0(name, "_b")
return(
mutate(
df,
!!name := get(a) * get(b)
)
)
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.

We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)

After adopting #akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two dataframes), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from #user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
. <- function(x) x
for (i in colNames) {
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
if (length(lhs) == 0) newCol <- i
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating #user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
invisible(lapply(c("dplyr", "tidyr", "rlang", "wrapr", "microbenchmark"),
require, character.only = TRUE))
})
polymutate <- function(df, formula) {
form <- rlang::f_rhs(formula)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('.value', 'grp'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = eval(form)) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df, .)
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10

This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.

What's a tidyverse approach to iterating over rows in a data frame when vectorisation is not feasible?

I want to know the best way to iterate over rows of a data frame when the value of a variable at row n depends on the value of variable(s) at row n-1 and/or n-2. Ideally I would like to do this in a "tidyverse" way, perhaps with purrr::pmap().
For example, say I have this data frame:
library(dplyr)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 0.750 0.900
#> 2 2 110 0.898 0.657
#> 3 3 120 0.731 0.000137
#> 4 4 130 0.208 0.696
#> 5 5 140 0.670 0.882
#> 6 6 NA NA NA
#> 7 7 NA NA NA
#> 8 8 NA NA NA
#> 9 9 NA NA NA
#> 10 10 NA NA NA
I have known values up to time (t) = 5. Beyond that, I wish to project values, using the following formulae:
a = lag(a) * 1.1
b = a * lag(b)
c = b * lag(a, 2)
This code achieves the desired output, but it's a clunky, horrible for loop that scales poorly to larger datasets:
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 7.50e- 1 9.00e- 1
#> 2 2 110 8.98e- 1 6.57e- 1
#> 3 3 120 7.31e- 1 1.37e- 4
#> 4 4 130 2.08e- 1 6.96e- 1
#> 5 5 140 6.70e- 1 8.82e- 1
#> 6 6 154 1.03e+ 2 1.34e+ 4
#> 7 7 169. 1.75e+ 4 2.45e+ 6
#> 8 8 186. 3.26e+ 6 5.02e+ 8
#> 9 9 205. 6.68e+ 8 1.13e+11
#> 10 10 225. 1.51e+11 2.80e+13

I think that for this sort of intrinsically iterative process it is genuinely hard to beat a for loop. The method proposed by #Shree depends on the NAs being continuous and starting in a known spot.
Here is my mild improvement on your loop, which I think is more readable and about 2.5 times the speed and will probably scale up better than your approach which combines vectorized operations with the loop. By moving out of the tidyverse altogether and embracing a rowwise loop that really works on each row one at a time, we get some efficiencies on both counts:
method_peter <- function(x){
for(i in 2:nrow(x)){
x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
}
return(x)
}
There's doubtless more efficiencies possible, and of course this is an ideal candidate to rewrite it in C++ :).
This is about twice as fast as your method as seen by this:
method_matt <- function(x){
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
return(x)
}
set.seed(123)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))
stopifnot(identical(method_matt(x), method_peter(x)))
library(microbenchmark)
microbenchmark(
method_matt(x),
method_peter(x)
)
which returns:
Unit: milliseconds
expr min lq mean median uq max neval
method_matt(x) 24.1975 25.50925 30.64438 26.33310 31.8681 74.5093 100
method_peter(x) 10.0005 10.56050 13.33751 11.06495 13.5913 42.0568 100
#Shree's method is much faster again and is ideal for the example data, but I'm not sure it is flexible enough to work in all your use cases.
I would like to see a tidyverse solution if there is one.

Edit: Added tidyverse approach
Here's a readable and flexible tidyverse approach. The downside is that it is very slow.
accumutate <- function(df, ...){
df %>% group_by(row_number()) %>%
nest() %>%
pull(data) %>%
accumulate(function(x,y){bind_rows(x,y) %>% mutate(!!!enquos(...)) }) %>%
.[[length(.)]]
}
x %>%
accumutate(a = ifelse(is.na(a), 1.1 * lag(a,1), a)) %>%
accumutate(b = ifelse(is.na(b), a * lag(b), b)) %>%
accumutate(c = ifelse(is.na(c),b * lag(a, 2), c))
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 2.88e- 1 4.56e- 2
#> 2 2 110 7.88e- 1 5.28e- 1
#> 3 3 120 4.09e- 1 8.92e- 1
#> 4 4 130 8.83e- 1 5.51e- 1
#> 5 5 140 9.40e- 1 4.57e- 1
#> 6 6 154 1.45e+ 2 1.88e+ 4
#> 7 7 169. 2.45e+ 4 3.43e+ 6
#> 8 8 186. 4.57e+ 6 7.04e+ 8
#> 9 9 205. 9.37e+ 8 1.59e+11
#> 10 10 225. 2.11e+11 3.94e+13
Created on 2020-10-07 by the reprex package (v0.3.0)
Here's another approach that you might find interesting. It's not concise or especially readable, but it's tidyverse (or at least functionally) inspired. And it performs fairly well.
It uses a semigroup pattern, converting the mutate expressions into binary functions, creating corresponding lists and then using accumulate.
library(tidyverse)
library(dplyr)
library(microbenchmark)
options(width =100)
set.seed(123)
# Create the data frame
x <- tibble(t = c(1:100),
a = c(seq(100, 140, 10), rep(NA_real_,100- 5)),
b = c(runif(5), rep(NA_real_, 100-5)),
c = c(runif(5), rep(NA_real_, 100-5)))
a_mappend <- function(a1, a2) {
ifelse(is.na(a2), a1 * 1.1, a2)
}
b_mappend <- function(ab1, ab2) {
list(a = ab2$a, b = ifelse(is.na(ab2$b), ab2$a * ab1$b,ab2$b))
}
c_mappend <- function(abc12, abc23) {
list(abc1 = list(a = abc12$abc2$a, b = abc12$abc2$b, c = abc12$abc2$c),
abc2 = list(a = abc23$abc2$a, b = abc23$abc2$b, c = ifelse(is.na(abc23$abc2$c),abc12$abc1$a * abc23$abc2$b,abc23$abc2$c)))
}
method_ian <- function(x) {
x %>%
mutate(a = accumulate(a, a_mappend)) %>%
mutate(b = list(a, b) %>%
pmap(~ list(a = .x, b = .y)) %>%
accumulate(b_mappend) %>% map_dbl(~ .x$b)) %>%
mutate(c = list(a, b, c, c(a[-1], NA), c(b[-1], NA), c(c[-1], NA)) %>%
pmap(~ list(abc1 = list(a = ..1, b = ..2, c = ..3),
abc2 = list(a = ..4, b = ..5, c = ..6))) %>%
accumulate(c_mappend) %>% map_dbl(~ .x$abc1$c))
}
method_matt <- function(x){
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
return(x)
}
method_peter <- function(x){
for(i in 2:nrow(x)){
x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
}
return(x)
}
stopifnot(identical(method_matt(x), method_ian(x)))
microbenchmark( method_matt(x), method_peter(x), method_ian(x))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method_matt(x) 324.90086 330.93192 337.46518 334.55447 338.38461 426.30457 100
#> method_peter(x) 208.27498 211.60526 213.59438 212.66088 214.36421 242.59854 100
#> method_ian(x) 13.06774 13.43105 14.30003 13.86428 14.32263 19.54843 100
Created on 2020-10-06 by the reprex package (v0.3.0)

I don't think there's any simple way in tidyverse to do calculations with row-dependencies. Something with Reduce or gather + spread could be possible but I don't expect them to score poits on readability.
Anyways, on the bright side, your calculations are vectorizable using dplyr and zoo packages -
x %>%
mutate(
a = ifelse(is.na(a), na.locf(a) * 1.1^(t-5), a),
b = ifelse(is.na(b), na.locf(b) * c(rep(1, 5), cumprod(a[6:n()])), b),
c = ifelse(is.na(c), b * lag(a, 2), c)
)
# A tibble: 10 x 4
t a b c
<int> <dbl> <dbl> <dbl>
1 1 100 1.85e- 1 9.43e- 1
2 2 110 7.02e- 1 1.29e- 1
3 3 120 5.73e- 1 8.33e- 1
4 4 130 1.68e- 1 4.68e- 1
5 5 140 9.44e- 1 5.50e- 1
6 6 154 1.45e+ 2 1.89e+ 4
7 7 169. 2.46e+ 4 3.45e+ 6
8 8 186. 4.59e+ 6 7.07e+ 8
9 9 205. 9.40e+ 8 1.59e+11
10 10 225. 2.12e+11 3.95e+13
Data -
set.seed(2)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))

How to keep track of duplicated rows in a data frame when do a reduction by unique(df)?

This is a followup question of this question.
Imagine the following data frame:
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
df <-data.frame(a,b)
which gives
a b
1 A 1
2 A 1
3 A 2
4 B 4
5 B 1
6 B 1
7 A 2
8 A 2
I reduce it to it's unique rows by:
df_unique <- unique(df)
Now, I am wondering how can I keep track of the merged rows. I would like to create a new column in which each component has a list of row names that have been merged. Something like the following:
df_unique_informative =
a b track
1 A 1 [1,2]
3 A 2 [3,7,8]
4 B 4 [4]
5 B 1 [5,6]

res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) x)
# OR perhaps you want
#res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x)
# paste(x, collapse = ", "))
res
# a b track
#1 A 1 1, 2
#2 B 1 5, 6
#3 A 2 3, 7, 8
#4 B 4 4
#Shorter code
res = aggregate(list(track = 1:NROW(df)), df[,1:2], '[')
Update
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
c = letters[1:8]
df <-data.frame(a,b,c, stringsAsFactors = FALSE)
res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) df$c[x])
res
# a b track
#1 A 1 a, b
#2 B 1 e, f
#3 A 2 c, g, h
#4 B 4 d

Here is one option with tidyverse
library(tidyverse)
rownames_to_column(df, 'rn') %>%
group_by(a, b) %>%
summarise(track = list(rn))

Sum of two Columns of Data Frame with NA Values

I have a data frame with some NA values. I need the sum of two of the columns. If a value is NA, I need to treat it as zero.
a b c d
1 2 3 4
5 NA 7 8
Column e should be the sum of b and c:
e
5
7
I have tried a lot of things, and done two dozen searches with no luck. It seems like a simple problem. Any help would be appreciated!

dat$e <- rowSums(dat[,c("b", "c")], na.rm=TRUE)
dat
# a b c d e
# 1 1 2 3 4 5
# 2 5 NA 7 8 7

dplyr solution, taken from here:
library(dplyr)
dat %>%
rowwise() %>%
mutate(e = sum(b, c, na.rm = TRUE))

Here is another solution, with concatenated ifelse():
dat$e <- ifelse(is.na(dat$b) & is.na(dat$c), dat$e <-0, ifelse(is.na(dat$b), dat$e <- 0 + dat$c, dat$b + dat$c))
# a b c d e
#1 1 2 3 4 5
#2 5 NA 7 8 7
Edit, here is another solution that uses with as suggested by #kasterma in the comments, this is much more readable and straightforward:
dat$e <- with(dat, ifelse(is.na(b) & is.na(c ), 0, ifelse(is.na(b), 0 + c, b + c)))

if you want to keep NA if both columns has it you can use:
Data, sample:
dt <- data.table(x = sample(c(NA, 1, 2, 3), 100, replace = T), y = sample(c(NA, 1, 2, 3), 100, replace = T))
Solution:
dt[, z := ifelse(is.na(x) & is.na(y), NA_real_, rowSums(.SD, na.rm = T)), .SDcols = c("x", "y")]
(the data.table way)

I hope that it may help you
Some cases you have a few columns that are not numeric. This approach will serve you both.
Note that: c_across() for dplyr version 1.0.0 and later
df <- data.frame(
TEXT = c("text1", "text2"), a = c(1,5), b = c(2, NA), c = c(3,7), d = c(4,8))
df2 <- df %>%
rowwise() %>%
mutate(e = sum(c_across(a:d), na.rm = TRUE))
# A tibble: 2 x 6
# Rowwise:
# TEXT a b c d e
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 text1 1 2 3 4 10
# 2 text2 5 NA 7 8 20

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

lapply: vector names as list names - r

We can also use map library(purrr) library(dplyr) map(mget(vector), ~ .x %>% mutate(c = a + b))

Related

Change the column names of a list of dataframes in R [duplicate]

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

What's a tidyverse approach to iterating over rows in a data frame when vectorisation is not feasible?

How to keep track of duplicated rows in a data frame when do a reduction by unique(df)?

Sum of two Columns of Data Frame with NA Values

Categories

Resources