Iteration over 2 lists in R - r

I would like to do an iteration with 2 lists.
For a single case, I have one dataframe df1 and one vector v1.
My reproducible example as below.
df1 <- data.frame(n1 = c(2,2,0),
n2 = c(2,1,1),
n3 = c(0,1,1),
n4 = c(0,1,1))
v1 <- c(1,2,3)
Now, I calculate an value (ses.value) for each row using this code
x <- (v1 - apply(df1, 1, mean))/apply(df1,1,sd)
Let's say we will have a list of multiple dataframes l1 and a list of vectors l2 (each list has the same number of elements) Now, I would like to run a loop for those lists by using the above code (the element of l1 must go with the element of l2 with the same position).
# 3 dataframes and 3 vectors
df1 <- data.frame(n1 = c(2,2,0), n2 = c(2,1,1), n3 = c(0,1,1), n4 = c(0,1,1))
df2 <- data.frame(n1 = c(1,6,0), n2 = c(2,1,8), n3 = c(0,2,1), n4 = c(0,7,1))
df3 <- data.frame(n1 = c(1,6,0), n2 = c(9,1,5), n3 = c(4,2,1), n4 = c(0,7,2))
v1 <- c(1,2,3)
v2 <- c(2,3,4)
v3 <- c(4,5,6)
# list
l1 <- list(df1,df2,df3)
l2 <- list(v1,v2,v3)
Since my lists are too big, using for loop might be not such a good idea, any suggestions using lapply or something similar?

We can use Map to loop over the corresponding elements of each list and then do the calculation based on OP's code
Map(function(x, y) (y - apply(x, 1, mean))/apply(x,1,sd), l1, l2)
-output
[[1]]
[1] 0.0 1.5 4.5
[[2]]
[1] 1.3055824 -0.3396831 0.4057513
[[3]]
[1] 0.1237179 0.3396831 1.8516402
Also, if the datasets are really big, use dapply from collapse, which is more efficient
library(collapse)
Map(function(x, y) (y - dapply(x, MARGIN = 1,
FUN = fmean))/dapply(x, MARGIN = 1, FUN = fsd), l1, l2)

Since your lists apparently are large, you probably could benefit from rowMeans2 and rowSds of the matrixStats package.
library(matrixStats)
Map(\(x, y) (y - rowMeans2(as.matrix(x))) / rowSds(as.matrix(x)), l1, l2)
# [[1]]
# [1] 0.0 1.5 4.5
#
# [[2]]
# [1] 1.3055824 -0.3396831 0.4057513
#
# [[3]]
# [1] 0.1237179 0.3396831 1.8516402
Data:
l1 <- list(structure(list(n1 = c(2, 2, 0), n2 = c(2, 1, 1), n3 = c(0,
1, 1), n4 = c(0, 1, 1)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(n1 = c(1, 6, 0), n2 = c(2, 1, 8), n3 = c(0,
2, 1), n4 = c(0, 7, 1)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(n1 = c(1, 6, 0), n2 = c(9, 1, 5), n3 = c(4,
2, 1), n4 = c(0, 7, 2)), class = "data.frame", row.names = c(NA,
-3L)))
l2 <- list(c(1, 2, 3), c(2, 3, 4), c(4, 5, 6))

Related

Split dataset based on column with a loop

I have been trying to get a loop that splits a dataset in multiple datasets based on a column value. However, the dataset is of a format I haven't handled before (i.e. a list containing both lists and data.tables). The dataset is reproducible by:
table1 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table2 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table3 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
list1 <- list("a" = "2019", "b" = "2020", "c" = "2021")
list2 <- list("a" = "test", "b" = "test", "c" = "test")
input_data <- list("table1" = table1, "table2" = table2, "table3" = table3,
"list1"=list1, "list2" = list2)
I need a loop that splits this dataset based on all unique instances in the scenario column. The first dataset (for scenario value "A") is reproducible by:
table1 <- data.table::data.table(Scenario =
c(rep(
c("A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0 ),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table2 <- data.table::data.table(Scenario =
c(rep(
c( "A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table3 <- data.table::data.table(Scenario =
c(rep(
c("A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
list1 <- list("a" = "2019", "b" = "2020", "c" = "2021")
list2 <- list("a" = "test", "b" = "test", "c" = "test")
input_data <- list("table1" = table1, "table2" = table2, "table3" = table3,
"list1"=list1, "list2" = list2)
Please let me know if additional information is needed.
You can write a function that wraps lapply, utilizing inherits as a check for the type of each object in the list. If the object inherits from data.frame and contains a column called Scenario then you can simply subset it. Items that are not data frames or data tables, or those that do not have columns called Scenario are left unaltered:
get_scenario <- function(S) {
lapply(input_data, function(x) {
if(!inherits(x, "data.frame"))
return(x)
else if(!"Scenario" %in% names(x))
return(x)
return(x[x$Scenario == S,])
})
}
This allows:
get_scenario("A")
#> $table1
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $table2
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $table3
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $list1
#> $list1$a
#> [1] "2019"
#>
#> $list1$b
#> [1] "2020"
#>
#> $list1$c
#> [1] "2021"
#>
#>
#> $list2
#> $list2$a
#> [1] "test"
#>
#> $list2$b
#> [1] "test"
#>
#> $list2$c
#> [1] "test"
And if you want all subgroups as one uber-list, you can do:
lapply(c("A", "B", "C"), get_scenario)

Recursive loop with dplyr::group_by

I have a function for exponential smoothing. I need to apply this for time series by group. In the beginning I need to set fixed initial values, then for each year the function calculates results that depends on the previous year's result (or on initial values if first year).
I have quite a lot data and the speed is the primary concern. So how to do this with dplyr or tidyverse?
The code below works, but just builds on the initialValues.
library(tidyverse)
library(expm)
# Function:
f <- function(L1, L2, L3, L4, L5, A) {
solve(A) %*% (expm(A) %*% (A %*% initialValues + c(L1, L2, L3, L4, L5)))
}
# Data:
df <- as_tibble(list(year = rep(2000:2002, 2),
id = rep(letters[1:2], 3),
L1 = sample(1:10, 6),
L2 = sample(1:10, 6),
L3 = sample(1:10, 6),
L4 = sample(1:10, 6),
L5 = sample(1:10, 6),
A = list(matrix(runif(25, 0, 1), ncol = 5),
matrix(runif(25, 0, 1), ncol = 5),
matrix(runif(25, 0, 1), ncol = 5),
matrix(runif(25, 0, 1), ncol = 5),
matrix(runif(25, 0, 1), ncol = 5),
matrix(runif(25, 0, 1), ncol = 5)
)))
initialValues <- c(5, 5, 6, 8, 9)
# Call:
final <- df %>%
group_by(id) %>%
mutate(result = pmap(list(L1, L2, L3, L4, L5, A), f))
The above function f works for the first year but the following year it should be something like:
solve(A) %*% (expm(A) %*% (A %*% dplyr::lag(result) + c(L1, L2,
L3, L4, L5)))
OR:
solve(A) %*% (expm(A) %*% (A %*% result[i - 1] + c(L1, L2, L3, L4,
L5)))
But result itself cannot be referred this way inside pmap.
EDIT: With helper variables and the conditional case_when in the function, I can refer to the previous value by group's id_nr, but this solution is clumsy. Any better ideas?
f1 <- function(id_nr, L1, L2, L3, L4, L5, A) {
case_when(id_nr == 1 ~ solve(A) %*% (expm(A) %*% (A %*% initialValues
+ c(L1, L2, L3, L4, L5))),
TRUE ~ NA_real_ )
}
f2 <- function(id_nr, L1, L2, L3, L4, L5, A, onebefore) {
case_when(id_nr == 2 ~ solve(A) %*% (expm(A) %*% (A %*% onebefore +
c(L1, L2, L3, L4, L5))),
TRUE ~ NA_real_ )
}
f3 <- function(id_nr, L1, L2, L3, L4, L5, A, onebefore) {
case_when(id_nr == 3 ~ solve(A) %*% (expm(A) %*% (A %*% onebefore +
c(L1, L2, L3, L4, L5))),
TRUE ~ NA_real_ )
}
final <- df %>%
group_by(id) %>%
mutate(id_nr = 1:n(),
result = pmap(list(id_nr, L1, L2, L3, L4, L5, A), f1),
result2 = pmap(list(id_nr, L1, L2, L3, L4, L5, A, result[1]), f2),
result3 = pmap(list(id_nr, L1, L2, L3, L4, L5, A, result2[2]), f3)
) %>%
select(year, id, id_nr, result, result2, result3) %>%
as.data.frame()
Gives:
# year id id_nr result
# 1 2000 a 1 69.99273, 187.46908, 133.68695, 39.14645, 192.07844
# 2 2001 b 1 150.08891, 105.06450, 134.75766, 143.28060, 86.68116
# 3 2002 a 2 NA, NA, NA, NA, NA
# 4 2000 b 2 NA, NA, NA, NA, NA
# 5 2001 a 3 NA, NA, NA, NA, NA
# 6 2002 b 3 NA, NA, NA, NA, NA
# result2 result3
# 1 NA, NA, NA, NA, NA
#NA, NA, NA, NA, NA
# 2 NA, NA, NA, NA, NA
#NA, NA, NA, NA, NA
# 3 1630.093, 2488.520, 2012.516, 1407.798, 1377.609
#NA, NA, NA, NA, NA
# 4 1751.489, 1444.543, 1531.545, 1922.810, 1544.579
#NA, NA, NA, NA, NA
# 5 NA, NA, NA, NA, NA 30153.83,
#36416.09, 19069.84, 18595.81, 31028.20
# 6 NA, NA, NA, NA, NA 22072.69,
#22904.23, 20731.95, 14812.70, 18054.79
(I still need to combine columns result, result2, result3.)

How calculate sum of positive values in each variable of dataframe in R

I have dataframe with any number of numeric variables
d <- data.frame(X1 = c(-1, -2, 0), X2 = c(10, 4, NA), X3 = c(-4, NA, NA))
How I may calculate the sum of positive values for each variable to keep them in the list, and if the variable has no positive values or all the values NA, return to this variable is 0.
We can use the apply and ifelse functions to iterate through each column and replace NA or negative values with 0
apply(d, 2, function(x) sum(ifelse(is.na(x) | x < 0, 0, x)))
Edit - concision
As #joel.wilson pointed out, there is a more concise way of coding the logic:
apply(d, 2, function(x) sum(x[x > 0], na.rm = TRUE))
library(expss)
d <- data.frame(X1 = c(-1, -2, 0), X2 = c(10, 4, NA), X3 = c(-4, NA, NA))
sum_col_if(gt(0), d)
# X1 X2 X3
# 0 14 0

count non-NA values and group by variable

I am trying to show how many complete observations there are per variabie ID without using the complete.cases package or any other package.
If I use na.omit to filter out the NA values, I will lose all of the IDs which might have ZERO complete cases.
In the end, I'd like a frequency table with two columns: ID and Number of Complete Observations
> length(unique(data$ID))
[1] 332
> head(data)
ID value
1 1 NA
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 1 NA
> dim(data)
[1] 772087 2
When I try to create my own function z - which counts non-NA values and apply that in the aggregate() function, the IDs with zero complete observations are left out. I should be left with 332 rows, not 323. How does one resolve this using base functions?
z <- function(x){
sum(!is.na(x))
}
aggregate(value ~ ID, data = data , FUN = "z")
> nrow(aggregate(isna ~ ID, data = data , FUN = "z"))
[1] 323
One of the ways to do this is using table:
df2 <- table(df$Id, !is.na(df$value))[,2]
data.frame(ID = names(df2), value = df2)
Data
structure(list(Id = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4), value = c(NA,
1, 1, 2, 2, NA, 3, NA, 3, 3, 4, 4)), .Names = c("Id", "value"
), row.names = c(NA, -12L), class = "data.frame")
Base R you can use your utility function like this:
stack(by(data$value, data$ID, FUN=function(x) sum(!is.na(x))))
you can directly use table for this purpose. Below is the sample code:
df1 <- structure(list(Id = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4), value = c(2,
1, 1, NA, NA, NA, 3, NA, 3, 3, 4, 4)), .Names = c("Id", "value"
), row.names = c(NA, -12L), class = "data.frame")
df2 <- as.data.frame.matrix(with(df1, table(Id, value)))
resultDf <- data.frame(Id=row.names(df2), count=apply(df2, 1, sum))
resultDf
The code makes a table of id and value. Then it just sums the non-na values from the table. Hope this is easy to understand and helps.

Merge two lists with dataframes into one list with merged dataframes

I have two lists of data frames: listA and listB. How to get a list of merged dataframes (listC)?
dfA1 <- data.frame(x1 = c("a", "b"), y1 = c(1, 2), row.names = c("1", "2"))
dfA2 <- data.frame(x1 = c("c", "d"), y1 = c(3, 4), row.names = c("1", "3"))
dfB1 <- data.frame(x2 = c("c", "d"), y2 = c(3, 4), row.names = c("1", "2"))
dfB2 <- data.frame(x2 = c("e", "f"), y2 = c(5, 6), row.names = c("2", "3"))
listA <- list(dfA1, dfA2) # first input list
listB <- list(dfB1, dfB2) # second input list
m1 <- merge(dfA1, dfB1, by = 0, all = T)
m2 <- merge(dfA2, dfB2, by = 0, all = T)
listC <- list(m1, m2) # desired output list
Found following solution:
listC <- mapply(function(x, y) merge(x, y, by = 0, all = T), x = listA, y = listB, SIMPLIFY = F)

Resources