gather multiple columns with nested, repeated measures - r

I have a dataset of people (pid) of different types (type2 = c("dad", "mom", "kid"); for ease, type = c("a", "b", "c")) nested in households (hid) with repeated measurements (time).
Some variables like v1_ are asked to everyone, but the values are spread across three columns. For instance, v1_a contains the values for all of the dads (type==a).
Variables like v2_ are only asked of dads and moms (a's and b's), and the values are spread across two columns.
Variables like v3 are also only asked to dads and moms, but the values are contained in one column.
Variables like v4 are asked to everyone, and the values are contained in one column.
Have:
hid pid type type2 time v1_a v1_b v1_c v2_a v2_b v3 v4
1 1 1 a dad 1 6 NA NA 2 NA 4 3
2 1 2 b mom 1 NA 2 NA NA 5 6 6
3 1 3 c kid 1 NA NA 1 NA NA NA 5
4 2 4 a dad 1 3 NA NA 6 NA 2 6
5 2 5 b mom 1 NA 5 NA NA 2 4 3
6 2 6 c kid 1 NA NA 3 NA NA NA 5
7 1 1 a dad 2 3 NA NA 2 NA 4 3
8 1 2 b mom 2 NA 3 NA NA 5 6 6
9 1 3 c kid 2 NA NA 2 NA NA NA 5
10 2 4 a dad 2 2 NA NA 6 NA 2 6
11 2 5 b mom 2 NA 3 NA NA 2 4 3
12 2 6 c kid 2 NA NA 2 NA NA NA 5
Here is the end result I want:
hid pid type type2 time v1 v2 v3 v4
1 1 1 a dad 1 6 2 4 3
2 1 2 b mom 1 2 5 6 6
3 1 3 c kid 1 1 NA NA 5
4 2 4 a dad 1 3 6 2 6
5 2 5 b mom 1 5 2 4 3
6 2 6 c kid 1 3 NA NA 5
7 1 1 a dad 2 3 2 4 3
8 1 2 b mom 2 3 5 6 6
9 1 3 c kid 2 2 NA NA 5
10 2 4 a dad 2 2 6 2 6
11 2 5 b mom 2 3 2 4 3
12 2 6 c kid 2 2 NA NA 5
I'm looking for a tidyverse approach that will handle a larger actual use case of mixed variables as shown here. The variable naming is consistent. Where do I go after gather()?
library(tidyverse)
# Input data: 2 households x 3 people x 2 time points (12 rows).
# Each v1_*/v2_* column is filled only for the person type in its suffix.
df_have <- data.frame(
  hid   = rep(rep(c(1, 2), each = 3), times = 2),
  pid   = rep(c(1, 2, 3, 4, 5, 6), times = 2),
  type  = rep(c("a", "b", "c"), times = 4),
  type2 = rep(c("dad", "mom", "kid"), times = 4),
  time  = rep(c(1, 2), each = 6),
  # v1: asked of everyone, spread over three type-specific columns
  v1_a  = c(6, NA, NA, 3, NA, NA, 3, NA, NA, 2, NA, NA),
  v1_b  = c(NA, 2, NA, NA, 5, NA, NA, 3, NA, NA, 3, NA),
  v1_c  = c(NA, NA, 1, NA, NA, 3, NA, NA, 2, NA, NA, 2),
  # v2: asked of dads and moms only, spread over two columns
  v2_a  = rep(c(2, NA, NA, 6, NA, NA), times = 2),
  v2_b  = rep(c(NA, 5, NA, NA, 2, NA), times = 2),
  # v3 / v4: already stored in a single column each
  v3    = rep(c(4, 6, NA, 2, 4, NA), times = 2),
  v4    = rep(c(3, 6, 5, 6, 3, 5), times = 2)
)
# Target data: same 12 rows and id columns as df_have, with each v1_*/v2_*
# family collapsed into a single v1/v2 column.
df_want <- data.frame(
  hid   = rep(rep(c(1, 2), each = 3), times = 2),
  pid   = rep(c(1, 2, 3, 4, 5, 6), times = 2),
  type  = rep(c("a", "b", "c"), times = 4),
  type2 = rep(c("dad", "mom", "kid"), times = 4),
  time  = rep(c(1, 2), each = 6),
  v1    = c(6, 2, 1, 3, 5, 3, 3, 3, 2, 2, 3, 2),
  v2    = rep(c(2, 5, NA, 6, 2, NA), times = 2),
  v3    = rep(c(4, 6, NA, 2, 4, NA), times = 2),
  v4    = rep(c(3, 6, 5, 6, 3, 5), times = 2)
)
# Starting point only: stack every column except the five id columns into
# key/value pairs (the result is printed, not assigned).
df_have %>%
gather(key, value, -hid, -pid, -type, -type2, -time)

Here is another idea using coalesce from dplyr and map from purrr.
library(tidyverse)
# Target column names: one per measure family (v1..v4).
cols <- paste0("v", 1:4)
# For each target, coalesce the source columns named exactly `vN` or
# `vN_<suffix>`. Anchoring the pattern keeps `v1` from also picking up
# `v10`, `v11`, ... when there are more measures; a plain prefix match
# (starts_with) would merge those families together.
nums <- map(cols, function(stem) {
  sources <- select(
    df_have,
    matches(paste0("^", stem, "(_|$)"), ignore.case = FALSE)
  )
  # Row-wise: the first non-missing value across the family's columns.
  coalesce(!!!as.list(sources))
})
# Assemble the coalesced vectors into a data frame.
# as_tibble() replaces the deprecated as_data_frame().
nums_df <- nums %>%
  setNames(cols) %>%
  as_tibble()
# Keep the id columns, drop only the original measure columns (`v<digits>`
# with an optional `_suffix`), then attach the collapsed measures.
df_test <- df_have %>%
  select(-matches("^v[0-9]+(_|$)", ignore.case = FALSE)) %>%
  bind_cols(nums_df)
df_test
# hid pid type type2 time v1 v2 v3 v4
# 1 1 1 a dad 1 6 2 4 3
# 2 1 2 b mom 1 2 5 6 6
# 3 1 3 c kid 1 1 NA NA 5
# 4 2 4 a dad 1 3 6 2 6
# 5 2 5 b mom 1 5 2 4 3
# 6 2 6 c kid 1 3 NA NA 5
# 7 1 1 a dad 2 3 2 4 3
# 8 1 2 b mom 2 3 5 6 6
# 9 1 3 c kid 2 2 NA NA 5
# 10 2 4 a dad 2 2 6 2 6
# 11 2 5 b mom 2 3 2 4 3
# 12 2 6 c kid 2 2 NA NA 5

This gets me there, but the filter(!is.na(value)) step seems like a hack. Better ideas?
# Long -> collapse keys -> wide:
#   1. stack all measure columns into key/value pairs,
#   2. drop the cells that are empty,
#   3. strip the `_a`/`_b`/`_c` suffix so each family shares one key,
#   4. spread back to one column per key and sort the rows.
df_test <- df_have %>%
  gather(key, value, -c(hid, pid, type, time, type2)) %>%
  filter(!is.na(value)) %>%
  mutate(key = str_replace(key, "_.*", "")) %>%
  spread(key, value) %>%
  arrange(time, hid, type, pid)
Update, following @www's suggestion (pass na.rm=TRUE to gather() instead of filtering afterwards):
# Same reshape as before, but gather() itself discards the NA cells
# (na.rm = TRUE), so no separate filter step is needed.
df_test <- df_have %>%
  gather(key, value, -c(hid, pid, type, time, type2), na.rm = TRUE) %>%
  mutate(key = str_replace(key, "_.*", "")) %>%
  spread(key, value) %>%
  arrange(time, hid, type, pid)

Related

Create new variable based on outcome of other variable in group - R

This is a similar/follow-up question to "R: How to code new variable based on grouped variable and conditioned on earlier row", but it is different because within donors there are potentially two match runs.
I have a data file with organ donors. I'm looking at lungs that are donated - there are two lungs.
If the lungs are split (L and R) and put up for donation, they are each attempted to match with recipients ("matchrun"). They go through eligible recipients until one matches ("sequence").
If the lung is matched to a recipient, it goes to them ("organ_placed").
If the lung doesn't match, it continues in the sequence and then just remains NA at the maximum sequence number.
I would like to create a new variable that has the outcome of the match run such that if one lung is placed and the other is not, it tells you that the lung was discarded. i.e. see case of Donor 2 in the data - the left lung is placed, but the right doesn't match.
In donor 3, the first match run doesn't match but the match run for the other lung does.
I figure it will be something like group_by(donorid, matchrun) but then how do you make a condition based on the match run?
# tribble() is exported by the tibble package; there is no package called
# "tribble", so library(tribble) would fail.
library(tibble)
library(dplyr)
# One row per offer in a match run: `sequence` is the position within the
# run and `organ_placed` is "L" on the row where the lung was placed,
# NA otherwise.
data <- tribble(
  ~donorid, ~matchrun, ~sequence, ~organ_placed,
  2, 3, 1, NA,
  2, 3, 2, NA,
  2, 3, 3, "L",
  2, 4, 1, NA,
  2, 4, 2, NA,
  2, 4, 3, NA,
  3, 5, 1, NA,
  3, 5, 1, NA,
  3, 5, 1, NA,
  3, 6, 1, NA,
  3, 6, 2, NA,
  3, 6, 3, "L"
)
# Expected result: the input rows plus an `organ` column that is filled only
# on the row where a lung is placed ("Left Single") and on the last row of a
# match run that placed nothing ("Right Discarded").
desired_outcome <- tribble(
~donorid, ~matchrun, ~sequence, ~organ_placed, ~organ,
2, 3, 1, NA, NA,
2, 3, 2, NA, NA,
2, 3, 3, "L", "Left Single",
2, 4, 1, NA, NA,
2, 4, 2, NA, NA,
2, 4, 3, NA, "Right Discarded",
3, 5, 1, NA, NA,
3, 5, 1, NA, NA,
3, 5, 1, NA, "Right Discarded",
3, 6, 1, NA, NA,
3, 6, 2, NA, NA,
3, 6, 3, "L", "Left Single")
You can try this:
data %>%
  group_by(donorid) %>%
  # Donor-level summary: the side that was placed when exactly one distinct
  # side was placed for this donor, otherwise "B".
  mutate(temp = ifelse(n_distinct(organ_placed, na.rm = TRUE) == 1,
                       unique(na.omit(organ_placed)),
                       "B")) %>%
  group_by(matchrun, .add = TRUE) %>%
  # Within each match run: label the placed row, or, when the run placed
  # nothing, label its LAST row as the discard of the other side.
  # The last row is found with n() rather than max(sequence), because
  # `sequence` is not guaranteed to run 1..n within a run (donor 3, run 5
  # has sequence 1, 1, 1), which would put the label on the wrong row.
  mutate(organ = case_when(
    organ_placed == "L" ~ "Left Single",
    organ_placed == "R" ~ "Right Single",
    all(is.na(organ_placed)) & row_number() == n() & temp == "L" ~ "Right Discarded",
    all(is.na(organ_placed)) & row_number() == n() & temp == "R" ~ "Left Discarded"
  )) %>%
  ungroup()
output
donorid matchrun sequence organ_placed temp organ
1 1 1 1 NA B NA
2 1 1 2 NA B NA
3 1 1 3 L B Left Single
4 1 2 1 NA B NA
5 1 2 2 NA B NA
6 1 2 3 R B Right Single
7 2 3 1 NA L NA
8 2 3 2 NA L NA
9 2 3 3 L L Left Single
10 2 4 1 NA L NA
11 2 4 2 NA L NA
12 2 4 3 NA L Right Discarded
Update: we have to add matchrun to the group. Removed prior solution:
# Label each row's outcome within a (donorid, matchrun) group.
# Rows where a lung was placed get the matching "Single"/"Bilateral" label;
# every row that matches no branch falls through to NA.
# NOTE(review): the two "Discarded" branches need the group's last row to be
# NA *and* "L"/"R" to appear in the same group. Because the grouping includes
# matchrun, a run that placed nothing contains no "L"/"R", so those branches
# cannot fire for it; the printed result below shows no "Discarded" rows.
data %>%
group_by(donorid, matchrun) %>%
mutate(outcome = case_when(organ_placed == "L" ~ "Left Single",
organ_placed == "R" ~ "Right Single",
organ_placed == "B" ~ "Bilateral",
(is.na(organ_placed) &
row_number() == max(row_number())) &
"L" %in% organ_placed ~ "Right Discarded",
(is.na(organ_placed) &
row_number() == max(row_number())) &
"R" %in% organ_placed ~ "Left Discarded",
TRUE ~ NA_character_))
Groups: donorid, matchrun [4]
donorid matchrun sequence organ_placed outcome
<dbl> <dbl> <dbl> <chr> <chr>
1 2 3 1 NA NA
2 2 3 2 NA NA
3 2 3 3 L Left Single
4 2 4 1 NA NA
5 2 4 2 NA NA
6 2 4 3 NA NA
7 3 5 1 NA NA
8 3 5 1 NA NA
9 3 5 1 NA NA
10 3 6 1 NA NA
11 3 6 2 NA NA
12 3 6 3 L Left Single
We can use
library(data.table)
library(stringr)
# Convert `data` to a data.table by reference and add seq2, a running row
# counter within each (donorid, matchrun) combination.
setDT(data)[, seq2 := rowid(donorid, matchrun) ]
# Translate the placement codes "L"/"R" into labels; NA stays NA.
data[, organ := str_replace_all(organ_placed,
setNames(c("Left Single", "Right Single"), c("L", "R")))]
# On the rows where seq2 equals max(seq2), processed per donorid (the third
# argument): keep an existing label, otherwise take whichever "Single" label
# is absent from those rows (setdiff) and rename it to "Discarded".
# NOTE(review): max(seq2) appears to be taken over the whole table, not per
# run, so this presumably assumes all runs have the same length; confirm.
# Finally drop the helper column and print the result.
data[seq2 == max(seq2),
organ := fcase(!is.na(organ), organ, default =
str_replace_all(setdiff(c("Left Single", "Right Single"), organ),
setNames(c("Left Discarded", "Right Discarded"),
c("Left Single", "Right Single")))), donorid
][, seq2 := NULL][]
-output
> data
donorid matchrun sequence organ_placed organ
1: 2 3 1 <NA> <NA>
2: 2 3 2 <NA> <NA>
3: 2 3 3 L Left Single
4: 2 4 1 <NA> <NA>
5: 2 4 2 <NA> <NA>
6: 2 4 3 <NA> Right Discarded
7: 3 5 1 <NA> <NA>
8: 3 5 1 <NA> <NA>
9: 3 5 1 <NA> Right Discarded
10: 3 6 1 <NA> <NA>
11: 3 6 2 <NA> <NA>
12: 3 6 3 L Left Single

R incrementing a variable in dplyr

I have the following grouped data frame:
library(dplyr)
# Sample data: three students (A, B, C), each observed in grades 1-3,
# with the recorded age partly missing.
df <- data.frame(
  student = rep(c("A", "B", "C"), each = 3),
  grade = rep(c(1, 2, 3), times = 3),
  age = c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)
# Same sample data as in the question: 3 students x 3 grades.
df <- data.frame(
  student = rep(c("A", "B", "C"), each = 3),
  grade = rep(c(1, 2, 3), times = 3),
  age = c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
# Per student, count how many non-missing ages have been seen so far; the
# first one adds 0, the second adds 1, and so on. Rows with a missing age
# stay NA because NA + anything is NA.
by_student <- group_by(df, student)
mutate(by_student, age = age + cumsum(!is.na(age)) - 1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
in data.table, assuming the order of the rows is the 'correct' order:
library(data.table)
# Within each student, rowid(age) numbers the repeated occurrences of the same
# age value (1, 2, 3, ...), so a repeated age advances by one per row.
# NA ages stay NA. Adds the column by reference after converting df in place.
setDT(df)[, new_age := age + rowid(age) - 1, by = .(student)]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9

replace negative values with na using na_if{dplyr}

Let's say I have the following dataframe:
# Example data: two numeric columns; V2 contains the negative values that
# should become NA.
dat <- tribble(
~V1, ~V2,
2, -3,
3, 2,
1, 3,
3, -4,
5, 1,
3, 2,
1, -4,
3, 4,
4, 1,
3, -5,
4, 2,
3, 4
)
How can I replace negative values with NA using na_if()? I know how to do this using ifelse, but don't manage to come up with a correct condition for na_if():
> dat %>%
+ mutate(V2 = ifelse(V2 < 0, NA, V2))
# A tibble: 12 x 2
V1 V2
<dbl> <dbl>
1 2 NA
2 3 2
3 1 3
4 3 NA
5 5 1
6 3 2
7 1 NA
8 3 4
9 4 1
10 3 NA
11 4 2
12 3 4

build a network edge table from a sparse table

I don't know exactly how to explain it but...
I have a sparse table where each group represents a level. The columns are ordered: the downstream (left) column represents a child node and the upstream (right) column represents a parent node.
I'd like a two-column table where the 1st column is the parent node and the 2nd is the child node. If possible, I'd also like a 3rd column with the length (sum of the number of final nodes) of the parents.
Follow the example:
>tt <- tibble(
ID = letters[1:8],
`1` = c( 1, 1, 1, 1, 2, 2, 2, 2),
`2` = c( 3, 3, 4, 4, 5, 5, 5, 6),
`3` = c( 7, 7, 8, 9,10,10,11,12)
)
> tt
# A tibble: 8 x 4
ID `1` `2` `3`
<chr> <dbl> <dbl> <dbl>
1 a 1 3 7
2 b 1 3 7
3 c 1 4 8
4 d 1 4 9
5 e 2 5 10
6 f 2 5 10
7 g 2 5 11
8 h 2 6 12
>dput(tt)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
`1` = c(1, 1, 1, 1, 2, 2, 2, 2), `2` = c(3, 3, 4, 4, 5, 5,
5, 6), `3` = c(7, 7, 8, 9, 10, 10, 11, 12)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
the result should be:
>ttt <- tibble(
parent = c(1,1,2,2,3,4,4, 5, 5, 6, 7,7,8,9,10,10,11,12),
child = c(3,4,5,6,7,8,9,10,11,12, letters[1:8] ),
length = c(4,4,4,4,2,2,2, 3, 3, 1, 2,2,1,1, 2, 2, 1, 1)
)
>ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
> dput(ttt)
structure(list(parent = c(1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7,
8, 9, 10, 10, 11, 12), child = c("3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "a", "b", "c", "d", "e", "f", "g", "h"
), length = c(4, 4, 4, 4, 2, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 2,
1, 1)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"
))
Any help is appreciated.
Thanks in advance.
This gets you 90% of the way there:
# Move ID (column 1) to the end so that adjacent columns read
# parent -> child from left to right.
tt_correct <- tt[, c(2, 3, 4, 1)]

# Edge list for the k-th pair of adjacent columns: rename them to
# parent/child, attach how many rows share each parent, then de-duplicate.
edge_pairs <- function(k) {
  pair <- tt_correct[, c(k, k + 1)]
  names(pair) <- c('parent', 'child')
  pair$length <- ave(pair$parent, pair$parent, FUN = length)
  unique(pair)
}

# Stack the edge lists of every adjacent column pair.
ttt <- do.call(rbind, lapply(seq_len(length(tt) - 1), edge_pairs))
ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
The first part is correcting the order. Your expected output indicates that the 1st column is a child of the 4th column. The lapply() statement largely walks along the data.frame and stacks the data.
This is 90% of the way because the answer doesn't agree with your expected output for lengths. I think this is correct but I could be wrong.
Finally, and I'm not that good with igraph, you could likely find additional information doing:
library(igraph)
# Build a graph from the parent/child columns (the first two) and plot it.
plot(graph_from_data_frame(ttt[, 1:2]))

Compute the sum of variables in data frames allocated in a list in R considering differents conditions over others variables

Hi everybody, I am working with a list of data frames in R. Lists are awesome in R, but I want to solve this. I have a list named global that has five data frames f1, f2, f3, f4, f5. Each data frame has a principal variable named CreditValue and variables that work like flags. For example, f1 has CreditValue and a flag variable b1 with values of 1. f2 has two flag variables: b1 with values of 1 and b2 with values of 2. f3 has three flag variables: b1 with values of 1, b2 with values of 2 and b3 with values of 3. f4 has four flag variables: b1 with values of 1, b2 with values of 2, b3 with values of 3 and b4 with values of 4. f5 has five flag variables: b1 with values of 1, b2 with values of 2, b3 with values of 3, b4 with values of 4 and b5 with values of 5. Flag variables always start in column 3 for all data frames. I wish to compute the sum of CreditValue in each data frame considering different conditions on the flag variables. My list has the following structure (I include the dput version at the end):
global
$f1
KeyID CreditValue b1
1 001 1 1
2 002 2 1
3 003 3 1
4 004 4 1
5 005 5 1
6 006 6 1
7 007 7 1
8 009 8 1
9 010 9 1
$f2
KeyID CreditValue b1 b2
1 001 1 1 2
2 002 2 1 2
3 003 3 NA 2
4 004 4 NA 2
5 005 5 NA 2
6 006 6 1 2
7 007 7 1 2
8 009 8 NA 2
9 010 9 1 2
10 011 10 NA 2
11 012 11 1 2
$f3
KeyID CreditValue b1 b2 b3
1 001 1 1 2 3
2 002 2 1 2 3
3 003 3 1 2 3
4 004 4 1 2 3
5 005 5 NA 2 3
6 006 6 NA 2 3
7 007 7 1 2 3
8 009 8 1 2 3
9 010 9 NA NA 3
10 011 10 NA NA 3
11 012 11 NA 2 3
12 013 11 1 2 3
13 014 11 NA NA 3
$f4
KeyID CreditValue b1 b2 b3 b4
1 001 1 NA 2 3 4
2 002 2 NA 2 3 4
3 003 3 NA NA NA 4
4 004 4 NA NA NA 4
5 005 5 NA NA NA 4
6 006 6 1 2 3 4
7 007 7 1 2 3 4
8 009 8 1 2 3 4
9 010 9 1 2 3 4
10 011 10 1 2 3 4
11 012 11 1 2 3 4
12 013 11 1 2 3 4
13 014 11 1 2 3 4
14 015 12 1 NA 3 4
15 016 12 1 NA 3 4
$f5
KeyID CreditValue b1 b2 b3 b4 b5
1 001 1 1 2 3 4 5
2 002 2 1 2 3 4 5
3 003 3 1 2 3 4 5
4 004 4 1 2 3 4 5
5 005 5 NA NA 3 4 5
6 006 6 1 2 3 4 5
7 007 7 1 2 3 4 5
8 009 8 1 2 3 4 5
9 010 9 1 2 3 4 5
10 011 10 NA NA NA NA 5
11 012 11 1 2 3 4 5
12 013 11 1 2 3 4 5
13 014 11 1 2 3 4 5
14 015 12 1 2 3 4 5
15 016 12 1 2 3 4 5
16 017 14 NA NA NA 4 5
17 018 14 NA NA NA 4 5
I have used the llply() function from the plyr package to work with lists in R, but I don't know how to define a function to do this. I computed the sums using the code below, but if I had more data frames it would become very complex. I would also like to save these values in a new data frame or matrix, taking the flag variables (5) into account. The results of the sums are as follows:
sum(f1$CreditValue[f1[,3]==1])
[1] 45
sum(f2$CreditValue[f2[,3]==1],na.rm=TRUE)
[1] 36
sum(f3$CreditValue[f3[,3]==1],na.rm=TRUE)
[1] 36
sum(f4$CreditValue[f4[,3]==1],na.rm=TRUE)
[1] 97
sum(f5$CreditValue[f5[,3]==1],na.rm=TRUE)
[1] 97
These sums are computed applying those formulas considering b1 variable in all data frames.
sum(f2$CreditValue[is.na(f2[,3]) & f2[,4]==2] ,na.rm=TRUE)
[1] 30
sum(f3$CreditValue[is.na(f3[,3]) & f3[,4]==2] ,na.rm=TRUE)
[1] 22
sum(f4$CreditValue[is.na(f4[,3]) & f4[,4]==2] ,na.rm=TRUE)
[1] 3
sum(f5$CreditValue[is.na(f5[,3]) & f5[,4]==2] ,na.rm=TRUE)
[1] 0
These sums are computed applying those formulas considering values of b2 and b1 variables in all data frames. Here there is a condition over values of b1 (column 3).
sum(f3$CreditValue[is.na(f3[,3]) & is.na(f3[,4]) & f3[,5]==3] ,na.rm=TRUE)
[1] 30
sum(f4$CreditValue[is.na(f4[,3]) & is.na(f4[,4]) & f4[,5]==3] ,na.rm=TRUE)
[1] 0
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & f5[,5]==3] ,na.rm=TRUE)
[1] 5
These sums are computed applying those formulas considering values of b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1 and b2 (columns 3, 4).
sum(f4$CreditValue[is.na(f4[,3]) & is.na(f4[,4]) & is.na(f4[,5]) & f4[,6]==4] ,na.rm=TRUE)
[1] 12
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & f5[,6]==4] ,na.rm=TRUE)
[1] 28
These sums are computed applying those formulas considering values of b4, b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1, b2 and b3 (columns 3, 4, 5).
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & is.na(f5[,6]) & f5[,7]==5] ,na.rm=TRUE)
[1] 10
This sum is computed applying last formula considering values of b5, b4, b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1, b2, b3 and b4 (columns 3, 4, 5, 6).
The showed sum are the result of a lot of code but I would like to create a function that works over flag variables (b1, b2, b3, b4, b5) to compute the sums. I don't know if it is possible to make this with a for or a function that works with llply or lapply. I have tried to resume code like this:
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & is.na(f5[,6]) & f5[,7]==5] ,na.rm=TRUE)
With this code:
sum(f5$CreditValue[is.na(f5[,3,4,5,6]) & f5[,7]==5] ,na.rm=TRUE)
But it doesn't work, because with the original conditions I am considering only specific rows in each data frame and the shortened code doesn't do this. I would like to save the results of the sums in a new data frame or matrix like this:
f1 f2 f3 f4 f5
f1 45 0 0 0 0
f2 36 30 0 0 0
f3 36 22 30 0 0
f4 97 3 0 12 0
f5 97 0 5 28 10
The zeros in the last data frame are produced due to all data frames don't have all flag variables for example f1 only has b1 and it doesn't have b2,b3,b4,b5 like f5. The dput version of my list is the next:
structure(list(f1 = structure(list(KeyID = c("001", "002", "003",
"004", "005", "006", "007", "009", "010"), CreditValue = c(1,
2, 3, 4, 5, 6, 7, 8, 9), b1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1)), .Names = c("KeyID",
"CreditValue", "b1"), row.names = c(NA, 9L), class = "data.frame"),
f2 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012"), CreditValue = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11), b1 = c(1, 1, NA, NA, NA,
1, 1, NA, 1, NA, 1), b2 = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2)), .Names = c("KeyID", "CreditValue", "b1", "b2"), row.names = c(NA,
11L), class = "data.frame"), f3 = structure(list(KeyID = c("001",
"002", "003", "004", "005", "006", "007", "009", "010", "011",
"012", "013", "014"), CreditValue = c(1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 11, 11), b1 = c(1, 1, 1, 1, NA, NA, 1, 1, NA,
NA, NA, 1, NA), b2 = c(2, 2, 2, 2, 2, 2, 2, 2, NA, NA, 2,
2, NA), b3 = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)), .Names = c("KeyID",
"CreditValue", "b1", "b2", "b3"), row.names = c(NA, 13L), class = "data.frame"),
f4 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012", "013", "014",
"015", "016"), CreditValue = c(1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 11, 11, 12, 12), b1 = c(NA, NA, NA, NA, NA, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), b2 = c(2, 2, NA, NA, NA, 2, 2, 2,
2, 2, 2, 2, 2, NA, NA), b3 = c(3, 3, NA, NA, NA, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3), b4 = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4)), .Names = c("KeyID", "CreditValue", "b1",
"b2", "b3", "b4"), row.names = c(NA, 15L), class = "data.frame"),
f5 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012", "013", "014",
"015", "016", "017", "018"), CreditValue = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 11, 11, 12, 12, 14, 14), b1 = c(1,
1, 1, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, NA, NA), b2 = c(2,
2, 2, 2, NA, 2, 2, 2, 2, NA, 2, 2, 2, 2, 2, NA, NA), b3 = c(3,
3, 3, 3, 3, 3, 3, 3, 3, NA, 3, 3, 3, 3, 3, NA, NA), b4 = c(4,
4, 4, 4, 4, 4, 4, 4, 4, NA, 4, 4, 4, 4, 4, 4, 4), b5 = c(5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), .Names = c("KeyID",
"CreditValue", "b1", "b2", "b3", "b4", "b5"), row.names = c(NA,
17L), class = "data.frame")), .Names = c("f1", "f2", "f3",
"f4", "f5"))
I hope you can help me it is so complex for me building a function to compute the sums and If I use traditional forms of code I would have problems with lists of more data frames. Thanks for your help.
You can use lapply and call a function that builds the rows of your output data frame:
# Sum CreditValue per flag for one data frame.
#
# For flag y (1..n.flags) the sum covers the rows where every earlier flag
# column is NA and the y-th flag column equals y. Flag columns are taken by
# position: flag y sits in column y + 2 (flags start in column 3). A flag
# whose name `b<y>` is not among the columns of `df` contributes 0.
#
# Args:
#   df:      data frame with a CreditValue column and flag columns b1, b2, ...
#            starting at column 3.
#   n.flags: number of flags to report (default 5, the original fixed value).
#
# Returns:
#   Numeric vector of length n.flags, one sum per flag.
get.sums <- function(df, n.flags = 5L) {
  # vapply() pins the result to one number per flag, so the return type
  # cannot change shape the way sapply()'s can.
  vapply(seq_len(n.flags), function(y) {
    if (!paste0("b", y) %in% names(df)) {
      return(0)
    }
    # Columns of the flags before y; empty for y == 1, in which case the
    # row test below is TRUE for every row.
    earlier.cols <- seq_len(y - 1L) + 2L
    no.earlier.flag <-
      rowSums(!is.na(df[, earlier.cols, drop = FALSE])) == 0
    # Rows where flag y is NA index as NA and are dropped by na.rm.
    sum(df$CreditValue[no.earlier.flag & df[[y + 2L]] == y], na.rm = TRUE)
  }, numeric(1))
}
# One vector of per-flag sums for each data frame in `global`.
rows <- lapply(global, get.sums)
# Stack them into a matrix: one row per data frame, one column per flag.
sums <- do.call(rbind, rows)
sums
# [,1] [,2] [,3] [,4] [,5]
# f1 45 0 0 0 0
# f2 36 30 0 0 0
# f3 36 22 30 0 0
# f4 97 3 0 12 0
# f5 97 0 5 28 10

Resources