Mutate with function over multiple columns - r

I have questionnaire (EHP30) answers from a list of participants, where they are rating something between 0 and 4, or -9 for not relevant. The overall score is the sum of the scores scaled to 100. If there are any not relevant answers they are ignored (unless they are all not relevant, in which case the output is missing). Any missing items sets the whole output to missing.
I have written a function that calculates the score from an input vector:
ehp30_sexual <- function(scores = c(0, 0, 0, 0, 0)){
if(anyNA(scores)){
return(NA)
} else if(!all(scores %in% c(-9, 0, 1, 2, 3, 4))){
stop("Values not in correct range (-9, 0, 1, 2, 3, 4)")
} else if(length(scores) != 5){
stop("Must be vector length of 5")
} else if(all(scores == -9)){
return(NA)
} else if(any(scores == -9)){
newscores <- scores[which(scores != -9)]
sum(newscores) * 100 / (4 * length(newscores))
} else {
sum(scores) * 100 / (4 * length(scores))
}
}
I wish to apply this function to each row of a dataframe using mutate if possible (or apply if not):
ans <- c(NA, -9, 0, 1, 2, 3, 4)
set.seed(1)
data <- data.frame(id = 1:10,
ePainAfterSex = sample(ans, 10, TRUE),
eWorriedSex = sample(ans, 10, TRUE),
eAvoidSex = sample(ans, 10, TRUE),
eGuiltyNoSex = sample(ans, 10, TRUE),
eFrustratedNoSex = sample(ans, 10, TRUE))
Any ideas? I'm happy to rewrite the function or use a case_when solution if it is any simpler.

Using dplyr::rowwise() and c_across() (inspired by #edvinsyk’s answer):
set.seed(1)
library(dplyr)
data %>%
rowwise() %>%
mutate(score = ehp30_sexual(
c_across(ePainAfterSex:eFrustratedNoSex)
)) %>%
ungroup()
# A tibble: 10 × 7
id ePainAfterSex eWorriedSex eAvoidSex eGuiltyNoSex eFrustratedNoSex score
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA 0 NA -9 -9 NA
2 2 1 0 4 3 3 55
3 3 4 NA 2 NA 4 NA
4 4 NA 2 2 1 1 NA
5 5 -9 2 NA 4 1 NA
6 6 2 -9 NA NA 1 NA
7 7 4 3 3 1 -9 68.8
8 8 0 3 2 0 1 30
9 9 3 -9 2 3 NA NA
10 10 -9 4 -9 -9 4 100

Is something like this what you're after? Seems easier than the function you supplied.
data = tibble(data)
data |>
mutate(across(where(is.numeric), ~ ifelse(.x == -9, NA, .x))) |>
rowwise() |>
mutate(index = sum(c_across(2:6), na.rm = TRUE)) |>
ungroup() |>
mutate(score = round(scales::rescale(index, to = c(0, 100))))
id ePainAfterSex eWorriedSex eAvoidSex eGuiltyNoSex eFrustratedNoSex index score
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA 0 NA NA NA 0 0
2 2 1 0 4 3 3 11 100
3 3 4 NA 2 NA 4 10 91
4 4 NA 2 2 1 1 6 55
5 5 NA 2 NA 4 1 7 64
6 6 2 NA NA NA 1 3 27
7 7 4 3 3 1 NA 11 100
8 8 0 3 2 0 1 6 55
9 9 3 NA 2 3 NA 8 73
10 10 NA 4 NA NA 4 8 73

Related

How to count occurrences of a specific value across multiple columns?

I want to count how many times a specific value occurs across multiple columns and put the number of occurrences in a new column. My dataset has a lot of missing values but only if the entire row consists solely of NA's, it should return NA. If possible, I would prefer something that works with dplyr pipelines.
Example dataset:
df <- data.frame(c1 = sample(1:4, 20, replace = TRUE),
c2 = sample(1:4, 20, replace = TRUE),
c3 = sample(1:4, 20, replace = TRUE),
c4 = sample(1:4, 20, replace = TRUE),
c5 = sample(1:4, 20, replace = TRUE))
for (i in 1:5) {
df[sample(1:20, 1), sample(1:5, 1)] <- NA
df[sample(1:20, 1), ] <- NA
}
c1 c2 c3 c4 c5
1 1 2 4 4 1
2 2 2 1 3 4
3 2 4 4 3 3
4 4 2 3 2 1
5 4 2 4 1 3
6 NA 1 2 4 4
7 3 NA 4 NA 4
8 NA NA NA NA NA
9 1 3 3 2 2
10 NA NA NA NA NA
I have tried with rowwise() and rowSums. Some non-working examples here:
# First attempt
df <- df %>%
rowwise() %>%
mutate(count2 = sum(c_across(c1:c5, ~.x %in% 2)))
# Second attempt
df <- df %>%
rowwise() %>%
mutate(count2 = sum(c_across(select(where(c1:c5 %in% 2)))))
# With rowSums
df <- df %>%
rowwise() %>%
mutate(count4 = rowSums(select(c1:c5 %in% 4), na.rm = TRUE))
How about this:
library(dplyr)
df <- data.frame(c1 = sample(1:4, 20, replace = TRUE),
c2 = sample(1:4, 20, replace = TRUE),
c3 = sample(1:4, 20, replace = TRUE),
c4 = sample(1:4, 20, replace = TRUE),
c5 = sample(1:4, 20, replace = TRUE))
for (i in 1:5) {
df[sample(1:20, 1), sample(1:5, 1)] <- NA
df[sample(1:20, 1), ] <- NA
}
df %>%
rowwise() %>%
mutate(count2 = sum(na.omit(c_across(c1:c5)) == 2),
count2 = ifelse(all(is.na(c_across(c1:c5))), NA, count2))
#> # A tibble: 20 × 6
#> # Rowwise:
#> c1 c2 c3 c4 c5 count2
#> <int> <int> <int> <int> <int> <int>
#> 1 NA NA NA NA NA NA
#> 2 2 2 3 4 2 3
#> 3 1 1 1 4 4 0
#> 4 2 3 3 2 4 2
#> 5 NA NA NA NA NA NA
#> 6 1 1 1 2 1 1
#> 7 3 3 2 3 4 1
#> 8 1 1 4 3 4 0
#> 9 NA NA NA NA NA NA
#> 10 NA NA NA NA NA NA
#> 11 2 3 3 4 1 1
#> 12 2 1 4 2 NA 2
#> 13 4 4 2 NA 2 2
#> 14 4 2 3 3 2 2
#> 15 1 3 4 2 2 2
#> 16 1 1 3 3 2 1
#> 17 1 1 1 4 4 0
#> 18 2 4 4 NA 1 1
#> 19 NA NA NA NA NA NA
#> 20 4 1 1 NA 4 0
Created on 2022-12-08 by the reprex package (v2.0.1)

How to Filter by group and move all values to new column if any value in any of the affected columns is greater than 5 in R

I have a Datafaame like this:
dt <- tibble(
TRIAL = c("A", "A", "A", "B", "B", "B", "C", "C", "C","D","D","D"),
RL = c(1, NA, 3, 1, 6, 3, 2, 3, 1, 0, 1.5, NA),
SL = c(6, 1.5, 1, 0, 0, 1, 1, 2, 0, 1, 1.5, NA),
HC = c(0, 1, 5, 6,7, 8, 9, 3, 4, 5, 4, 2)
)
# A tibble: 12 x 4
TRIAL RL SL HC
<chr> <dbl> <dbl> <dbl>
1 A 1 6 0
2 A NA 1.5 1
3 A 3 1 5
4 B 1 0 6
5 B 6 0 7
6 B 3 1 8
7 C 2 1 9
8 C 3 2 3
9 C 1 0 4
10 D 0 1 5
11 D 1.5 1.5 4
12 D NA NA 2
I want to group the data frame by TRIAL and have the values in RL and SL checked by group, if the value in either of the column is greater than 5 then move all values for RL and SL for that particular group to RLCT and SLCT respectively.
# A tibble: 12 x 6
TRIAL HC RLCT SLCT SL RL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0 1 6 NA NA
2 A 1 NA 1.5 NA NA
3 A 5 3 1 NA NA
4 B 6 1 0 NA NA
5 B 7 6 0 NA NA
6 B 8 3 1 NA NA
7 C 9 NA NA 1 3
8 C 3 NA NA 3 5
9 C 4 NA NA 1 1
10 D 5 NA NA 1 0
11 D 4 NA NA 1.5 1.5
12 D 2 NA NA NA NA
When I run the below code, I did not get the expected output
dt0 <- dt %>%
mutate(RLCT = NA,
SLCT = NA) %>%
group_by(TRIAL) %>%
filter(!any(RL > 5.0 | SL > 5.0))
dt1 <- dt %>%
group_by(TRIAL) %>%
filter(any(RL > 5.0 | SL > 5.0)) %>%
mutate(RLCT = RL,
SLCT = SL) %>%
rbind(dt0, .) %>%
mutate(RL = ifelse(!is.na(RLCT), NA, RL),
SL = ifelse(!is.na(SLCT), NA, SL)) %>% arrange(TRIAL)
This is what I get
# A tibble: 9 x 6
# Groups: TRIAL [3]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
You can define a column to storage the condition, and change RL and SL with ifelse inside across.
dt %>%
group_by(TRIAL) %>%
mutate(cond = any(RL > 5.0 | SL > 5.0, na.rm = TRUE),
across(c(RL, SL), ~ ifelse(cond, ., NA), .names = "{.col}CT"),
across(c(RL, SL), ~ ifelse(!cond, ., NA)),
cond = NULL)
Result:
# A tibble: 12 x 6
# Groups: TRIAL [4]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
10 D 0 1 5 NA NA
11 D 1.5 1.5 4 NA NA
12 D NA NA 2 NA NA
With dplyr, you could use group_modify():
library(dplyr)
dt %>%
group_by(TRIAL) %>%
group_modify(~ {
if(any(select(.x, c(RL, SL)) > 5, na.rm = TRUE)) {
rename_with(.x, ~ paste0(.x, 'CT'), c(RL, SL))
} else {
.x
}
})
Output
# A tibble: 12 × 6
# Groups: TRIAL [4]
TRIAL RLCT SLCT HC RL SL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 6 0 NA NA
2 A NA 1.5 1 NA NA
3 A 3 1 5 NA NA
4 B 1 0 6 NA NA
5 B 6 0 7 NA NA
6 B 3 1 8 NA NA
7 C NA NA 9 2 1
8 C NA NA 3 3 2
9 C NA NA 4 1 0
10 D NA NA 5 0 1
11 D NA NA 4 1.5 1.5
12 D NA NA 2 NA NA

Replacing leading NAs by group with 0s, but Keep other NAs

I have a COVID data frame grouped by state with 60 columns. As the COVID started at different times across states, therefore there are NAs before values for different states. Different indicators (column9) also have data starting differently. Below is a sample df I made for the demonstration.
state <- c(rep("A", 6), rep("B", 6))
time <- c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6)
x1 <- c(NA, NA, NA, 4, 5, 6, NA, NA, 3, 4, 5, NA)
x2 <- c(NA, 2, 3, NA, 5, 6, NA, NA, NA, 4, 5, 6)
x3 <- c(NA, NA, 3, 4, 5, NA, NA, 2, NA, 4, 5, 6)
df <- data.frame(state, time, x1, x2, x3)
df
state time x1 x2 x3
1 A 1 NA NA NA
2 A 2 NA 2 NA
3 A 3 NA 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 NA NA NA
8 B 2 NA NA 2
9 B 3 3 NA NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6
I'm trying to replace all the leading NAs with 0 for each state, but keep other NAs. The results should look like below:
state time x1 x2 x3
1 A 1 0 0 0
2 A 2 0 2 0
3 A 3 0 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 0 0 0
8 B 2 0 0 2
9 B 3 3 0 NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6
One solution I came up with is to replace NAs by the condition of the cumulative sums, as below:
df1 <- df %>%
group_by(state) %>%
mutate(
check.sum1 = cumsum(replace_na(x1, 0)),
x1 = if_else(check.sum1 != 0, x1, 0),
check.sum2 = cumsum(replace_na(x2, 0)),
x2 = if_else(check.sum2 != 0, x2, 0),
check.sum3 = cumsum(replace_na(x3, 0)),
x3 = if_else(check.sum3 != 0, x3, 0)
)
df1
This method worked fine. But since there are 60 columns, I want to wrap it up with a function and/or use apply(). But it gives out error messages:
df2 <- df %>%
group_by(state) %>%
apply(
df[3:5], MARGIN = 2, FUN = function(x) mutate(
check.sum = cumsum(replace_na(x, 0)),
x = if_else(check.sum != 0, x, 0)
)
)
Error in FUN(newX[, i], ...) : unused argument (df[3:5])
#or
func <- function(x) {
mutate(
check.sum = cumsum(replace_na(x, 0)),
x = if_else(check.sum != 0, x, 0)
)
}
df3 <- df %>%
group_by(state) %>%
apply(
df[3:5], MARGIN = 2, func
)
Error in match.fun(FUN) :
'df[3:5]' is not a function, character or symbol
So there are three specific questions:
How to create the user-defined functions by using columns as arguments.
How to use apply() function. and
Are there any other ways of using exiting functions, such as na.locf() or na.trim() to do the job?
Thank you!
Using by and looking where a column is.na and NA is not repeated, i.e. boolean differences are smaller or equal to zero.
do.call(rbind, by(df, df$state, \(x) {
x[] <- lapply(x, \(z) {z[is.na(z) & c(0, diff(is.na(z))) <= 0] <- 0; z})
return(x)
}))
# state time x1 x2 x3
# A.1 A 1 0 0 0
# A.2 A 2 0 2 0
# A.3 A 3 0 3 3
# A.4 A 4 4 NA 4
# A.5 A 5 5 5 5
# A.6 A 6 6 6 NA
# B.7 B 1 0 0 0
# B.8 B 2 0 0 2
# B.9 B 3 3 0 NA
# B.10 B 4 4 4 4
# B.11 B 5 5 5 5
# B.12 B 6 NA 6 6
Note: Please use update R>=4.1 for \(x) function shorthand notation or write function(x).
Using dplyr, we can do
library(dplyr)
df %>%
group_by(state) %>%
mutate(across(starts_with('x'), ~ replace(., !cumsum(!is.na(.)), 0))) %>%
ungroup
# A tibble: 12 × 5
state time x1 x2 x3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1 0 0 0
2 A 2 0 2 0
3 A 3 0 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 0 0 0
8 B 2 0 0 2
9 B 3 3 0 NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6

Calculate value based on multiple other values

I would like to make a some new variables in R based on multiple (>100) other variables.
My dataset looks like this
sub_id diag_1_ais diag_2_ais diag_3_ais diag_4_ais diag_5_ais diag_1_br diag_2_br diag_3_br diag_4_br diag_5_br
1 1 1 1 2 2 1 6 0 1 6 1
2 2 2 3 2 5 1 3
3 3 0 0 <NA> 4 1 0 0 <NA> 2 2
4 4 NA 1 2 2 NA 1 1 4
5 5 NA 4 2 3 5 NA 4 3 4 3
The variables diag_x_ais can take integers from 0-6, and diag_x_br can take integers between 1-6.
I would like to make 6 new variables corresponding to the 6 possible diag_x_br values, i.e. the new variables would be called br_1, br_2 ... br_6. These new variables shall then be filled with the maximum value of the corresponding diag_x_ais variables, i.e.
if diag_1_br, diag_2_br, and diag_4_br are all 3, then br_3 should take the maximum value of diag_1_ais, diag_2_ais, and diag_4_ais.
Please also see the example dataset below:
sub_id diag_1_ais diag_2_ais diag_3_ais diag_4_ais diag_5_ais diag_1_br diag_2_br diag_3_br diag_4_br diag_5_br br_1 br_2 br_3 br_4 br_5 br_6
1 1 1 1 2 2 1 6 0 1 6 1 2 NA NA NA NA 2
2 2 2 1 4 3 5 5 2 2 1 3 3 4 5 NA 2 NA
3 3 0 0 NA 4 1 0 0 NA 2 2 NA 4 NA NA NA NA
4 4 NA 1 2 2 NA 1 1 4 2 NA NA 2 NA NA
5 5 NA 4 2 3 5 NA 4 3 4 3 NA NA 5 4 NA NA
Hereafter, I would like a final variable which calculates the sum of the up to three largest br_x variables, example displayed below:
sub_id diag_1_ais diag_2_ais diag_3_ais diag_4_ais diag_5_ais diag_1_br diag_2_br diag_3_br diag_4_br diag_5_br br_1 br_2 br_3 br_4 br_5 br_6 sum3
1 1 1 1 2 2 1 6 0 1 6 1 2 NA NA NA NA 2 4
2 2 2 1 4 3 5 5 2 2 1 3 3 4 5 NA 2 NA 12
3 3 0 0 NA 4 1 0 0 NA 2 2 NA 4 NA NA NA NA 4
4 4 NA 1 2 2 NA 1 1 4 2 NA NA 2 NA NA 4
5 5 NA 4 2 3 5 NA 4 3 4 3 NA NA 5 4 NA NA 9
My actual dataset has 60 diag_x_ais variables and 60 diag_x_br variables and 4000 rows.
I hope that someone can help me do this in R. Thank you!
I think you could use the following solution. I made a slight modification so that we only sum the first 3 max values:
library(dplyr)
library(purrr)
df %>%
bind_cols(as.data.frame(t(map_dfr(1:6, function(a) pmap_dfc(df, ~ {x <- c(...)[grepl("br", names(df))]
inds <- which(x == a)
if(length(inds) != 0) {
y <- c(...)[grepl("ais", names(df))]
max(y[inds])
} else {
NA
}})))) %>%
setNames(paste0("br", 1:6))) %>%
rowwise() %>%
mutate(sum = sum(sort(as.numeric(c_across(starts_with("br"))), decreasing = TRUE)[1:3], na.rm = TRUE)) %>%
select(starts_with("br"), sum)
Resulting output
# A tibble: 5 x 7
# Rowwise:
br1 br2 br3 br4 br5 br6 sum
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 2 NA NA NA NA 2 4
2 3 4 5 NA 2 NA 12
3 NA 4 NA NA NA NA 4
4 2 NA NA 2 NA NA 4
5 NA NA 5 4 NA NA 9
You could use some heavy data.transforming most likely not very efficient on large datasets. There are some empty values, NA and 0 in your dataset. I didn't handle them (and replaced the empty values by NA to make importing easier).
library(tidyr)
library(dplyr)
data %>%
pivot_longer(-sub_id,
names_to = c("name", "cat"),
names_pattern = ".*_(\\d+)_(.*)") %>%
pivot_wider(names_from = "cat") %>%
group_by(sub_id, br) %>%
summarise(value = max(ais), .groups = "drop") %>%
filter(br %in% 1:6) %>%
group_by(sub_id) %>%
mutate(sum = sum(tail(sort(value), 3))) %>%
pivot_wider(names_from = br,
names_glue = "br_{br}") %>%
select(sub_id, paste0("br_", 1:6), sum)
This returns
# A tibble: 5 x 8
sub_id br_1 br_2 br_3 br_4 br_5 br_6 sum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 NA NA NA NA 2 4
2 2 3 4 5 NA 2 NA 12
3 3 NA 4 NA NA NA NA 4
4 4 2 NA NA 2 NA NA 4
5 5 NA NA 5 4 NA NA 9
Piping an addtional right_join(data, by = "sub_id") gives you your example output (minus the order of your columns).
I took an idea from this answer.
Data
data <- structure(list(sub_id = c(1, 2, 3, 4, 5), diag_1_ais = c(1, 2,
0, NA, NA), diag_2_ais = c(1, 1, 0, NA, 4), diag_3_ais = c(2,
4, NA, 1, 2), diag_4_ais = c(2, 3, 4, 2, 3), diag_5_ais = c(1,
5, 1, 2, 5), diag_1_br = c(6, 5, 0, NA, NA), diag_2_br = c(0,
2, 0, NA, 4), diag_3_br = c(1, 2, NA, 1, 3), diag_4_br = c(6,
1, 2, 1, 4), diag_5_br = c(1, 3, 2, 4, 3)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
For the first part:
data <- data.frame(sub_id = c(1,2,3,4,5),
diag_1_ais = c(1,2,0,NA,NA),
diag_2_ais = c(1,1,0,NA,4),
diag_3_ais = c(2,4,NA,1,2),
diag_4_ais = c(2,3,4,2,3),
diag_5_ais = c(1,5,1,2,5),
diag_1_br = c(6,5,0,NA,NA),
diag_2_br = c(0,2,0,NA,4),
diag_3_br = c(1,2,NA,1,3),
diag_4_br = c(6,1,2,1,4),
diag_5_br = c(1,3,2,4,3))
calc_br <- function(data, value, firstBr, lastBr) {
br <- c()
for (i in 1:nrow(data)){
if (length(which(data[i,c(firstBr:lastBr)] %in% value))!=0){
br <- c(br, c(max(data[i,which(data[i,c(firstBr:lastBr)] %in% value)+1])))
}
else {
br <- c(br, c(NA))
}
}
result <- br
}
firstBr = 7
lastBr = 11
data$br_1 <- calc_br(data,1,firstBr,lastBr)
data$br_2 <- calc_br(data,2,firstBr,lastBr)
data$br_3 <- calc_br(data,3,firstBr,lastBr)
data$br_4 <- calc_br(data,4,firstBr,lastBr)
data$br_5 <- calc_br(data,5,firstBr,lastBr)
data$br_6 <- calc_br(data,6,firstBr,lastBr)
This should yield the same results as in your example. You should only have to exchange lastBr and firstBr (to 62 and 122 i would guess).
For the second part this should do the trick:
br_sum <- c()
for (i in 1:nrow(data)){
br_sum <- c(br_sum, sum(data[i,lastBr+tail(order(data[i,c((lastBr+1):(lastBr+6))], na.last = NA), 3)]))
}
data$br_sum <- br_sum
For completness here my results:
sub_id diag_1_ais diag_2_ais diag_3_ais diag_4_ais diag_5_ais diag_1_br
1 1 1 1 2 2 1 6
2 2 2 1 4 3 5 5
3 3 0 0 NA 4 1 0
4 4 NA NA 1 2 2 NA
5 5 NA 4 2 3 5 NA
diag_2_br diag_3_br diag_4_br diag_5_br br_1 br_2 br_3 br_4 br_5 br_6 br_sum
1 0 1 6 1 2 NA NA NA NA 2 4
2 2 2 1 3 3 4 5 NA 2 NA 12
3 0 NA 2 2 NA 4 NA NA NA NA 4
4 NA 1 1 4 2 NA NA 2 NA NA 4
5 4 3 4 3 NA NA 5 4 NA NA 9

Applying custom function to each row uses only first value of argument

I am trying to recode NA values to 0 in a subset of columns using the following dataset:
set.seed(1)
df <- data.frame(
id = c(1:10),
trials = sample(1:3, 10, replace = T),
t1 = c(sample(c(1:9, NA), 10)),
t2 = c(sample(c(1:7, rep(NA, 3)), 10)),
t3 = c(sample(c(1:5, rep(NA, 5)), 10))
)
Each row has a certain number of trials associated with it (between 1-3), specified by the trials column. columns t1-t3 represent scores for each trial.
The number of trials indicates the subset of columns in which NAs should be recoded to 0: NAs that are within the number of trials represent missing data, and should be recoded as 0, while NAs outside the number of trials are not meaningful, and should remain NAs. So, for a row where trials == 3, an NA in column t3 would be recoded as 0, but in a row where trials == 2, an NA in t3 would remain an NA.
So, I tried using this function:
replace0 <- function(x, num.sun) {
x[which(is.na(x[1:(num.sun + 2)]))] <- 0
return(x)
}
This works well for single vectors. When I try applying the same function to a data frame with apply(), though:
apply(df, 1, replace0, num.sun = df$trials)
I get a warning saying:
In 1:(num.sun + 2) :
numerical expression has 10 elements: only the first used
The result is that instead of having the value of num.sun change every row according to the value in trials, apply() simply uses the first value in the trials column for every single row. How could I apply the function so that the num.sun argument changes according to the value of df$trials?
Thanks!
Edit: as some have commented, the original example data had some non-NA scores that didn't make sense according to the trials column. Here's a corrected dataset:
df <- data.frame(
id = c(1:5),
trials = c(rep(1, 2), rep(2, 1), rep(3, 2)),
t1 = c(NA, 7, NA, 6, NA),
t2 = c(NA, NA, 3, 7, 12),
t3 = c(NA, NA, NA, 4, NA)
)
Another approach:
# create an index of the NA values
w <- which(is.na(df), arr.ind = TRUE)
# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
# subset 'w' such that only the NA's which fall in the scope of 'm' remain
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
# use 'i' to replace the allowed NA's with a zero
df[i] <- 0
which gives:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
You could easily wrap this in a function:
replace.NA.with.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
df[i] <- 0
return(df)
}
Now, using replace.NA.with.0(df) will produce the above result.
As noted by others, some rows (1, 3 & 10) have more values than trails. You could tackle that problem by rewriting the above function to:
replace.with.NA.or.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
df[w] <- 0
v <- tapply(m[,2], m[,1], FUN = function(x) tail(x:5,-1))
ina <- matrix(as.integer(unlist(stack(v)[2:1])), ncol = 2)
df[ina] <- NA
return(df)
}
Now, using replace.with.NA.or.0(df) produces the following result:
id trials t1 t2 t3
1 1 1 3 NA NA
2 2 2 2 2 NA
3 3 2 6 6 NA
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 NA
9 9 2 1 3 NA
10 10 1 9 NA NA
Here I just rewrite your function using double subsetting x[paste0('t',x['trials'])], which overcome the problem in the other two solutions with row 6
replace0 <- function(x){
#browser()
x_na <- x[paste0('t',x['trials'])]
if(is.na(x_na)){x[paste0('t',x['trials'])] <- 0}
return(x)
}
t(apply(df, 1, replace0))
id trials t1 t2 t3
[1,] 1 1 3 NA 5
[2,] 2 2 2 2 NA
[3,] 3 2 6 6 4
[4,] 4 3 NA 1 2
[5,] 5 1 5 NA NA
[6,] 6 3 7 NA 0
[7,] 7 3 8 7 0
[8,] 8 2 4 5 1
[9,] 9 2 1 3 NA
[10,] 10 1 9 4 3
Here is a way to do it:
x <- is.na(df)
df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
The output looks like this:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
Note: row 1/3/10, is problematic since there are more non-NA values than the trials.
Here's a tidyverse way, note that it doesn't give the same output as other solutions.
Your example data shows results for trials that "didn't happen", I assumed your real data doesn't.
library(tidyverse)
df %>%
nest(matches("^t\\d")) %>%
mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
unnest
# id trials t1 t2 t3
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
Using the more commonly used gather strategy this would be:
df %>%
gather(k,v,matches("^t\\d")) %>%
arrange(id) %>%
group_by(id) %>%
slice(1:first(trials)) %>%
mutate_at("v",~replace(.,is.na(.),0)) %>%
spread(k,v)
# # A tibble: 10 x 5
# # Groups: id [10]
# id trials t1 t2 t3
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA

Resources