Calculate new values by row - r

I'd like to create a new column (val_new) in which each value is multiplied by a value in another column (val2) by row. As I want to do this for several groups I'd prefer using dplyr, but how?
dat <- data.frame(group = rep(c("A", "B"), each = 3),
val1 = c(50, NA, NA, 40, NA, NA),
val2 = c(NA, 0.5, 0.3, NA, 0.8, 0.7))
> dat
group val1 val2
1 A 50 NA
2 A NA 0.5
3 A NA 0.3
4 B 40 NA
5 B NA 0.8
6 B NA 0.7
dat %>%
group_by(group) %>%
mutate(val_new = ifelse(!is.na(val1), val1, lag(val_new) * val2))
Error in mutate_impl(.data, dots) :
Evaluation error: object 'val_new' not found.
Desired result:
# A tibble: 6 x 4
# Groups: group [2]
group val1 val2 val_new
<fct> <dbl> <dbl> <dbl>
1 A 50 NA 50
2 A NA 0.5 25
3 A NA 0.3 7.5
4 B 40 NA 40
5 B NA 0.8 32
6 B NA 0.7 22.4

Try this:
dat %>%
group_by(group) %>%
mutate(val_new = cumprod(c(first(val1),val2[-1])))
## A tibble: 6 x 4
## Groups: group [2]
# group val1 val2 val_new
# <fct> <dbl> <dbl> <dbl>
#1 A 50 NA 50
#2 A NA 0.5 25
#3 A NA 0.3 7.5
#4 B 40 NA 40
#5 B NA 0.8 32
#6 B NA 0.7 22.4

Related

How to Filter by group and move all values to new column if any value in any of the affected columns is greater than 5 in R

I have a Datafaame like this:
dt <- tibble(
TRIAL = c("A", "A", "A", "B", "B", "B", "C", "C", "C","D","D","D"),
RL = c(1, NA, 3, 1, 6, 3, 2, 3, 1, 0, 1.5, NA),
SL = c(6, 1.5, 1, 0, 0, 1, 1, 2, 0, 1, 1.5, NA),
HC = c(0, 1, 5, 6,7, 8, 9, 3, 4, 5, 4, 2)
)
# A tibble: 12 x 4
TRIAL RL SL HC
<chr> <dbl> <dbl> <dbl>
1 A 1 6 0
2 A NA 1.5 1
3 A 3 1 5
4 B 1 0 6
5 B 6 0 7
6 B 3 1 8
7 C 2 1 9
8 C 3 2 3
9 C 1 0 4
10 D 0 1 5
11 D 1.5 1.5 4
12 D NA NA 2
I want to group the data frame by TRIAL and have the values in RL and SL checked by group, if the value in either of the column is greater than 5 then move all values for RL and SL for that particular group to RLCT and SLCT respectively.
# A tibble: 12 x 6
TRIAL HC RLCT SLCT SL RL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0 1 6 NA NA
2 A 1 NA 1.5 NA NA
3 A 5 3 1 NA NA
4 B 6 1 0 NA NA
5 B 7 6 0 NA NA
6 B 8 3 1 NA NA
7 C 9 NA NA 1 3
8 C 3 NA NA 3 5
9 C 4 NA NA 1 1
10 D 5 NA NA 1 0
11 D 4 NA NA 1.5 1.5
12 D 2 NA NA NA NA
When I run the below code, I did not get the expected output
dt0 <- dt %>%
mutate(RLCT = NA,
SLCT = NA) %>%
group_by(TRIAL) %>%
filter(!any(RL > 5.0 | SL > 5.0))
dt1 <- dt %>%
group_by(TRIAL) %>%
filter(any(RL > 5.0 | SL > 5.0)) %>%
mutate(RLCT = RL,
SLCT = SL) %>%
rbind(dt0, .) %>%
mutate(RL = ifelse(!is.na(RLCT), NA, RL),
SL = ifelse(!is.na(SLCT), NA, SL)) %>% arrange(TRIAL)
This is what I get
# A tibble: 9 x 6
# Groups: TRIAL [3]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
You can define a column to storage the condition, and change RL and SL with ifelse inside across.
dt %>%
group_by(TRIAL) %>%
mutate(cond = any(RL > 5.0 | SL > 5.0, na.rm = TRUE),
across(c(RL, SL), ~ ifelse(cond, ., NA), .names = "{.col}CT"),
across(c(RL, SL), ~ ifelse(!cond, ., NA)),
cond = NULL)
Result:
# A tibble: 12 x 6
# Groups: TRIAL [4]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
10 D 0 1 5 NA NA
11 D 1.5 1.5 4 NA NA
12 D NA NA 2 NA NA
With dplyr, you could use group_modify():
library(dplyr)
dt %>%
group_by(TRIAL) %>%
group_modify(~ {
if(any(select(.x, c(RL, SL)) > 5, na.rm = TRUE)) {
rename_with(.x, ~ paste0(.x, 'CT'), c(RL, SL))
} else {
.x
}
})
Output
# A tibble: 12 × 6
# Groups: TRIAL [4]
TRIAL RLCT SLCT HC RL SL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 6 0 NA NA
2 A NA 1.5 1 NA NA
3 A 3 1 5 NA NA
4 B 1 0 6 NA NA
5 B 6 0 7 NA NA
6 B 3 1 8 NA NA
7 C NA NA 9 2 1
8 C NA NA 3 3 2
9 C NA NA 4 1 0
10 D NA NA 5 0 1
11 D NA NA 4 1.5 1.5
12 D NA NA 2 NA NA

complete dplyr with missing ID

I understand how to use complete from tidyverse tidyverse complete a dataframe
In the example they give:
df <- tibble(
group = c(1:2, 1, 2),
item_id = c(1:2, 2, 3),
item_name = c("a", "a", "b", "b"),
value1 = c(1, NA, 3, 4),
value2 = 4:7
)
df
#> # A tibble: 4 × 5
#> group item_id item_name value1 value2
#> <dbl> <dbl> <chr> <dbl> <int>
#> 1 1 1 a 1 4
#> 2 2 2 a NA 5
#> 3 1 2 b 3 6
#> 4 2 3 b 4 7
Is there a way of adding a group and completing? e.g. add a group 3 and complete the table.
For example, I have a df which I populate in a for loop to make a plot. The df is like so:
variant Location Position variable value protein Mutation.type
FANCI_L605F FANCI chr15:89828441_C/T B 0.45 L605F nonsynonymous_SNV
PLCG2_R953* PLCG2 chr16:81969788_C/T B 0.87 R953* stopgain
STAT3_R278C STAT3 chr17:40486033_G/A B 0.38 R278C nonsynonymous_SN
FANCI_L605F FANCI chr15:89828441_C/T C 0.45 L605F nonsynonymous_SNV
PLCG2_R953* PLCG2 chr16:81969788_C/T C 0.87 R953* stopgain
STAT3_R278C STAT3 chr17:40486033_G/A C 0.38 R278C nonsynonymous_SNV
I also have a vector of possible variable names:
all_var<-c("A","B","C")
I have worked out how to add any missing variables (I think):
new_df<-complete(df,variable=all_var,Position)
>new_df
variant Location Position variable value protein Mutation.type
NA NA chr15:89828441_C/T A NA NA NA
NA NA chr16:81969788_C/T A NA NA NA
NA NA chr17:40486033_G/A A NA NA NA
FANCI_L605F FANCI chr15:89828441_C/T B 0.45 L605F nonsynonymous_SNV
PLCG2_R953* PLCG2 chr16:81969788_C/T B 0.87 R953* stopgain
STAT3_R278C STAT3 chr17:40486033_G/A B 0.38 R278C nonsynonymous_SN
FANCI_L605F FANCI chr15:89828441_C/T C 0.45 L605F nonsynonymous_SNV
PLCG2_R953* PLCG2 chr16:81969788_C/T C 0.87 R953* stopgain
STAT3_R278C STAT3 chr17:40486033_G/A C 0.38 R278C nonsynonymous_SNV
How do I now complete the variant,Location, protein, Mutation.Type?
You can use add a row by specifying the group and use complete() to complete the combinations, i.e.
library(dplyr)
library(tidyr)
df %>%
add_row(group = 3) %>%
complete(group, nesting(item_id, item_name)) %>%
drop_na(item_id)
# A tibble: 12 x 5
group item_id item_name value1 value2
<dbl> <dbl> <chr> <dbl> <int>
1 1 1 a 1 4
2 1 2 a NA NA
3 1 2 b 3 6
4 1 3 b NA NA
5 2 1 a NA NA
6 2 2 a NA 5
7 2 2 b NA NA
8 2 3 b 4 7
9 3 1 a NA NA
10 3 2 a NA NA
11 3 2 b NA NA
12 3 3 b NA NA

na.approx only when less than 3 consecutive NA in a column

mydata <-data.frame(group = c(1,1,1,1,1,2,2,2,2,2), score = c(10, NA, NA, 20, 30, 5, NA, NA, NA, 40))
From 'mydata' I am trying to use dplyr to interpolate 'x' using na.approx when there are fewer than 3 consecutive NAs between the closest non-NA entries in 'value'. The interpolated x values are store in 'x_approx'.
Without the condition on the number of consecutive NAs in 'value' I use this code:
library(zoo)
mydata %>%
group_by(group) %>%
mutate(score_approx = na.approx(score)) %>%
mutate(score_approx = coalesce(score_approx,score))
mydata
# A tibble: 10 x 3
# Groups: group [2]
group score score_approx
<dbl> <dbl> <dbl>
1 1 10 10
2 1 NA 13.3
3 1 NA 16.7
4 1 20 20
5 1 30 30
6 2 5 5
7 2 NA 13.8
8 2 NA 22.5
9 2 NA 31.2
10 2 40 40
However, the desired data frame is:
# A tibble: 10 x 3
# Groups: group [2]
group score score_approx
<dbl> <dbl> <dbl>
1 1 10 10
2 1 NA 13.3
3 1 NA 16.7
4 1 20 20
5 1 30 30
6 2 5 5
7 2 NA NA
8 2 NA NA
9 2 NA NA
10 2 40 40
You can use maxgap argument in na.approx -
library(dplyr)
library(zoo)
mydata %>%
group_by(group) %>%
mutate(score_approx = na.approx(score, maxgap = 2)) %>%
ungroup
# group score score_approx
# <dbl> <dbl> <dbl>
# 1 1 10 10
# 2 1 NA 13.3
# 3 1 NA 16.7
# 4 1 20 20
# 5 1 30 30
# 6 2 5 5
# 7 2 NA NA
# 8 2 NA NA
# 9 2 NA NA
#10 2 40 40

Unnest a data frame and fill new rows with NAs

Let's say I have a nested df, and I want to unnest the columns:
df <- tibble::tribble(
~x, ~y, ~nestdf,
1, 2, tibble::tibble(a=1:2, b=3:4),
3, 4, tibble::tibble(a=3:5, b=5:7)
)
tidyr::unnest(df, nestdf)
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 1 2 2 4
#3 3 4 3 5
#4 3 4 4 6
#5 3 4 5 7
The result has the x and y columns extended to match the dimensions of nestdf, with the new rows using the existing values. However, I want the new rows to contain NA, like so:
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7
Is it possible to do this with unnest? Either the first or last row for each group can be kept as non-NA, I don't mind.
Repeating rows, and binding with an unnest of the nested list column(s):
nr <- sapply(df$nestdf, nrow) - 1
cbind(
df[rep(rbind(seq_along(nr), NA), rbind(1, nr)), c("x","y")],
unnest(df["nestdf"], cols=everything())
)
# x y a b
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7
One way would be to change the duplicates to NA.
df1 <- tidyr::unnest(df, nestdf)
cols <- c('x', 'y')
df1[duplicated(df1[cols]), cols] <- NA
df1
# x y a b
# <dbl> <dbl> <int> <int>
#1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7
If the values in columns x and y can repeat you can create a row number to identify them uniquely -
library(dplyr)
library(tidyr)
df1 <- df %>% mutate(row = row_number()) %>% unnest(nestdf)
cols <- c('x', 'y', 'row')
df1[duplicated(df1[cols]), cols] <- NA
df1 <- select(df1, -row)
You could convert x and y to lists first:
library(tidyverse)
df <- tibble::tribble(
~x, ~y, ~nestdf,
1, 2, tibble::tibble(a=1:2, b=3:4),
3, 4, tibble::tibble(a=3:5, b=5:7)
)
df %>%
mutate_at(vars(x:y), ~map2(., nestdf, ~.x[seq(nrow(.y))])) %>%
unnest(everything())
# A tibble: 5 x 4
#x y a b
#<dbl> <dbl> <int> <int>
# 1 1 2 1 3
#2 NA NA 2 4
#3 3 4 3 5
#4 NA NA 4 6
#5 NA NA 5 7

How to take a maximum of several columns in R/dplyr [duplicate]

This question already has answers here:
Calculate max value across multiple columns by multiple groups
(5 answers)
Closed 2 years ago.
I have data which looks basically like this:
id <- c(1:5)
VolumeA <- c(12, NA, NA, NA, NA)
VolumeB <- c(NA, 34, NA, NA, NA)
VolumeC <- c(NA, NA, 56, NA, NA)
VolumeD <- c(NA, NA, NA, 78, NA)
VolumeE <- c(NA, NA, NA, NA, 90)
df_now <- tibble(id, VolumeA, VolumeB, VolumeC, VolumeD, VolumeE)
df_now
# A tibble: 5 x 6
id VolumeA VolumeB VolumeC VolumeD VolumeE
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 12 NA NA NA NA
2 2 NA 34 NA NA NA
3 3 NA NA 56 NA NA
4 4 NA NA NA 78 NA
5 5 NA NA NA NA 90
In the IRL dataset, there are MANY more Volume[label] columns, but in each row I only need one of them: the largest one. So I want to create a new variable which has the largest value:
Volume <- c(12, 34, 56, 78, 90)
df_desired <- cbind(df_now, Volume)
df_desired
id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
1 1 12 NA NA NA NA 12
2 2 NA 34 NA NA NA 34
3 3 NA NA 56 NA NA 56
4 4 NA NA NA 78 NA 78
5 5 NA NA NA NA 90 90
After looking at the dplyr documentation, I tried this...
library(tidyverse)
df_try <- df_now %>%
mutate(Volume = across(contains("Volume"), max, na.rm = TRUE))
...but got back a tibble of data, not a single column. Can someone tell me how to do this properly?
(Please assume, due to issues with my IRL data too complicated to explain here, that I cannot just gather and spread my data. I want to use a conditional mutate.)
Since you have "MANY more Volume[label] columns", any solution that works over each row (rowwise) or individually on each column (with reduce or Reduce) is going to be much slower than necessary.
df_now %>%
mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Proof of relative improvement:
Using Reduce or purrr::reduce or anything that will iterate per column (well, with nc columns, then it will iterate nc-1 times):
mypmax <- function(...) { message("mypmax"); pmax(...); }
df_now %>%
mutate(Volume = reduce(select(., starts_with('Volume')), mypmax, na.rm = TRUE))
# mypmax
# mypmax
# mypmax
# mypmax
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Anything rowwise is doing this once per row, perhaps even worse (assuming more rows than columns in your data:
mymax <- function(...) { message("mymax"); max(...); }
df_now %>%
rowwise %>%
mutate(Volume = mymax(c_across(starts_with('Volume')), na.rm = TRUE))
# mymax
# mymax
# mymax
# mymax
# mymax
# # A tibble: 5 x 7
# # Rowwise:
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Do it once across all columns, all rows:
mypmax <- function(...) { message("mypmax"); pmax(...); }
df_now %>%
mutate(Volume = do.call(mypmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
# mypmax
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
The benchmarking is minor at this scale, but will be more dramatic with larger data:
microbenchmark::microbenchmark(
red = df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE)),
row = df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE)),
sgl = df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# red 4.9736 5.36240 7.240561 5.68010 6.19915 70.7482 100
# row 4.5813 5.02020 6.082047 5.34460 5.70345 63.1166 100
# sgl 3.8270 4.18605 5.803043 4.43215 4.76030 65.7217 100
We can use pmax (first posted the pmax solution here). Note that the relative improvement is very small with do.call
library(dplyr)
library(purrr)
df_now %>%
mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))
# A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 12 NA NA NA NA 12
#2 2 NA 34 NA NA NA 34
#3 3 NA NA 56 NA NA 56
#4 4 NA NA NA 78 NA 78
#5 5 NA NA NA NA 90 90
Or with c_across and max (using only tidyverse approaches)
df_now %>%
rowwise %>%
mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))
# A tibble: 5 x 7
# Rowwise:
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 12 NA NA NA NA 12
#2 2 NA 34 NA NA NA 34
#3 3 NA NA 56 NA NA 56
#4 4 NA NA NA 78 NA 78
#5 5 NA NA NA NA 90 90
Benchmarks
system.time({df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))})
# user system elapsed
# 0.023 0.006 0.029
system.time({df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))})
# user system elapsed
# 0.012 0.002 0.015
system.time({df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))})
# user system elapsed
# 0.011 0.001 0.011
NOTE: Not that much difference in timings

Resources