dplyr get a group-level variable from the lagged group - r

Let's say I have a df with groups and a group-level variable like a mean. How do I produce a variable which is the group-level mean of the lagged group, where the only rows with NA for this variable are those in the first group?
e.g:
df <- data_frame(group = c(1,1,2,2),
grouped.mean = c(2.5,2.5,3.5,3.5))
# my attempt
df %<>%
group_by(group) %>%
mutate(lag.group.mean = lag(grouped.mean))
# A tibble: 4 x 3
# Groups: group [2]
group grouped.mean lag.group.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 2.50
3 2. 3.50 NA
4 2. 3.50 3.50
Desired output:
group grouped.mean lag.group.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 NA
3 2. 3.50 2.50
4 2. 3.50 2.50
Thanks!
EDIT: more challenging example:
df <- data_frame(group = c(1,1,2,3,3,3),
grouped.mean = c(2.5,2.5,3.5,4.5,4.5,4.5))
expected output:
group grouped.mean lag.grouped.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 NA
3 2. 3.50 2.50
4 3. 4.50 3.50
5 3. 4.50 3.50
6 3. 4.50 3.50

Here is an option. The key is to use distinct to remove duplicated rows, create the lag.group.mean column, and then left_join to the original data frame.
library(dplyr)
# Example data: one row per observation, with the group mean repeated on
# every row of its group. tibble() replaces the deprecated data_frame().
df <- tibble(
  group = c(1, 1, 2, 2),
  grouped.mean = c(2.5, 2.5, 3.5, 3.5)
)
# Collapse to one row per group, lag across those rows, then join the lagged
# value back onto every row of the original data.
df2 <- df %>%
  distinct() %>%
  mutate(lag.group.mean = lag(grouped.mean)) %>%
  left_join(df, ., by = c("group", "grouped.mean"))
df2
# # A tibble: 4 x 3
# group grouped.mean lag.group.mean
# <dbl> <dbl> <dbl>
# 1 1 2.5 NA
# 2 1 2.5 NA
# 3 2 3.5 2.5
# 4 2 3.5 2.5

The lagged group value is the first globally lagged value within each group:
library(tidyverse)
# Example data with groups of different sizes. tibble() replaces the
# deprecated data_frame().
df <- tibble(
  group = c(1, 1, 2, 3, 3, 3),
  grouped.mean = c(2.5, 2.5, 3.5, 4.5, 4.5, 4.5)
)
df %>%
  # Lag over the whole table first: the first row of each group then holds
  # the previous group's mean (NA for the very first group).
  mutate(lag.grouped.mean = lag(grouped.mean)) %>%
  group_by(group) %>%
  # Spread that first value over every row of the group.
  mutate(lag.grouped.mean = first(lag.grouped.mean))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group grouped.mean lag.grouped.mean
#> <dbl> <dbl> <dbl>
#> 1 1 2.5 NA
#> 2 1 2.5 NA
#> 3 2 3.5 2.5
#> 4 3 4.5 3.5
#> 5 3 4.5 3.5
#> 6 3 4.5 3.5
But it's probably easier to see what's happening if you use a join like in
@www's answer.
Created on 2018-08-06 by the reprex package (v0.2.0.9000).

Related

Loop to make a basic table for many variables by condition

I am running an experiment where participants are randomly assigned to one of two conditions, and then I collect data on several variables. Here is an example of my code:
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
For each variable (var1, var2, etc.) I would like a little table with the count, mean, and standard deviation. This code creates the kind of table that I want:
group_by(df, df$condition) %>%
summarise(
count = n(),
mean = mean(var1),
sd = sd(var1))
But because I have many variables, I would like to use some kind of loop (or "lapply"?) to create all these tables at once. It would also be great if each table could show the name of the variable. Thanks!
You can just use summarise on all the variables, i.e.
library(dplyr)
group_by(df, condition) %>%
summarise(across(everything(), ~ c(count = n(), mean = mean(.), sd = sd(.))))
`summarise()` has grouped output by 'condition'. You can override using the `.groups` argument.
# A tibble: 6 x 5
# Groups: condition [2]
condition var1 var2 var3 var4
<fct> <dbl> <dbl> <dbl> <dbl>
1 Digital 5 5 5 5
2 Digital 5.8 2.2 3.8 3
3 Digital 1.10 1.64 3.03 1.22
4 Physical 5 5 5 5
5 Physical 5.6 4.6 4.6 4.6
6 Physical 1.14 1.82 2.41 1.34
You can control the output structure by changing the object that the formula returns, e.g. a data frame instead of a vector:
group_by(df, condition) %>%
summarise(across(everything(), ~ data.frame(count = n(), mean = mean(.), sd = sd(.))))
# A tibble: 2 x 5
condition var1$count $mean $sd var2$count $mean $sd var3$count $mean $sd var4$count $mean $sd
<fct> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 Physical 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
We could still do it with summarise, using a list of functions:
library(dplyr)
# Summarise every `var*` column per condition in a single call. The names of
# the function list become the output suffixes (var1_n, var1_mean, var1_sd).
df %>%
  group_by(condition) %>%
  summarise(across(
    starts_with("var"),
    .fns = list(
      n = ~ n(),
      # na.rm is set inside each lambda: passing it through across()'s `...`
      # is deprecated as of dplyr 1.1.0.
      mean = ~ mean(.x, na.rm = TRUE),
      sd = ~ sd(.x, na.rm = TRUE)
    )
  ))
condition var1_n var1_mean var1_sd var2_n var2_mean var2_sd var3_n var3_mean var3_sd var4_n var4_mean var4_sd
<dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 -1 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 1 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
# Print one count/mean/sd table per measured variable, split by condition.
# Every column except the first (`condition`) is treated as a variable;
# names(df)[-1] is also safe when there are no such columns.
for (var in names(df)[-1]) {
  # `.data[[var]]` looks the column up by its string name, so no temporary
  # select()/rename() step on an external character vector is needed.
  tab <- df %>%
    group_by(condition) %>%
    summarise(
      count = n(),
      mean = mean(.data[[var]]),
      sd = sd(.data[[var]])
    )
  print(var)
  print(tab)
}
gives
[1] "var1"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10
2 Physical 5 5.6 1.14
[1] "var2"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 2.2 1.64
2 Physical 5 4.6 1.82
[1] "var3"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3.8 3.03
2 Physical 5 4.6 2.41
[1] "var4"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3 1.22
2 Physical 5 4.6 1.34
>
Rather than lapply, the function of choice is aggregate, a close relative to the *apply family at least. Put in a custom function f.
# Summary statistics for one numeric vector: number of values (n), their
# mean (mu) and their standard deviation (sd), returned as a named vector.
f <- function(x) {
  c(n = length(x), mu = mean(x), sd = sd(x))
}
aggregate(. ~ condition, df, f)
# condition var1.n var1.mu var1.sd var2.n var2.mu var2.sd var3.n var3.mu var3.sd var4.n var4.mu var4.sd
# 1 Digital 5.000000 5.800000 1.095445 5.000000 2.200000 1.643168 5.000000 3.800000 3.033150 5.000000 3.000000 1.224745
# 2 Physical 5.000000 5.600000 1.140175 5.000000 4.600000 1.816590 5.000000 4.600000 2.408319 5.000000 4.600000 1.341641
If you want to aggregate on a specific set of variables (e.g. assembled with grep), use list notation instead.
aggregate(df[grep('^var', names(df))], df['condition'], f)
You can use gtsummary here if you need to present the results.
Example one below will make one table with all of your variables. Example two will split each variable into its own table (if you need them to be separate).
library(gtsummary)
#example one:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) "))
#example two:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) ")) %>%
tbl_split(variables = c(var1, var2,var3,var4))

How to transform a tibble from one column to two columns with repeated observations

I am trying to transform df into df2. I have done it in a rather patchy way using df3. Is there a simpler and more elegant way of doing it?
library(tidyverse)
# I want to transform df
df <- tibble(id = c(1, 2, 1, 2, 1, 2),
time = c('t1', 't1', 't2', 't2', 't3', 't3'),
value = c(2, 3, 6, 4, 5, 7))
df
#> # A tibble: 6 x 3
#> id time value
#> <dbl> <chr> <dbl>
#> 1 1 t1 2
#> 2 2 t1 3
#> 3 1 t2 6
#> 4 2 t2 4
#> 5 1 t3 5
#> 6 2 t3 7
# into df2
df2 <- tibble(id = c(1, 2, 1, 2),
t = c(2, 3, 6, 4),
r = c(6, 4, 5, 7))
df2
#> # A tibble: 4 x 3
#> id t r
#> <dbl> <dbl> <dbl>
#> 1 1 2 6
#> 2 2 3 4
#> 3 1 6 5
#> 4 2 4 7
# This is how I did it, but I think it should be a better way
df3 <- df %>% pivot_wider(names_from = time, values_from = value)
b <- tibble(id = numeric(), t = numeric(), r = numeric())
for (i in 2:3){
a <- df3[,c(1,i,i+1)]
colnames(a) <- c('id', 't', 'r')
b <- bind_rows(a, b)
}
b
#> # A tibble: 4 x 3
#> id t r
#> <dbl> <dbl> <dbl>
#> 1 1 6 5
#> 2 2 4 7
#> 3 1 2 6
#> 4 2 3 4
Created on 2020-11-25 by the reprex package (v0.3.0)
For each id you can use lead to select next value and create r column and drop NA rows.
library(dplyr)
df %>%
group_by(id) %>%
mutate(t = value,
r = lead(value)) %>%
na.omit() %>%
select(id, t, r)
# id t r
# <dbl> <dbl> <dbl>
#1 1 2 6
#2 2 3 4
#3 1 6 5
#4 2 4 7
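We can use summarise from dplyr version >= 1.0. Previously, it was constrained to return a single row per group; from version 1.0 on this is no longer the case, so the result can have any number of rows, i.e. it can be shorter or longer than the original data. (From dplyr 1.1.0 on, reframe() is the recommended verb for results with more than one row per group; summarise() still works but emits a deprecation warning.)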
library(dplyr)
df %>%
group_by(id) %>%
summarise(t = value[-n()], r = value[-1], .groups = 'drop')
-output
# A tibble: 4 x 3
# id t r
# <dbl> <dbl> <dbl>
#1 1 2 6
#2 1 6 5
#3 2 3 4
#4 2 4 7

tidyverse calculate ranking per row across several columns

I have the following data frame:
dat <- data.frame(id = c("a", "b", "c", "d"),
x1 = c(1, 3, 5, 7),
x2 = c(4, 2, 6, 0),
x3 = c(2, 2, 5, 9))
I now want to calculate the ranking per row across my three x columns and want to store that result into my dat data frame.
So the result could be stored in two ways:
a) ideally, there will be 4 new columns with the respective ranks or
b) there will be a new nested column that I probably need to unnest somehow.
I tried the following which at least gives me a list column.
dat %>%
rowwise() %>%
mutate(my_ranks = list(rank(c_across(starts_with("x")))))
But when I try to unnest, it will give me the ranks but it does so by creating new rows (i.e. each original case now appears four times). Although I guess I could somehow reshape this result with pivot_wider, it feels wrong to follow that route.
Any better/easier idea? Thanks.
We can use unnest_wider
library(dplyr)
library(tidyr)
library(stringr)
# Rank the columns starting with "x" within each row and spread the ranks
# into new columns rank_x1, rank_x2, ...
dat %>%
  rowwise() %>%
  mutate(my_ranks = list(rank(c_across(starts_with("x"))))) %>%
  # Drop the rowwise grouping before reshaping.
  ungroup() %>%
  # The rank vectors are unnamed, so names_sep is required for
  # unnest_wider() to generate column names (my_ranks_1, my_ranks_2, ...).
  unnest_wider(my_ranks, names_sep = "_") %>%
  rename_with(
    ~ str_replace(.x, fixed("my_ranks_"), "rank_x"),
    starts_with("my_ranks_")
  )
# A tibble: 4 x 7
# id x1 x2 x3 rank_x1 rank_x2 rank_x3
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a 1 4 2 1 3 2
#2 b 3 2 2 3 1.5 1.5
#3 c 5 6 5 1.5 3 1.5
#4 d 7 0 9 2 1 3
Another option is pmap/as_tibble_row
library(tibble)
library(purrr)
dat %>%
mutate(my_ranks = pmap(select(., starts_with('x')), ~
as_tibble_row(rank(c(...)),
.name_repair = ~ str_c('rank', seq_along(.))))) %>%
unnest(c(my_ranks))
# A tibble: 4 x 7
# id x1 x2 x3 rank1 rank2 rank3
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a 1 4 2 1 3 2
#2 b 3 2 2 3 1.5 1.5
#3 c 5 6 5 1.5 3 1.5
#4 d 7 0 9 2 1 3
It can be done more straightforwardly with rowRanks from matrixStats
library(matrixStats)
nm1 <- names(dat)[-1]
dat[paste0('rank', nm1)] <- rowRanks(as.matrix(dat[nm1]), ties.method = 'average')
I guess this is sort of tidyverse:
dat %>%
bind_cols(as_tibble(`colnames<-`(t(apply(dat[-1], 1, rank)), paste0("rank_x", 1:3))))
#> id x1 x2 x3 rank_x1 rank_x2 rank_x3
#> 1 a 1 4 2 1.0 3.0 2.0
#> 2 b 3 2 2 3.0 1.5 1.5
#> 3 c 5 6 5 1.5 3.0 1.5
#> 4 d 7 0 9 2.0 1.0 3.0

dplyr use both rowwise and df-wise values in a mutate

How do you perform a rowwise operation which uses values from other rows (in dplyr/tidy style)? Let's say I have this df:
df <- data_frame(value = c(5,6,7,3,4),
group = c(1,2,2,3,3),
group.to.use = c(2,3,3,1,1))
I want to create a new variable, new.value, which is equal to each row's current value plus the maximum of value for rows whose "group" equals this row's "group.to.use." So for the first row
new.value = 5 + (max(value[group == 2])) = 5 + 7 = 12
desired output:
# A tibble: 5 x 4
value group group.to.use new.value
<dbl> <dbl> <dbl> <dbl>
1 5. 1. 2. 12.
2 6. 2. 3. 10.
3 7. 2. 3. 11.
4 3. 3. 1. 8.
5 4. 3. 1. 9.
pseudo code:
df %<>%
mutate(new.value = value + max(value[group.to.use == <group.for.this.row>]))
In rowwise operation, you can refer to the whole data.frame with . and a whole column in the data.frame with normal syntax .$colname or .[['col.name']]:
df %>%
rowwise() %>%
mutate(new.value = value + max(.$value[.$group == group.to.use])) %>%
ungroup()
# # A tibble: 5 x 4
# value group group.to.use new.value
# <dbl> <dbl> <dbl> <dbl>
# 1 5 1 2 12
# 2 6 2 3 10
# 3 7 2 3 11
# 4 3 3 1 8
# 5 4 3 1 9
Alternatively, you can precompute the max for each group and then do a left-join:
df.max <- df %>% group_by(group) %>% summarise(max.value = max(value))
df %>%
left_join(df.max, by = c('group.to.use' = 'group')) %>%
mutate(new.value = value + max.value) %>%
select(-max.value)
# # A tibble: 5 x 4
# value group group.to.use new.value
# <dbl> <dbl> <dbl> <dbl>
# 1 5 1 2 12
# 2 6 2 3 10
# 3 7 2 3 11
# 4 3 3 1 8
# 5 4 3 1 9
With base R, we can use ave, where we calculate max for each group and add them with the corresponding value matching the groups.
df$new.value <- with(df, value +
ave(value, group, FUN = max)[match(group.to.use, group)])
df
# A tibble: 5 x 4
# value group group.to.use new.value
# <dbl> <dbl> <dbl> <dbl>
#1 5.00 1.00 2.00 12.0
#2 6.00 2.00 3.00 10.0
#3 7.00 2.00 3.00 11.0
#4 3.00 3.00 1.00 8.00
#5 4.00 3.00 1.00 9.00
Here is an option with base R
df$new.value <- with(df, value + vapply(group.to.use, function(x)
max(value[group == x]), numeric(1)))
df$new.value
#[1] 12 10 11 8 9

Replace NA on numeric columns with mutate_if and replace_na

I would like to replace NAs in numeric columns using some variation of mutate_if and replace_na if possible, but can't figure out the syntax.
df <-tibble(
first = c("a", NA, "b"),
second = c(NA, 2, NA),
third = c(10, NA, NA)
)
#> # A tibble: 3 x 3
#> first second third
#> <chr> <dbl> <dbl>
#> 1 a NA 10.0
#> 2 <NA> 2.00 NA
#> 3 b NA NA
Final result should be:
#> # A tibble: 3 x 3
#> first second third
#> <chr> <dbl> <dbl>
#> 1 a 0 10.0
#> 2 <NA> 2.00 0
#> 3 b 0 0
My attempts look like:
df %>% mutate_if(is.numeric , replace_na(., 0) )
#>Error: is_list(replace) is not TRUE
df %>% mutate_if(is.numeric , replace_na, replace = 0)
# A tibble: 3 x 3
# first second third
# <chr> <dbl> <dbl>
#1 a 0 10.0
#2 NA 2.00 0
#3 b 0 0
The mutate_if-based solution mentioned in another answer relies on a function that has been superseded in dplyr. The suggested alternative is to use the across() function. Here is a solution using it:
df %>%
mutate(
across(where(is.numeric), ~replace_na(.x, 0))
)
# A tibble: 3 × 3
first second third
<chr> <dbl> <dbl>
1 a 0 10
2 NA 2 0
3 b 0 0

Resources