dplyr mutate with null value - r

I have a data frame and I'd like to use mutate to populate an "e_value" column that holds the value of the "e" metric within each group. I use dplyr to group_by the group and then mutate using value[metric == "e"], but this returns an error when a group has no metric == "e", as in group C below. Is there a way to just return the f metric when there is no e metric?
library(dplyr)
# this code does not work because there is no e metric in group C
data =data.frame(group = c("A","A","B","B","C"),metric=c("e","f","e","f","f"),value = c(1,2,3,4,5))
data %>% group_by(group) %>% mutate( e_value = value[metric == "e"] )
## the code below works because there is always an e metric
data =data.frame(group = c("A","A","B","B"),metric=c("e","f","e","f"),value = c(1,2,3,4))
data %>% group_by(group) %>% mutate( e_value = value[metric == "e"] )

You can insert an ifelse to make it conditional.
data %>%
  group_by(group) %>%
  mutate(
    # value[metric == "e"] is a zero-length vector (never NULL) when a group
    # has no "e" row, so is.null() cannot detect the missing case. Test for a
    # match explicitly instead. [1] keeps the result a single value, which
    # mutate() recycles over the whole group; groups without "e" get NA.
    e_value = ifelse(any(metric == "e"), value[metric == "e"][1], NA_real_)
  )
# # A tibble: 5 x 4
# # Groups: group [3]
# group metric value e_value
# <fct> <fct> <dbl> <dbl>
# 1 A e 1.00 1.00
# 2 A f 2.00 1.00
# 3 B e 3.00 3.00
# 4 B f 4.00 3.00
# 5 C f 5.00 NA

Or like this using %in%:
# Take the value from the row whose metric is "e"; groups without one get NA.
# Passing bare `value` as the "yes" branch would return the group's first
# value regardless of its metric, which is only right when "e" comes first.
data %>%
  group_by(group) %>%
  mutate(e_value = ifelse("e" %in% metric, value[metric == "e"][1], NA_real_))
## A tibble: 5 x 4
## Groups: group [3]
# group metric value e_value
# <fctr> <fctr> <dbl> <dbl>
#1 A e 1 1
#2 A f 2 1
#3 B e 3 3
#4 B f 4 3
#5 C f 5 NA

Related

R: how to combine the value of a column in two rows together, if these two rows share same character strings in another two columns

I am constructing an edge list for a network. I would like to combine the value of the third column together if the first two columns are the same. The data I have is like this.
ego alter weight
A B 12
B A 10
C D 5
D C 2
E F 7
F E 6
The dataset I expect is like this:
ego alter weight
A B 22
C D 7
E F 13
Please enlighten me if you have some great ideas to achieve the expected result.
A base R option using pmin/pmax + aggregate
aggregate(
weight ~ .,
transform(
df,
ego = pmin(ego,alter),
alter = pmax(ego,alter)
),
sum
)
gives
ego alter weight
1 A B 22
2 C D 7
3 E F 13
Or, we can use igraph
library(igraph)
df %>%
graph_from_data_frame(directed = FALSE) %>%
simplify() %>%
get.data.frame()
which gives
from to weight
1 A B 22
2 C D 7
3 E F 13
You could do the following:
# Build an order-independent key for each (e, a) pair: the two values are
# sorted and pasted together, so ("A", "B") and ("B", "A") both give "AB".
#   e, a: vectors of the same length (one pair per position)
# Returns a character vector with one key per pair. vapply() guarantees a
# character result even for zero-length input, where sapply() returns list().
f <- function(e, a) {
  vapply(
    seq_along(e),
    \(i) paste0(sort(c(e[i], a[i])), collapse = ""),
    character(1)
  )
}
group_by(dt, grp = f(ego,alter)) %>%
summarize(weight=sum(weight),.groups="drop") %>%
separate(grp,c("ego","alter"),1)
Output:
ego alter weight
<chr> <chr> <int>
1 A B 22
2 C D 7
3 E F 13
A possible solution:
library(tidyverse)
df %>%
  # Per row, build an order-independent key: (A, B) and (B, A) both give "AB".
  rowwise() %>%
  mutate(aux = sort(c(ego, alter)) %>% str_c(collapse = "")) %>%
  # group_by() replaces the rowwise grouping with grouping by the key.
  group_by(aux) %>%
  # Collapse each key to exactly one row: keep the first ego/alter pair seen
  # and the total weight. This avoids a multi-row summarise() followed by
  # de-duplication.
  summarise(
    ego = first(ego),
    alter = first(alter),
    weight = sum(weight),
    .groups = "drop"
  ) %>%
  select(-aux)
#> # A tibble: 3 × 3
#> ego alter weight
#> <chr> <chr> <int>
#> 1 A B 22
#> 2 C D 7
#> 3 E F 13
Or avoiding rowwise:
library(tidyverse)
df %>%
  # df[1:2] selects the first two columns of df (ego and alter in the example
  # data); each row is sorted and pasted into an order-independent key.
  mutate(
    aux = apply(df[1:2], 1, \(x) sort(x) %>% paste0(collapse = ""))
  ) %>%
  group_by(aux) %>%
  # Collapse each key to exactly one row: keep the first ego/alter pair seen
  # and the total weight, so no de-duplication step is needed afterwards.
  summarise(
    ego = first(ego),
    alter = first(alter),
    weight = sum(weight),
    .groups = "drop"
  ) %>%
  select(-aux)
#> # A tibble: 3 × 3
#> ego alter weight
#> <chr> <chr> <int>
#> 1 A B 22
#> 2 C D 7
#> 3 E F 13
And yet another solution, a bit more succinct:
library(tidyverse)
df %>%
group_by(aux = map2_chr(ego, alter, ~ sort(c(.x, .y)) %>% str_c(collapse = ""))) %>%
summarise(weight = sum(weight)) %>%
extract(aux, c("ego", "alter"), "([[:upper:]])([[:upper:]])")
#> # A tibble: 3 × 3
#> ego alter weight
#> <chr> <chr> <int>
#> 1 A B 22
#> 2 C D 7
#> 3 E F 13

How to get the pairwise difference of all values within uneven categories in R

I found solutions for simple vectors, but is there a way to make all pairwise differences using dplyr or base R for all the elements in a category?
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
df %>%
group_by(y) %>%
summarise(Diff = abs(x - lag(x)))
gives:
`summarise()` regrouping output by 'y' (override with `.groups` argument)
# A tibble: 11 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a NA
2 a 1
3 b NA
4 b 1
5 c NA
6 c 1
7 d NA
8 d 1
9 e NA
10 e 1
11 e 1
In this example, it's only using the previous value in the data frame, therefore missing pairwise differences (look at 9, 10 and 11 for group "e" ).
Is there a way to get all the pairwise differences in each category? Keeping track of the pairwise differences would be useful as well (e.g., e1 with e2 = 1, e2 with e3 is = 1 and e1 with e3 is =2)
I tried the outer() function, as well as the dist() function, but wasn't able to make either work.
I continued to try and found this:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff))
my.df
# A tibble: 7 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a 1
2 b 1
3 c 1
4 d 1
5 e 1
6 e 2
7 e 1
I just now need to get which pairwise difference was calculated...
Continued again and got this mess:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff),
test = combn(verif,2,paste, simplify = FALSE)) %>%
mutate(test2 = paste0(test, collapse = "-"))
my.df
> my.df
# A tibble: 7 x 4
# Groups: y [5]
y Diff test test2
<chr> <dbl> <list> <chr>
1 a 1 <chr [2]> "c(\"a1\", \"a2\")"
2 b 1 <chr [2]> "c(\"b1\", \"b2\")"
3 c 1 <chr [2]> "c(\"c1\", \"c2\")"
4 d 1 <chr [2]> "c(\"d1\", \"d2\")"
5 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
6 e 2 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
7 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
Got it:
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff),
test = combn(verif,2,paste, simplify = FALSE)) %>%
mutate(test2 = unlist(lapply(test, function(x)paste(x,collapse="-")))) %>%
select(-test)
Here is the output
my.df
# A tibble: 7 x 3
# Groups: y [5]
y Diff test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3
You could do:
library(tidyverse)
df %>%
group_by(y) %>%
summarise(result = combn(seq_along(x), 2, function(i)
list(test1 = diff(x[i]), #The difference
test2 = paste0(verif[i], collapse = '-')), # The pairs
simplify = FALSE),
.groups = 'drop') %>%
unnest_wider(result)
# A tibble: 7 x 3
y test1 test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3

Creating a function in R that takes a data frame as input, orders it by the grouping columns and generates a sequence. The new column isn't appearing in df1

# Sort df1 by the columns named in group_vars, then number the rows within
# each combination of those columns.
#   df1:        data frame to sort and number
#   group_vars: character vector of column names in df1
# Returns the sorted, grouped data frame with an added `seq` column. The
# result is a new object, so the caller must assign it to keep the column.
get_seq <- function(df1,group_vars) {
# mget() fetches the named columns from df1 (via with()); do.call() passes
# them to order() as successive sort keys.
df1<-df1[ with( df1, do.call(order, mget(group_vars)) ), ]
df1<-df1 %>%
# .dots is dplyr's legacy argument for passing grouping columns as strings.
group_by(.dots=group_vars) %>%
mutate(seq=row_number())
return(df1)
}
Try using this function :
library(dplyr)
# Order df1 by the columns named in group_vars, group by those same columns,
# and add `seq`, the row number within each group.
#   df1:        data frame
#   group_vars: character vector of column names in df1
# Returns the sorted data frame, still grouped by group_vars.
get_seq <- function(df1, group_vars) {
  sorted_rows <- arrange(df1, across(all_of(group_vars)))
  by_keys <- group_by(sorted_rows, across(all_of(group_vars)))
  mutate(by_keys, seq = row_number())
}
You can call this function as :
df2 <- get_seq(df1, 'col1')
df2 <- get_seq(df1, c('col1', 'col2'))
It's really not clear what you're trying to do here. If you want to pass a variable number of column names to a function, sort the data frame according to these columns, then group_by the columns, then add a row number within each subgroup, you would do:
# Order df1 by the columns supplied in `...`, group by those same columns,
# and add `seq`, the row number within each group.
#   df1: data frame
#   ...: unquoted column names, e.g. get_seq(df, a, b)
# Returns the sorted data frame, still grouped by the supplied columns.
get_seq <- function(df1, ...) {
  # Capture the column expressions once so they can be spliced into both verbs.
  key_quos <- enquos(...)
  sorted_rows <- arrange(df1, !!!key_quos)
  by_keys <- group_by(sorted_rows, !!!key_quos)
  mutate(by_keys, seq = row_number())
}
So if we had a data frame like this:
df <- data.frame(a = rep(1:3, each = 4),
b = rep(LETTERS[4:1], each = 3),
c = rnorm(12))
We could do:
get_seq(df, a, b)
#> # A tibble: 12 x 4
#> # Groups: a, b [6]
#> a b c seq
#> <int> <fct> <dbl> <int>
#> 1 1 C 0.779 1
#> 2 1 D 0.318 1
#> 3 1 D -0.0710 2
#> 4 1 D 0.183 3
#> 5 2 B -0.351 1
#> 6 2 B 0.401 2
#> 7 2 C -1.26 1
#> 8 2 C 1.99 2
#> 9 3 A -0.0723 1
#> 10 3 A -0.602 2
#> 11 3 A 2.05 3
#> 12 3 B 2.13 1

tidyr::expand() for a single column across groups

tidyr::expand() returns all possible combinations of values from multiple columns. I'm looking for a slightly different behavior, where all the values are in a single column and the combinations are to be taken across groups.
For example, let the data be defined as follows:
library( tidyverse )
X <- bind_rows( data_frame(Group = "Group1", Value = LETTERS[1:3]),
data_frame(Group = "Group2", Value = letters[4:5]) )
We want all combinations of values from Group1 with values from Group2. My current clunky solution is to separate the values across multiple columns
Y <- X %>% group_by(Group) %>% do(vals = .$Value) %>% spread(Group, vals)
# # A tibble: 1 x 2
# Group1 Group2
# <list> <list>
# 1 <chr [3]> <chr [2]>
followed by a double unnest operation
Y %>% unnest( .preserve = Group2 ) %>% unnest
# # A tibble: 6 x 2
# Group1 Group2
# <chr> <chr>
# 1 A d
# 2 A e
# 3 B d
# 4 B e
# 5 C d
# 6 C e
This is the desired output, but as you can imagine, this solution doesn't generalize well: as the number of groups increases, so does the number of unnest operations that we have to perform.
Is there a more elegant solution?
Because OP seems happy to use base, I upgrade my comment to an answer:
expand.grid(split(X$Value, X$Group))
# Group1 Group2
# 1 A d
# 2 B d
# 3 C d
# 4 A e
# 5 B e
# 6 C e
As noted by OP, expand.grid converts character vectors to factors. To prevent that, use stringsAsFactors = FALSE.
The tidyverse equivalent is purrr::cross_df, which doesn't coerce to factor:
cross_df(split(X$Value, X$Group))
# A tibble: 6 x 2
# Group1 Group2
# <chr> <chr>
# 1 A d
# 2 B d
# 3 C d
# 4 A e
# 5 B e
# 6 C e
Here is one option. It will work on the cases with more than two groups although complete_ is deprecated.
library( tidyverse )
X2 <- X %>%
group_by(Group) %>%
mutate(ID = 1:n()) %>%
spread(Group, Value) %>%
select(-ID) %>%
complete_(names(.)) %>%
na.omit()
X2
# # A tibble: 6 x 2
# Group1 Group2
# <chr> <chr>
# 1 A d
# 2 A e
# 3 B d
# 4 B e
# 5 C d
# 6 C e
Update
!!!syms(names(.)) works well with the regular complete function, thus is better than using complete_ as my original solution.
library( tidyverse )
X2 <- X %>%
group_by(Group) %>%
mutate(ID = 1:n()) %>%
spread(Group, Value) %>%
select(-ID) %>%
complete(!!!syms(names(.))) %>%
na.omit()
X2
# # A tibble: 6 x 2
# Group1 Group2
# <chr> <chr>
# 1 A d
# 2 A e
# 3 B d
# 4 B e
# 5 C d
# 6 C e
I often use tidyr::crossing() to join all values from group2 to group.
data_frame(group = c(LETTERS[1:3])) %>%
crossing(group2 = letters[4:5])
I might do something like this:
data %>%
distinct(group) %>%
crossing(group2)
A more specific example:
dates <- lubridate::make_date(2000:2018)
data_frame(group = letters[1:5]) %>%
crossing(dates)
This still works with expand after spread.
X %>%
mutate(id = row_number()) %>%
spread(Group, Value) %>%
expand(Group1, Group2) %>%
na.omit()

dplyr and aggregation with summarise; a simple way to get mean at different levels of aggregation

I am interested in the total mean, and the mean within different conditions, of some measurements preferably using dplyr's summarise function.
I'll illustrate my question in the following. Say I have some data, borrowed from this example,
dta <- read.table(header=TRUE, text='
subject sex condition measurement
1 M control 7.9
1 M cond1 12.3
1 M cond2 10.7
2 F control 6.3
2 F cond1 10.6
2 F cond2 11.1
3 F control 9.5
3 F cond1 13.1
3 F cond2 13.8
4 M control 11.5
4 M cond1 13.4
4 M cond2 12.9
') # ; dta
I now want the mean for each sex and the mean by sex for each condition. I know how to get it for each condition, like this.
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
dta %>%
group_by(sex, condition) %>%
summarise(
mean = mean(measurement)
)
#> # A tibble: 6 x 3
#> # Groups: sex [?]
#> sex condition mean
#> <fctr> <fctr> <dbl>
#> 1 F cond1 11.85
#> 2 F cond2 12.45
#> 3 F control 7.90
#> 4 M cond1 12.85
#> 5 M cond2 11.80
#> 6 M control 9.70
But, this does not give me the aggregate mean for both sexes. To get this I either have to run a separate call, i.e.
dta %>%
group_by(sex) %>%
summarise(
mean = mean(measurement)
)
#> # A tibble: 2 x 2
#> sex mean
#> <fctr> <dbl>
#> 1 F 10.73333
#> 2 M 11.45000
or deconstruct data structure. Like this,
# install.packages(c("tidyr"), dependencies = TRUE)
library(tidyr)
dta_wide <- spread(dta, condition, measurement)
dta_wide %>%
group_by(sex) %>%
summarise(
mean_tot = mean(cond1 + cond2 + control)/3,
mean_cond1 = mean(cond1),
mean_cond2 = mean(cond2),
mean_control = mean(control)
)
#> # A tibble: 2 x 5
#> sex mean_tot mean_cond1 mean_cond2 mean_control
#> <fctr> <dbl> <dbl> <dbl> <dbl>
#> 1 F 10.73333 11.85 12.45 7.9
#> 2 M 11.45000 12.85 11.80 9.7
This gives me an output with both the over all mean by sex and the individual mean by condition.
However, both running two separate calls and deconstructing the data seem unnecessarily cumbersome. Isn't there a simple way to add a categorical variable, here condition, as the by variable and at the same time keep the aggregate information, here mean by sex? Maybe I am overlooking something logical and shouldn't be messing with data like this?
One option is to calculate the two summaries separately, then join back:
dta %>%
group_by(sex, condition) %>%
summarise(mean = mean(measurement)) %>%
inner_join(
group_by(dta, sex) %>%
summarise(mean_tot = mean(measurement))
)
# Joining, by = "sex"
# A tibble: 6 x 4
# Groups: sex [?]
# sex condition mean mean_tot
# <fctr> <fctr> <dbl> <dbl>
#1 F cond1 11.85 10.73333
#2 F cond2 12.45 10.73333
#3 F control 7.90 10.73333
#4 M cond1 12.85 11.45000
#5 M cond2 11.80 11.45000
#6 M control 9.70 11.45000
Or use group_by twice:
dta %>%
group_by(sex, condition) %>%
summarise(s = sum(measurement), n = n()) %>%
group_by(sex) %>%
transmute(condition, mean_tot = sum(s) / sum(n), mean = s / n)
# Adding missing grouping variables: `sex`
# A tibble: 6 x 4
# Groups: sex [2]
# sex condition mean_tot mean
# <fctr> <fctr> <dbl> <dbl>
#1 F cond1 10.73333 11.85
#2 F cond2 10.73333 12.45
#3 F control 10.73333 7.90
#4 M cond1 11.45000 12.85
#5 M cond2 11.45000 11.80
#6 M control 11.45000 9.70

Resources