Conditional grouping based on group-invariant column value in R

I have a data frame:
temp = as.data.frame(matrix(c(1,2,2,-3,1,1,2,3,-2,0,2,1,-5,1,1,2,1,3,0,0,3,2,4,-1,1,3,2,2,0,1,3,2,4,3,0), ncol=5,byrow = TRUE))
colnames(temp) = c("ID","srch","utility","reutility","code")
I need to group by the "ID" column. Within any given "ID", the "srch" column is constant. For each group, if srch > 1 I need min(utility where code == 1) - max(reutility where code == 0); otherwise (i.e. if srch <= 1) I need 0.
This is the output I need:
temp = as.data.frame(matrix(c(1,4,2,0,3,-1), ncol=2,byrow = TRUE))
colnames(temp)=c("ID","Val")
Any code using dplyr is great but others are welcome too.

Assuming the output for ID = 2 is 0, we can do:
library(dplyr)
temp %>%
  group_by(ID) %>%
  summarise(Val = if (first(srch) > 1)
              min(utility[code == 1]) - max(reutility[code == 0])
            else 0)
# A tibble: 3 x 2
# ID Val
# <dbl> <dbl>
#1 1 4
#2 2 0
#3 3 -1
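For completeness, a minimal data.table sketch of the same logic, applied to the original temp (column names as above; srch[1] plays the role of first(srch)):
library(data.table)
setDT(temp)
temp[, .(Val = if (srch[1] > 1)
           min(utility[code == 1]) - max(reutility[code == 0])
         else 0),
     by = ID]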


How to summarise grouped value increases

I have this type of data:
df <- data.frame(
  Utt = c(rep("oh", 10), rep("ah", 10)),
  name = rep(LETTERS[1:2], 10),
  value = c(0.5, 2, 2, 2, 2, 1, 0, 1, 3.5, 1,
            2.2, 2.3, 1.9, 0.1, 0.3, 1.8, 3, 4, 3.5, 2)
)
I need to know whether, within each group of Utt and name, there are continuous value increases and how large these increases are.
EDIT: I've cobbled together this code, which produces the right result but seems convoluted:
library(dplyr)
library(stringr)    # for str_c
library(data.table) # for rleid

df %>%
  # order by name:
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(
    # is there an increase from one value to the next?
    is_increase = ifelse(lag(value) < value, value, NA),
    # what's the difference between these values?
    diff = is_increase - lag(value)) %>%
  group_by(name, Utt, grp = rleid(!is.na(diff))) %>%
  # sum the contiguous values:
  summarise(increase_size = sum(diff, na.rm = TRUE)) %>%
  # remove 0 values:
  filter(!increase_size == 0) %>%
  # put same-group increase_sizes in the same row:
  summarise(increase_size = str_c(increase_size, collapse = ', '))
# A tibble: 3 x 3
# Groups: name [2]
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
NOTE: Ideally, the expected outcome would be:
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
Is there a better (i.e., more concise, more clever) dplyr solution?
Use this function to find what you want (lag and lead here are dplyr's):
library(dplyr)

f <- function(x) {
  ind <- which(x > lag(x))
  if (length(ind) == 0) {
    return(NA)
  }
  # last index of each run of consecutive increases:
  ind2 <- ind[which(lead(ind, default = max(ind) + 2) - ind > 1)]
  # index just before each run of increases starts:
  ind1 <- ind[which(ind - lag(ind, default = min(ind) - 2) > 1)] - 1
  return(paste0(x[ind2] - x[ind1], collapse = ", "))
}
And use the function in summarise:
df %>% group_by(name, Utt) %>% summarise(increase = f(value))
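Applied to the example df, this should return all four groups, with NA where there is no increase:
# A tibble: 4 x 3
# Groups:   name [2]
  name  Utt   increase
  <chr> <chr> <chr>
1 A     ah    3.2
2 A     oh    1.5, 3.5
3 B     ah    3.9
4 B     oh    NA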
Using the tidyverse, my solution was similar to yours. One possible modification: subset inside the sum (diff[diff > 0]) instead of filtering rows out. This keeps all combinations of name and Utt and lets increase_size end up as NA. Since increase_size is a character column, you can convert the empty string to NA with na_if.
library(data.table)
library(tidyverse)

df %>%
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(diff = c(0, diff(value))) %>%
  group_by(grp = rleid(diff < 0), .add = TRUE) %>%
  summarise(increase_size = sum(diff[diff > 0], na.rm = TRUE)) %>%
  group_by(name, Utt) %>%
  summarise(increase_size = toString(increase_size[increase_size > 0])) %>%
  mutate(increase_size = na_if(increase_size, ""))
Output
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA

Matching and returning values based on condition or ID

This seems like it should be fairly easy, but I'm having trouble with it.
Example: I have a dataframe with two columns, IDs and perc_change. I want to know which unique IDs have had more than a 30% change.
IDs <- c(1,1,2,1,1,2,2,2,3,2,3,4,5,6,3)
perc_change <- c(50,40,60,70,80,30,20,40,23,25,10,30,12,7,70)
df <- data.frame(IDs, perc_change)
So far:
if (df$perc_change > 30) {
  unique(df$IDs)
} else {
}
This obviously doesn't work because it returns all unique IDs. Should I be finding the index and then matching it, or something?
Thanks in advance!
We could do the following to get the matching values for each ID:
library(dplyr)
df %>%
  group_by(IDs) %>%
  filter(perc_change > 30) %>%
  mutate(values = paste0(perc_change, collapse = ","), .keep = "unused") %>%
  distinct(IDs, .keep_all = TRUE)
Output:
IDs values
<dbl> <chr>
1 1 50,40,70,80
2 2 60,40
3 3 70
Just use [ to subset and take the unique values; no if/else conditions needed:
with(df, unique(IDs[perc_change > 30]))
[1] 1 2 3
We can group, filter and count using dplyr:
library(dplyr)
df %>%
  group_by(IDs) %>%
  filter(perc_change > 30) %>%
  count(IDs)
# A tibble: 3 x 2
# Groups: IDs [3]
IDs n
<dbl> <int>
1 1 4
2 2 2
3 3 1
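The group_by() step is not strictly needed here, since count(IDs) already groups by IDs; filtering first gives the same counts:
df %>%
  filter(perc_change > 30) %>%
  count(IDs)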
Or with base R subsetting:
unique(df[df$perc_change > 30, "IDs"])

R - Efficiently counting number of switches in binary variable for each group

To give some context, I have a dataframe of eyetracking data from a psychology experiment and I want to count the switches between two Areas Of Interest (AOI), for each participant.
Here's a simplified dataframe of the problem (we assume that AOI2 == !AOI1 so we don't need it):
library(tidyverse)
df <- tibble(Participant = rep(1:7, times = 1, each = 10),
             Time = rep(1:10, 7),
             AOI1 = rbinom(70, 1, .5))
What I want is to count how many times the value of AOI1 changes over time for each participant. I could do it with for loops as below, but I was wondering if there is a simpler, more R-like way of doing it?
df.switches <- tibble(Participant = 1:7,
                      Switches = NA)
for (p in 1:7) {
  s <- 0
  for (i in 2:10) {
    if (subset(df, Participant == p & Time == i, select = AOI1) !=
        subset(df, Participant == p & Time == i - 1, select = AOI1)) {
      s <- s + 1
    }
  }
  df.switches <- df.switches %>%
    mutate(Switches = ifelse(Participant == p, s, Switches))
}
One option is to use dplyr::lag to compare each value with the one in the previous row and count the number of switches per participant. (Since no seed is set, the counts below depend on the particular random draw.)
library(tidyverse)
df %>%
  group_by(Participant) %>%
  summarise(count = sum(AOI1 != lag(AOI1, default = first(AOI1))))
# # A tibble: 7 x 2
# Participant count
# <int> <int>
# 1 1 5
# 2 2 4
# 3 3 5
# 4 4 4
# 5 5 6
# 6 6 6
# 7 7 4
Since you are already using the tidyverse, you can use lag available as part of dplyr. This checks whether the value of AOI1 is the same as the previous value, and if not, sets a flag to 1. For the first record of each participant, the value is automatically set to NA. Note that the group_by is required, otherwise the flag won't get "reset" every time a new participant is encountered. Also it is assumed that the data is sorted by Participant and Time; if not, pipe arrange(Participant, Time) before the group_by.
df <- tibble(Participant = rep(1:7, times = 1, each = 10),
             Time = rep(1:10, 7),
             AOI1 = rbinom(70, 1, .5))
df2 <- df %>%
  group_by(Participant) %>%
  mutate(switch = ifelse(AOI1 != lag(AOI1), 1, 0)) %>%
  summarise(num_switches = sum(switch, na.rm = TRUE))
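A data.table sketch of the same count uses run-length IDs (assuming rows are already ordered by Time within Participant): the number of runs of equal AOI1 values, minus one, is the number of switches.
library(data.table)
setDT(df)[, .(num_switches = uniqueN(rleid(AOI1)) - 1L), by = Participant]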

sum() with conditions provides incorrect result in dplyr package

When applying sum() with conditions inside summarize(), it does not give the correct answer.
Make a data frame x:
x = data.frame(flag = 1, uin = 1, val = 2)
x = rbind(x, data.frame(flag = 2, uin = 2, val = 3))
This is what x looks like:
flag uin val
1 1 1 2
2 2 2 3
I want to sum up val overall, and val where flag == 2, so I write:
x %>% summarize(val = sum(val), val.2 = sum(val[flag == 2]))
and the result is:
val val.2
1 5 NA
But I expect val.2 to be 3, not NA. Curiously, if I calculate the conditional sum first and the total sum second, it comes out correct:
x %>% summarize(val.2 = sum(val[flag == 2]), val = sum(val))
val.2 val
1 3 5
Moreover, if I only calculate the conditional summation, it works fine too:
x %>% summarize(val.2 = sum(val[flag == 2]))
val.2
1 3
Duplicate names are causing you problems. In this code
x %>% summarize(val = sum(val), val.2 = sum(val[flag == 2]))
you have two val objects: one created by val = sum(val), the other the column of the data frame x. summarize() evaluates its expressions in order, so val is first rebound from the column to the scalar sum(val) = 5. The next expression,
val[flag == 2]
then indexes that length-1 vector with the logical vector c(FALSE, TRUE); asking for a second element that doesn't exist returns NA, and sum(NA) is NA. The solution: don't use the name val twice,
x %>% summarize(val_sum = sum(val), val.2 = sum(val[flag == 2]))
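On the example x, this should give both sums as expected:
  val_sum val.2
1       5     3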

Grouped operation on all groups relative to "baseline" group, with multiple observations

Starting with data containing multiple observations for each group, like this:
set.seed(1)
my.df <- data.frame(
  timepoint = rep(c(0, 1, 2), each = 3),
  counts = round(rnorm(9, 50, 10), 0)
)
> my.df
timepoint counts
1 0 44
2 0 52
3 0 42
4 1 66
5 1 53
6 1 42
7 2 55
8 2 57
9 2 56
To perform a summary calculation at each timepoint relative to timepoint == 0, I need to pass the vector of counts at timepoint == 0 and the vector of counts for each group (e.g. timepoint == 1) to an arbitrary function, e.g.
NonsenseFunction <- function(x, y){
  (mean(x) - mean(y)) / (1 - mean(y))
}
I can get the required output from this table, either with dplyr:
library(dplyr)
library(dplyr)
my.df %>%
  group_by(timepoint) %>%
  mutate(rep = paste0("r", 1:n())) %>%
  left_join(x = ., y = filter(., timepoint == 0), by = "rep") %>%
  group_by(timepoint.x) %>%
  summarise(result = NonsenseFunction(counts.x, counts.y))
or data.table:
library(data.table)
my.dt <- data.table(my.df)
my.dt[, rep := paste0("r", 1:length(counts)), by = timepoint]
merge(my.dt, my.dt[timepoint == 0], by = "rep", all = TRUE)[
, NonsenseFunction(counts.x, counts.y), by = timepoint.x]
This only works if the number of observations is the same in every group, and since the observations aren't genuinely matched, the temporary rep variable seems hacky.
For a more general case, where I need to pass vectors of the baseline values and the group's values to an arbitrary (more complicated) function, is there an idiomatic data.table or dplyr way of doing so with a grouped operation for all groups?
Here's the straightforward data.table approach (writing f for NonsenseFunction):
my.dt[, f(counts, my.dt[timepoint==0, counts]), by=timepoint]
This probably grabs my.dt[timepoint==0, counts] again and again, for each group. You could instead save that value ahead of time:
v = my.dt[timepoint==0, counts]
my.dt[, f(counts, v), by=timepoint]
... or if you don't want to add v to the environment, maybe
with(list(v = my.dt[timepoint==0, counts]),
     my.dt[, f(counts, v), by=timepoint]
)
You can pass the baseline vector as the second argument, as a constant:
my.df %>%
  group_by(timepoint) %>%
  mutate(response = NonsenseFunction(counts, my.df$counts[my.df$timepoint == 0]))
Or, if you want to compute it beforehand:
constant <- my.df$counts[my.df$timepoint == 0]
my.df %>%
  group_by(timepoint) %>%
  mutate(response = NonsenseFunction(counts, constant))
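If you want one summary row per timepoint rather than a value repeated on every row, the same call should work with summarise() in place of mutate(), reusing the constant from above:
my.df %>%
  group_by(timepoint) %>%
  summarise(response = NonsenseFunction(counts, constant))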
You can try:
library(dplyr)
my.df %>%
  mutate(new = mean(counts[timepoint == 0])) %>%
  group_by(timepoint) %>%
  summarise(result = NonsenseFunction(counts, new))
# A tibble: 3 × 2
#  timepoint     result
#      <dbl>      <dbl>
#1         0  0.0000000
#2         1 -0.1703704
#3         2 -0.2222222
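As a check on the arithmetic: the baseline mean is mean(c(44, 52, 42)) = 46, so at timepoint 1 the result is (mean(c(66, 53, 42)) - 46) / (1 - 46) = 7.67 / -45 ≈ -0.170, and at timepoint 2 it is (56 - 46) / (1 - 46) ≈ -0.222.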
