Related
I want to do a row wise check if multiple columns are all equal or not. I came up with a convoluted approach to count the occurences of each value per group. But this seems somewhat... cumbersome.
sample data
sample_df <- data.frame(id = letters[1:6], group = rep(c('r','l'),3), stringsAsFactors = FALSE)
set.seed(4)
for(i in 3:5) {
sample_df[i] <- sample(1:4, 6, replace = TRUE)
sample_df
}
desired output
library(tidyverse)
sample_df %>%
gather(var, value, V3:V5) %>%
mutate(n_var = n_distinct(var)) %>% # get the number of columns
group_by(id, group, value) %>%
mutate(test = n_distinct(var) == n_var ) %>% # check how frequent values occur per "var"
spread(var, value) %>%
select(-n_var)
#> # A tibble: 6 x 6
#> # Groups: id, group [6]
#> id group test V3 V4 V5
#> <chr> <chr> <lgl> <int> <int> <int>
#> 1 a r FALSE 3 3 1
#> 2 b l FALSE 1 4 4
#> 3 c r FALSE 2 4 2
#> 4 d l FALSE 2 1 2
#> 5 e r TRUE 4 4 4
#> 6 f l FALSE 2 2 3
Created on 2019-02-27 by the reprex package (v0.2.1)
Does not need to be dplyr. I just used it for showing what I want to achieve.
There are a bunch of ways to check for equality row-wise. Two good ways:
# test that all values equal the first column
rowSums(df == df[, 1]) == ncol(df)
# count the unique values, see if there is just 1
apply(df, 1, function(x) length(unique(x)) == 1)
If you only want to test some columns, then use a subset of columns rather than the whole data frame:
cols_to_test = c(3, 4, 5)
rowSums(df[cols_to_test] == df[, cols_to_test[1]]) == length(cols_to_test)
# count the unique values, see if there is just 1
apply(df[cols_to_test], 1, function(x) length(unique(x)) == 1)
Note I use df[cols_to_test] instead of df[, cols_to_test] when I want to be sure the result is a data.frame even if cols_to_test has length 1.
I would like to know per group in the column 'Participants' when the value '1' occurs for the first time in the column 'Signal' (by Participants). The count of the value '1' should refer to the group.
Here is an example data frame
> dfInput <- data.frame(Participants=c( 'A','A','A','B','B','B','B','C','C'), Signal=c(0, 1, 1, 0, 0, 0, 1, 1,0))
> dfInput
Participants Signal
1 A 0
2 A 1
3 A 1
4 B 0
5 B 0
6 B 0
7 B 1
8 C 1
9 C 0
And here is the output I am looking for:
> dfOutput <-data.frame(Participants=c( 'A','B','C'), RowNumberofFirst1=c(2, 4, 1))
> dfOutput
Participants RowNumberofFirst1
1 A 2
2 B 4
3 C 1
The problem is somewhat similar to this: Find first occurence of value in group using dplyr mutate
Yet, I could not adapt it accordingly, to create my output df
I think this is what you are looking for
library(dplyr)
dfInput %>%
group_by(Participants) %>%
summarise(RowNumberofFirst1 = which(Signal == 1)[1])
Another base R via aggregate
aggregate(Signal~Participants, dfInput, function(i)which(i == 1)[1])
# Participants Signal
#1 A 2
#2 B 4
#3 C 1
dfInput <- data.frame(Participants=c( 'A','A','A','B','B','B','B','C','C'),
Signal=c(0, 1, 1, 0, 0, 0, 1, 1,0))
library(dplyr)
dfInput %>%
group_by(Participants) %>% # for each Participant
summarise(NumFirst1 = min(row_number()[Signal == 1])) # get the minimum number of row where signal equals 1
# # A tibble: 3 x 2
# Participants NumFirst1
# <fct> <int>
# 1 A 2
# 2 B 4
# 3 C 1
In case you want to return the row (i.e. all column values) that you've identified, you can use this:
set.seed(5)
dfInput <- data.frame(Participants=c( 'A','A','A','B','B','B','B','C','C'),
Signal=c(0, 1, 1, 0, 0, 0, 1, 1,0),
A = sample(c("C","D","F"),9, replace = T),
B = sample(c("N","M","K"),9, replace = T))
library(dplyr)
dfInput %>%
group_by(Participants) %>%
filter(row_number() == min(row_number()[Signal == 1])) %>%
ungroup()
# # A tibble: 3 x 4
# Participants Signal A B
# <fct> <dbl> <fct> <fct>
# 1 A 1 F N
# 2 B 1 D N
# 3 C 1 F M
So, in this case you use filter to return, for each participant, the row that is equal to the minimum row number where Signal is 1.
With tidyverse:
dfInput%>%
group_by(Participants)%>%
mutate(max=cumsum(Signal),
RowNumberofFirst1=row_number())%>%
filter(max==1)%>%
top_n(-1,RowNumberofFirst1)%>%
select(Participants,RowNumberofFirst1)
# A tibble: 3 x 2
# Groups: Participants [3]
Participants RowNumberofFirst1
<fct> <int>
1 A 2
2 B 4
3 C 1
Here is a solution with base R:
dfInput <- data.frame(Participants=c( 'A','A','A','B','B','B','B','C','C'), Signal=c(0, 1, 1, 0, 0, 0, 1, 1,0))
tapply(dfInput$Signal, dfInput$Participants, FUN=function(x) min(which(x==1)))
# > tapply(dfInput$Signal, dfInput$Participants, FUN=function(x) min(which(x==1)))
# A B C
# 2 4 1
If you want a dataframe you can do:
first1 <- tapply(dfInput$Signal, dfInput$Participants, FUN=function(x) min(which(x==1)))
data.frame(Participants=names(first1), f=first1)
Here is a variant with data.table:
library("data.table")
setDT(dfInput)
dfInput[, which(Signal==1)[1], "Participants"]
I have a dataframe with groups that essentially looks like this
DF <- data.frame(state = c(rep("A", 3), rep("B",2), rep("A",2)))
DF
state
1 A
2 A
3 A
4 B
5 B
6 A
7 A
My question is how to count the number of consecutive rows where the first value is repeated in its first "block". So for DF above, the result should be 3. The first value can appear any number of times, with other values in between, or it may be the only value appearing.
The following naive attempt fails in general, as it counts all occurrences of the first value.
DF %>% mutate(is_first = as.integer(state == first(state))) %>%
summarize(count = sum(is_first))
The result in this case is 5. So, hints on a (preferably) dplyr solution to this would be appreciated.
You can try:
rle(as.character(DF$state))$lengths[1]
[1] 3
In your dplyr chain that would just be:
DF %>% summarize(count_first = rle(as.character(state))$lengths[1])
# count_first
# 1 3
Or to be overzealous with piping, using dplyr and magrittr:
library(dplyr)
library(magrittr)
DF %>% summarize(count_first = state %>%
as.character %>%
rle %$%
lengths %>%
first)
# count_first
# 1 3
Works also for grouped data:
DF <- data.frame(group = c(rep(1,4),rep(2,3)),state = c(rep("A", 3), rep("B",2), rep("A",2)))
# group state
# 1 1 A
# 2 1 A
# 3 1 A
# 4 1 B
# 5 2 B
# 6 2 A
# 7 2 A
DF %>% group_by(group) %>% summarize(count_first = rle(as.character(state))$lengths[1])
# # A tibble: 2 x 2
# group count_first
# <dbl> <int>
# 1 1 3
# 2 2 1
No need of dplyrhere but you can modify this example to use it with dplyr. The key is the function rle
state = c(rep("A", 3), rep("B",2), rep("A",2))
x = rle(state)
DF = data.frame(len = x$lengths, state = x$values)
DF
# get the longest run of consecutive "A"
max(DF[DF$state == "A",]$len)
I have a very wide & long data set from which I need to pick out rows where any of a selection of variables meet certain conditions. So far, scoped filtering in dplyr along with any_vars are very close to what I need. To illustrate:
x <- tibble(v1 = c(1, 1, 5, 3, 4), v2 = c(3, 1, 2, 1, 2))
filter_all(x, any_vars( . == min(.)))
produces
# A tibble: 3 x 2
v1 v2
<dbl> <dbl>
1 1 3
2 1 1
3 3 1
I want to add the name of the "filtering variable" to the resulting rows as shown below:
v1 v2 var
<dbl> <dbl> <chr>
1 1 3 v1
2 1 1 v1
3 1 1 v2
4 3 1 v2
Any suggestions? I suspect that one of the map function in purrr may work to do the filtering one by one and then combine the results afterwards.
When one qualify for multiple variables (Thanks to #Moody_Mudskipper), I'd like show the row multiple times --- both with v1 and v2 in this case.
There you go, this should scale for a wide dataset.
x <- tibble(v1 = c(1, 1, 5, 3, 4), v2 = c(3, 1, 2, 1, 2))
library(dplyr)
library(tidyr)
x %>%
mutate_all(rank,ties.method ="min") %>%
gather(var,val) %>%
cbind(x,.) %>%
filter(val ==1) %>%
select(-val)
# v1 v2 var
# 1 1 3 v1
# 2 1 1 v1
# 3 1 1 v2
# 4 3 1 v2
to avoid building big temp table:
gathered <- x %>%
mutate_all(rank,ties.method ="min") %>%
gather(var,val)
rows_to_keep <- which(gathered$val == 1)
cbind(x[(rows_to_keep-1) %% nrow(x) + 1,],gathered[rows_to_keep,])
This is uglier but I think it's the most efficient I could come up with:
log_df <- mutate_all(x,function(x){x==min(x)}) # identify rows that contain min (no time wasted sorting here)
filter1 <- rowSums(log_df)>0 # to get rid of uninteresting rows
x2 <- x[filter1,]
log_df2 <- log_df[filter1,]
gathered <- gather(log_df2,var,val) # put in long format
rows_to_keep <- which(gathered$val)
cbind(x2[(rows_to_keep-1) %% nrow(x2) + 1,],gathered[rows_to_keep,]) %>% select(-val)
Try this code:
x%>%filter_all(., any_vars( . == min(.)))%>%
data.frame(.,var=apply(.,1,function(i) names(.)[i==sapply(x,min)]))
If this helps please let us know. Thank you.
This code will fail in one condition: If more than one variable in a row are minimums. for example in the example posted, if there is a row which has both 1's then this code will fail. Thank you
Thanks for the idea to create new columns, my solution below stores the variable names first prior to the filtering. Let me know if you can improve upon this:
x %>%
mutate_all(funs(qual = . == min(.))) %>%
filter_at(vars(ends_with("_qual")), any_vars(. == TRUE)) %>%
gather(var, qual, ends_with("_qual")) %>%
filter(qual==TRUE) %>%
select(-qual) %>%
extract(var, "var")
the intermediate table after the first step:
v1 v2 v1_qual v2_qual
1 1 3 TRUE FALSE
2 1 1 TRUE TRUE
3 5 2 FALSE FALSE
4 3 1 FALSE TRUE
5 4 2 FALSE FALSE
Consider the following dataframe (ordered by id and time):
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,32,1,2,6,17,24))
df
id event time
1 1 a 1
2 1 b 3
3 1 b 6
4 1 b 12
5 1 a 24
6 1 b 30
7 1 a 42
8 2 a 1
9 2 a 2
10 2 b 6
11 2 a 17
12 2 a 24
I want to count how many times a given sequence of events appears in each "id" group. Consider the following sequence with time constraints:
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
It means that event "a" can start at any time, event "b" must start no earlier than 2 and no later than 8 after event "a", another event "a" must start no earlier than 12 and no later than 18 after event "b".
Some rules for creating sequences:
Events don't need to be consecutive with respect to "time" column. For example, seq can be constructed from rows 1, 3, and 5.
To be counted, sequences must have different first event. For example, if seq = rows 8, 10, and 11 was counted, then seq = rows 8, 10, and 12 must not be counted.
The events may be included in many constructed sequences if they do not violate the second rule. For example, we count both sequences: rows 1, 3, 5 and rows 5, 6, 7.
The expected result:
df1
id count
1 1 2
2 2 2
There are some related questions in R - Identify a sequence of row elements by groups in a dataframe and Finding rows in R dataframe where a column value follows a sequence.
Is it a way to solve the problem using "dplyr"?
I believe this is what you're looking for. It gives you the desired output. Note that there is a typo in your original question where you have a 32 instead of a 42 when you define the time column in df. I say this is a typo because it doesn't match your output immediately below the definition of df. I changed the 32 to a 42 in the code below.
library(dplyr)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
df %>%
full_join(df,by='id',suffix=c('1','2')) %>%
full_join(df,by='id') %>%
rename(event3 = event, time3 = time) %>%
filter(event1 == seq[1] & event2 == seq[2] & event3 == seq[3]) %>%
filter(time1 %>% between(time_LB[1],time_UB[1])) %>%
filter((time2-time1) %>% between(time_LB[2],time_UB[2])) %>%
filter((time3-time2) %>% between(time_LB[3],time_UB[3])) %>%
group_by(id,time1) %>%
slice(1) %>% # slice 1 row for each unique id and time1 (so no duplicate time1s)
group_by(id) %>%
count()
Here's the output:
# A tibble: 2 x 2
id n
<dbl> <int>
1 1 2
2 2 2
Also, if you omit the last 2 parts of the dplyr pipe that do the counting (to see the sequences it is matching), you get the following sequences:
Source: local data frame [4 x 7]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3
<dbl> <fctr> <dbl> <fctr> <dbl> <fctr> <dbl>
1 1 a 1 b 6 a 24
2 1 a 24 b 30 a 42
3 2 a 1 b 6 a 24
4 2 a 2 b 6 a 24
EDIT IN RESPONSE TO COMMENT REGARDING GENERALIZING THIS: Yes it is possible to generalize this to arbitrary length sequences but requires some R voodoo. Most notably, note the use of Reduce, which allows you to apply a common function on a list of objects as well as foreach, which I'm borrowing from the foreach package to do some arbitrary looping. Here's the code:
library(dplyr)
library(foreach)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
multi_full_join = function(df1,df2) {full_join(df1,df2,by='id')}
df_list = foreach(i=1:length(seq)) %do% {df}
df2 = Reduce(multi_full_join,df_list)
names(df2)[grep('event',names(df2))] = paste0('event',seq_along(seq))
names(df2)[grep('time',names(df2))] = paste0('time',seq_along(seq))
df2 = df2 %>% mutate_if(is.factor,as.character)
df2 = df2 %>%
mutate(seq_string = Reduce(paste0,df2 %>% select(grep('event',names(df2))) %>% as.list)) %>%
filter(seq_string == paste0(seq,collapse=''))
time_diff = df2 %>% select(grep('time',names(df2))) %>%
t %>%
as.data.frame() %>%
lapply(diff) %>%
unlist %>% matrix(ncol=2,byrow=TRUE) %>%
as.data.frame
foreach(i=seq_along(time_diff),.combine=data.frame) %do%
{
time_diff[[i]] %>% between(time_LB[i+1],time_UB[i+1])
} %>%
Reduce(`&`,.) %>%
which %>%
slice(df2,.) %>%
filter(time1 %>% between(time_LB[1],time_UB[1])) %>% # deal with time1 bounds, which we skipped over earlier
group_by(id,time1) %>%
slice(1) # slice 1 row for each unique id and time1 (so no duplicate time1s)
This outputs the following:
Source: local data frame [4 x 8]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3 seq_string
<dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
1 1 a 1 b 6 a 24 aba
2 1 a 24 b 30 a 42 aba
3 2 a 1 b 6 a 24 aba
4 2 a 2 b 6 a 24 aba
If you want just the counts, you can group_by(id) then count() as in the original code snippet.
Perhaps it's easier to represent event sequences as strings and use regex:
df.str = lapply(split(df, df$id), function(d) {
z = rep('-', tail(d,1)$time); z[d$time] = as.character(d$event); z })
df.str = lapply(df.str, paste, collapse='')
# > df.str
# $`1`
# [1] "a-b--b-----b-----------a-----b-----------a"
#
# $`2`
# [1] "aa---b----------a------a"
df1 = lapply(df.str, function(s) length(gregexpr('(?=a.{1,7}b.{11,17}a)', s, perl=T)[[1]]))
> data.frame(id=names(df1), count=unlist(df1))
# id count
# 1 1 2
# 2 2 2