With the following data of start and end points, how can we get routes between 2 points?
> ddf
start end
1 a b
2 a c
3 b e
4 b f
5 b c
6 a d
7 e f
8 f g
> dput(ddf)
structure(list(start = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 3L,
4L), .Label = c("a", "b", "e", "f"), class = "factor"), end = structure(c(1L,
2L, 4L, 5L, 2L, 3L, 5L, 6L), .Label = c("b", "c", "d", "e", "f",
"g"), class = "factor")), .Names = c("start", "end"), class = "data.frame", row.names = c(NA,
-8L))
This page (http://www.anselm.edu/homepage/mmalita/culpro/graf1.html) shows a two-line solution in Prolog! The following code works, but it does not give the correct output list. It can be started with mainpath(ddf, 'a', 'f') to find a path between 'a' and 'f'.
mainpath = function(ddf, startpt, endpt){
  ddf$start = as.character(ddf$start)
  ddf$end = as.character(ddf$end)
  nodenames = sort(unique(c(ddf$start, ddf$end)))
  routev = c(startpt)
  if(is_arc(ddf, startpt, endpt, routev)) {
    return("Direct route found.")
  }
  else{
    nodenames = nodenames[which(nodenames != startpt)]
    nodenames = nodenames[which(nodenames != endpt)]
    if(is_path(ddf, nodenames, startpt, endpt, routev))
      return("Completed successfully")
    else
      return("Could not find a path.")
  }
}
is_arc = function(ddf, frompt, topt, routevector){
  len = nrow(ddf)
  for(i in 1:len)
    if(frompt == ddf$start[i] && topt == ddf$end[i]) {
      # Note: this appends to a local copy of routevector only; the caller's
      # vector is unchanged (R's copy-on-modify semantics), which is why
      # mainpath never sees the accumulated route.
      routevector = append(routevector, frompt)
      routevector = append(routevector, topt)
      print(routevector)
      return(TRUE)
    }
  return(FALSE)
}
is_path = function(ddf, othernodes, frompt, topt, routevector){
  if(is_arc(ddf, frompt, topt, routevector)){
    return(TRUE)
  }
  if(length(othernodes) == 0){
    print(routevector)
    return(FALSE)
  }
  for(i in 1:length(othernodes)){
    intermediate = othernodes[i]
    if(is_arc(ddf, frompt, intermediate, routevector) &&
       is_path(ddf, othernodes, intermediate, topt, routevector)){
      return(TRUE)
    }
  }
  print(routevector)
  return(FALSE)
}
I am sure it can be much improved; in particular, all these for loops could probably be replaced with apply-family functions. I know that packages with such functions are available, but how can it be done in base R?
Your answers / comments will be appreciated.
While I'm sure there are wonderful ways to do this with linear algebra, here's a relatively intuitive method (using dplyr here, but translate as you like):
library(dplyr)
# convert factors to characters, filter down to possible starting points
df %>% mutate_each(funs(as.character)) %>% filter(start == 'a') %>%
# join to add possible next steps, indexing endpoints to startpoints
left_join(df, by = c('end' = 'start')) %>%
# iterate for successive steps
left_join(df, by = c('end.y' = 'start')) %>%
left_join(df, by = c('end.y.y' = 'start')) %>%
# chop out rows that didn't end at 'g' (omit if you're curious)
filter(apply(., 1, function(x){x[length(na.omit(x))]}) == 'g')
# start end.x end.y end.y.y end
# 1 a b e f g
# 2 a b f g <NA>
If df holds factors, you'll get warnings about coercion, though it'll run fine (coerce to character at the start, or add %>% mutate_each(funs(as.character)) to each df call, and they'll go away). The column names are a bit ugly; set them with left_join's suffix parameter, or with select or rename, if you like.
Obviously the iteration of joins invites a loop, which might look like this:
df2 <- df %>% mutate_each(funs(as.character)) %>% filter(start == 'a')
for(i in 0:2){
endcol <- paste0('end', paste(rep('.y', i), collapse = ''))
df2 <- df2 %>% left_join(df, by = setNames('start', endcol))
}
df2 %>% filter(apply(., 1, function(x){x[length(na.omit(x))]}) == 'g')
# start end.x end.y end.y.y end
# 1 a b e f g
# 2 a b f g <NA>
If you set the number of iterations too high, it will error out because there are no rows to join to. The error is actually quite convenient here: the loop has already saved the df2 you want, so the error just stops extra work from being done. Add tryCatch if you like (a sketch follows the version below), or go the other direction and refactor it into a scary-looking while loop which will actually iterate the perfect number of times:
df2 <- df %>% mutate_each(funs(as.character)) %>% filter(start == 'a')
endcol <- 'end' # initialize iterating variable
while(TRUE){
  df2 <- df2 %>% left_join(df, by = setNames('start', endcol))
  endcol <- paste0(endcol, '.y')  # loop ends when the join errors out, at exactly the right step
}
df2 %>% filter(apply(., 1, function(x){x[length(na.omit(x))]}) == 'g')
# start end.x end.y end.y.y end
# 1 a b e f g
# 2 a b f g <NA>
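A minimal sketch of the tryCatch variant mentioned above (my addition; it assumes the join errors exactly when no further step can be joined, as described):
df2 <- df %>% mutate_each(funs(as.character)) %>% filter(start == 'a')
endcol <- 'end'
repeat {
  # attempt the next join; NULL signals the error that marks the end of the walk
  nxt <- tryCatch(df2 %>% left_join(df, by = setNames('start', endcol)),
                  error = function(e) NULL)
  if (is.null(nxt)) break  # df2 already holds the fully joined result
  df2 <- nxt
  endcol <- paste0(endcol, '.y')
}
df2 %>% filter(apply(., 1, function(x){x[length(na.omit(x))]}) == 'g')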
The following is a much shorter, easily understandable recursive function using base R. (The first 2 lines are not needed if the start and end columns of the data.frame being passed in are already character, not factor.)
mainpath2 = function(ddf, startpt, endpt, route=c()){
  ddf$start = as.character(ddf$start)
  ddf$end = as.character(ddf$end)
  if(startpt == endpt) return("Error: Same Start and End points.\n")
  for(i in 1:nrow(ddf)){
    if(ddf$start[i] == startpt){
      route = append(route, startpt)
      if(ddf$end[i] == endpt){
        # PATH FOUND:
        route = append(route, endpt)
        print(route)
      }
      else mainpath2(ddf[-i, ], ddf$end[i], endpt, route)
      route = route[-length(route)]
    }
  }
}
> mainpath2(ddf, 'a', 'g')
[1] "a" "b" "e" "f" "g"
[1] "a" "b" "f" "g"
R> data.frame(x1=1:3, x2=11:13, y=c('a', 'a;b', 'b'))
x1 x2 y
1 1 11 a
2 2 12 a;b
3 3 13 b
I have a data.frame in the format above, where if y contains a, then x1 is added to the result, and if y contains b, then x2 is added to the result.
For this specific example, the result should be data.frame(i=c(1,2,2,3), v=c(1, 2, 12, 13)), where i is the index. The order must be maintained as in the input. It is trivial to use element-by-element operations to perform these tasks, but is there a more efficient implementation (e.g., based on vector operations)?
Edit
A method based on *apply may be
f=data.frame(x1=1:3, x2=11:13, y=c('a', 'a;b', 'b'))
n=nrow(f)
do.call(
rbind
, lapply(seq_len(n), function(i) {
do.call(
rbind
, lapply(strsplit(f$y[[i]], ';')[[1]], function(x) {
if(x=='a') {
data.frame(i=i, v=f$x1[[i]])
} else if(x=='b') {
data.frame(i=i, v=f$x2[[i]])
} else {
NULL
}
})
)
})
)
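For comparison, a more vectorized base R sketch (my addition; it assumes y only ever contains the labels a and b):
f = data.frame(x1=1:3, x2=11:13, y=c('a', 'a;b', 'b'))
parts = strsplit(as.character(f$y), ';')
i = rep(seq_len(nrow(f)), lengths(parts))  # row index repeated once per label
lab = unlist(parts)
data.frame(i = i, v = ifelse(lab == 'a', f$x1[i], f$x2[i]))
#   i  v
# 1 1  1
# 2 2  2
# 3 2 12
# 4 3 13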
This will give you the desired output:
library(dplyr)
library(tidyr)
vector <- df %>%
  separate_rows(y) %>%
  mutate(new_col = ifelse(y=="a", x1, x2)) %>%
  pull(new_col)
dput(vector)
output:
c(1L, 2L, 12L, 13L)
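If you also want the row index i from the desired output, a possible tweak (my assumption about the intended shape):
df %>%
  mutate(i = row_number()) %>%
  separate_rows(y) %>%
  mutate(v = ifelse(y == "a", x1, x2)) %>%
  select(i, v)
#   i  v
# 1 1  1
# 2 2  2
# 3 2 12
# 4 3 13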
I don't know about efficiency for your particular case, but here is what I propose:
library(tidyr)
library(dplyr)
dat <- tibble( # First create the data
x1 = 1:3, x2 = 11:13, y = c('a', 'a;b', 'b'))
dat %>%
add_row(x1 = 23, x2 = -2, y = "bla") %>% # Add a row for testing purposes
separate_rows(y, sep = ";") %>% # separate rows with ";"
mutate(
result =
case_when( # Output either x1 or x2 based on the value in "y"
y == "a" ~ x1,
y == "b" ~ x2))
I frequently need to recode some (not all!) values in a data frame column based off of a look-up table. I'm not satisfied by the ways I know of to solve the problem. I'd like to be able to do it in a clear, stable, and efficient way. Before I write my own function, I'd want to make sure I'm not duplicating something standard that's already out there.
## Toy example
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
## desired result
# id x
# 1 1 a
# 2 2 a
# 3 3 B
# 4 4 C
# 5 5 d
# 6 6 AA
# 7 7 !
I can do it with a join, coalesce, unselect as below, but this isn't as clear as I'd like - too many steps.
## This works, but is more steps than I want
library(dplyr)
data %>%
left_join(lookup, by = c("x" = "old")) %>%
mutate(x = coalesce(new, x)) %>%
select(-new)
It can also be done with dplyr::recode, as below, converting the lookup table to a named lookup vector. I prefer the lookup as a data frame, but I'm okay with the named vector solution. My concern here is that recode is in the Questioning lifecycle phase, so I'm worried that this method isn't stable.
lookup_v = pull(lookup, new) %>% setNames(lookup$old)
data %>%
mutate(x = recode(x, !!!lookup_v))
It could also be done with, say, stringr::str_replace, but using regex for whole-string matching isn't efficient. I suppose forcats::fct_recode is a stable version of recode, but I don't want factor output (though mutate(x = as.character(fct_recode(x, !!!lookup_v))) is perhaps my favorite option so far...).
I had hoped that the new-ish rows_update() family of dplyr functions would work, but it is strict about column names, and I don't think it can update the column it's joining on. (And it's Experimental, so doesn't yet meet my stability requirement.)
Summary of my requirements:
A single data column is updated based off of a lookup data frame (preferably) or named vector (allowable)
Not all values in the data are included in the lookup--the ones that are not present are not modified
Must work on character class input. Working more generally is a nice-to-have.
No dependencies outside of base R and tidyverse packages (though I'd also be interested in seeing a data.table solution)
No functions used that are in lifecycle phases like superseded or questioning. Please note any experimental lifecycle functions, as they have future potential.
Concise, clear code
I don't need extreme optimization, but nothing wildly inefficient (like regex when it's not needed)
A direct data.table solution, without %in%.
Depending on the length of the lookup/data tables, adding keys could improve performance substantially, though it doesn't in this simple example.
library(data.table)
setDT(data)
setDT(lookup)
## If needed
# setkey(data,x)
# setkey(lookup,old)
data[lookup, x:=new, on=.(x=old)]
data
id x
1: 1 a
2: 2 a
3: 3 B
4: 4 C
5: 5 d
6: 6 AA
7: 7 !
Benchmarking
Expanding the original dataset to 10M rows, 15 runs using microbenchmark gave the following results on my computer:
Note that the forcats::fct_recode and dplyr::recode solutions mentioned by the OP have also been included. Neither works with the updated data, because a named vector that resolves to . = ! will throw an error, which is why results are tested on the original dataset.
data = data.frame(
id = 1:5,
x = c("A", "A", "B", "C", "D")
)
lookup = data.frame(
old = c("A", "D"),
new = c("a", "d")
)
set.seed(1)
data <- data[sample(1:5, 1E7, replace = T),]
dt_lookup <- data.table::copy(lookup)
dplyr_coalesce <- function(){
library(dplyr)
lookupV <- setNames(lookup$new, lookup$old)
data %>%
dplyr::mutate(x = coalesce(lookupV[ x ], x))
}
datatable_in <- function(){
library(data.table)
lookupV <- setNames(lookup$new, lookup$old)
setDT(dt_data)
dt_data[ x %in% names(lookupV), x := lookupV[ x ] ]
}
datatable <- function(){
library(data.table)
setDT(dt_data)
setDT(dt_lookup)
## If needed
# setkey(data,x)
# setkey(lookup,old)
dt_data[dt_lookup, x:=new, on =.(x=old)]
}
purrr_modify_if <- function(){
library(dplyr)
library(purrr)
lookupV <- setNames(lookup$new, lookup$old)
data %>%
dplyr::mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))
}
stringr_str_replace_all_update <- function(){
library(dplyr)
library(stringr)
lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))
data %>%
dplyr::mutate(x = str_replace_all(x, lookupV))
}
base_named_vector <- function(){
lookupV <- c(with(lookup, setNames(new, old)), rlang::set_names(setdiff(unique(data$x), lookup$old)))
lookupV[data$x]
}
base_ifelse <- function(){
lookupV <- setNames(lookup$new, lookup$old)
with(data, ifelse(x %in% lookup$old, lookupV[x], x))
}
plyr_mapvalues <- function(){
library(plyr)
data %>%
dplyr::mutate(x = plyr::mapvalues(x, lookup$old, lookup$new, warn_missing = F))
}
base_match <- function(){
tochange <- match(data$x, lookup$old, nomatch = 0)
data$x[tochange > 0] <- lookup$new[tochange]
}
base_local_safe_lookup <- function(){
lv <- structure(lookup$new, names = lookup$old)
safe_lookup <- function(val) {
new_val <- lv[val]
unname(ifelse(is.na(new_val), val, new_val))
}
safe_lookup(data$x)
}
dplyr_recode <- function(){
library(dplyr)
lookupV <- setNames(lookup$new, lookup$old)
data %>%
dplyr::mutate(x = recode(x, !!!lookupV))
}
base_for <- function(){
for (i in seq_len(nrow(lookup))) {
data$x[data$x == lookup$old[i]] = lookup$new[i]
}
}
datatable_for <- function(){
library(data.table)
setDT(dt_data)
for (i in seq_len(nrow(lookup))) {
dt_data[x == lookup$old[i], x := lookup$new[i]]
}
}
forcats_fct_recode <- function(){
library(dplyr)
library(forcats)
lookupV <- setNames(lookup$new, lookup$old)
data %>%
dplyr::mutate(x = as.character(fct_recode(x, !!!lookupV)))
}
datatable_set <- function(){
library(data.table)
setDT(dt_data)
tochange <- dt_data[, chmatch(x, lookup$old, nomatch = 0)]
set(dt_data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])
}
library(microbenchmark)
bench <- microbenchmark(dplyr_coalesce(),
datatable(),
datatable_in(),
datatable_for(),
base_for(),
purrr_modify_if(),
stringr_str_replace_all_update(),
base_named_vector(),
base_ifelse(),
plyr_mapvalues(),
base_match(),
base_local_safe_lookup(),
dplyr_recode(),
forcats_fct_recode(),
datatable_set(),
times = 15L,
setup = dt_data <- data.table::copy(data))
bench$expr <- forcats::fct_rev(forcats::fct_reorder(bench$expr, bench$time, mean))
ggplot2::autoplot(bench)
Thanks to @Waldi and @nicola for advice implementing the data.table solutions in the benchmark.
Combination of a named vector and coalesce (values missing from lookupV index to NA, which coalesce then fills from the original x):
# make lookup vector
lookupV <- setNames(lookup$new, lookup$old)
data %>%
mutate(x = coalesce(lookupV[ x ], x))
# id x
# 1 1 a
# 2 2 a
# 3 3 B
# 4 4 C
# 5 5 d
Or data.table:
library(data.table)
setDT(data)
data[ x %in% names(lookupV), x := lookupV[ x ] ]
This post might have a better solution for data.table - "update on merge":
R data table: update join
A base R option using match - thanks to @LMc & @nicola
tochange <- match(data$x, lookup$old, nomatch = 0)
data$x[tochange > 0] <- lookup$new[tochange]
One more data.table option using set() and chmatch
library(data.table)
setDT(data)
tochange <- data[, chmatch(x, lookup$old, nomatch = 0)]
set(data, i = which(tochange > 0), j = "x", value = lookup$new[tochange])
Result
data
# id x
#1 1 a
#2 2 a
#3 3 B
#4 4 C
#5 5 d
#6 6 AA
#7 7 !
modify_if
You could use purrr::modify_if to only apply the named vector to values that exist in it. Though not a specified requirement, it has the benefit of the .else argument, which allows you to apply a different function to values not in your lookup.
I also wanted to include the use of tibble::deframe here to create the named vector. It is slower than setNames, though.
library(dplyr)
library(purrr)
library(tibble)
lookupV <- deframe(lookup)
data %>%
mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x]))
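The .else argument is not needed for the question's requirements, but here is a small sketch of the idea (my illustration): recode the matches and, say, lower-case everything else.
data %>%
  mutate(x = modify_if(x, x %in% lookup$old, ~ lookupV[.x], .else = tolower))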
str_replace_all
Alternatively, you could use stringr::str_replace_all, which can take a named vector for the replacement argument.
data %>%
mutate(x = str_replace_all(x, lookupV))
Update
To accommodate the change in your edited example, the named vector used in str_replace_all needs to be modified: the entire literal string must be matched, so that "A" does not get substituted inside "AA" and "." does not replace everything:
lookupV <- setNames(lookup$new, do.call(sprintf, list("^\\Q%s\\E$", lookup$old)))
data %>%
mutate(x = str_replace_all(x, lookupV))
left_join
Using dplyr::left_join this is very similar to the OP's solution, but uses the .keep argument of mutate, so it has fewer steps. This argument is currently in the experimental lifecycle, so it is not included in the benchmark (though it lands around the middle of the posted solutions).
left_join(data, lookup, by = c("x" = "old")) %>%
mutate(x = coalesce(new, x) , .keep = "unused")
Base R
Named Vector
Create a substitution value for every unique value in your dataframe.
lookupV <- c(with(lookup, setNames(new, old)), setNames(nm = setdiff(unique(data$x), lookup$old)))
data$x <- lookupV[data$x]
ifelse
with(data, ifelse(x %in% lookup$old, lookupV[x], x))
Another option that is clear is to use a for-loop with subsetting to loop through the rows of the lookup table. This will almost always be quicker with data.table because of auto-indexing, or if you set the key (i.e., ?data.table::setkey()) ahead of time. It will, of course, get slower as the lookup table gets longer; I would guess an update join would be preferred for a long lookup table.
Base R:
for (i in seq_len(nrow(lookup))) {
data$x[data$x == lookup$old[i]] <- lookup$new[i]
}
data$x
# [1] "a" "a" "B" "C" "d" "AA" "!"
Or the same logic with data.table:
library(data.table)
setDT(data)
for (i in seq_len(nrow(lookup))) {
data[x == lookup$old[i], x := lookup$new[i]]
}
data$x
# [1] "a" "a" "B" "C" "d" "AA" "!"
Data:
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
Another base solution, with a lookup vector:
## Toy example
data = data.frame(
id = 1:5,
x = c("A", "A", "B", "C", "D"),
stringsAsFactors = F
)
lookup = data.frame(
old = c("A", "D"),
new = c("a", "d"),
stringsAsFactors = F
)
lv <- structure(lookup$new, names = lookup$old)
safe_lookup <- function(val) {
new_val <- lv[val]
unname(ifelse(is.na(new_val), val, new_val))
}
data$x <- safe_lookup(data$x)
A dplyr + plyr solution that meets all your bullet points (if you consider plyr part of the tidyverse):
data <- data %>%
  dplyr::mutate(
    x = plyr::mapvalues(x, lookup$old, lookup$new)  # add warn_missing = FALSE to silence warnings
  )
I basically share the same problem. Although dplyr::recode is in the "questioning" lifecycle stage, I don't expect it to become deprecated. At some point it might be superseded, but even in that case it should still be usable. Therefore I'm using a wrapper around dplyr::recode which allows the use of a named vector and/or two vectors (which could be a lookup table).
library(dplyr)
library(rlang)
recode2 <- function(x, new, old = NULL, .default = NULL, .missing = NULL) {
if (!rlang::is_named(new) && !is.null(old)) {
new <- setNames(new, old)
}
do.call(dplyr::recode,
c(.x = list(x),
.default = list(.default),
.missing = list(.missing),
as.list(new)))
}
data = data.frame(
id = 1:7,
x = c("A", "A", "B", "C", "D", "AA", ".")
)
lookup = data.frame(
old = c("A", "D", "."),
new = c("a", "d", "!")
)
# two vectors new / old
data %>%
mutate(x = recode2(x, lookup$new, lookup$old))
#> id x
#> 1 1 a
#> 2 2 a
#> 3 3 B
#> 4 4 C
#> 5 5 d
#> 6 6 AA
#> 7 7 !
# named vector
data %>%
mutate(x = recode2(x, c("A" = "a",
"D" = "d",
"." = "!")))
#> id x
#> 1 1 a
#> 2 2 a
#> 3 3 B
#> 4 4 C
#> 5 5 d
#> 6 6 AA
#> 7 7 !
Created on 2021-04-21 by the reprex package (v0.3.0)
What I am attempting here is to count the number of sequences in a data set that go from A immediately to C and then, after some time in C, arrive at L. I want to count the number of times this occurs and the average time it takes, within time periods sectioned off by t_1, t_2, etc.
Say I have a data frame in R with headings ID, t_1, t_2, t_3, ..., where each cell can take the value A, C, or L. Given a huge amount of data, how would I find the number of sequences that start with A, immediately followed by C, and then (moving along the columns for an individual, after any amount of time) arrive at state L?
Let's say the data I have is path, which describes the path that a person with a given ID number goes through at each time point. My attempt at solving the problem is below, but it is extremely inefficient, as I need to spell out all the cases across all the time points. How can this be achieved efficiently in R? Thank you! :)
For Example:
ID <- c("i_1", "i_2", "i_3", "i_4")
t_1 <- c("A","C","A","C")
t_2 <- c("C","A","C","L")
t_3 <- c("L","C","L","L")
t_4 <- c("C","L","L","L")
path <-data.frame("ID" = ID, "t_1" = t_1, "t_2"=t_2, "t_3" = t_3, "t_4" = t_4)
path
diff_path_01 <- path[path$t_1 =="A" & path$t_2 == "C" &path$t_3 == "L",]
diff_path_01
diff_path_02 <- path[path$t_1 =="A" & path$t_2 == "C" &path$t_3 == "C" & path$t_4 == "L",]
diff_path_02
diff_path_03 <- path[path$t_2 =="A" & path$t_3 == "C" &path$t_4 == "L",]
diff_path_03
nrow(diff_path_03)
count <- nrow(diff_path_01)+nrow(diff_path_02) +nrow(diff_path_03)
count
So count is the number of sequences going A > C > L.
However, for the average time, I am not sure how to attempt it. I know I should be counting the C elements between the A and the L, but I don't know how to implement that.
Hope someone can help, thank you!
One way to do it is to create a single string for each row containing the complete sequence. From this you can use str_extract_all() to extract all occurrences of the specified sequence from the string.
As an example, I used a different vector than yours to show more occurrences:
library(stringr)
x <- c("A", "C", "L", "A", "L", "A", "C", "C", "L", "A", "C", "A", "L", "A", "C", "C", "C", "L")
ACL <- unlist(str_extract_all(paste0(x, collapse = ""), "AC+?L"))
ACL
#[1] "ACL" "ACCL" "ACCCL"
AC+?L is a regular expression that matches a sequence beginning with A and ending with L, with at least one C in between (the ? makes the C+ non-greedy, so each match stops at the first L).
You can then easily extend this to your whole data set and calculate the number of occurrences and the average time it takes.
apply(path[, -1], 1, function(x) {
ACL <- unlist(str_extract_all(paste0(x, collapse = ""), "AC+?L"))
c(count = length(ACL), average_time = mean(nchar(ACL)))
})
# [,1] [,2] [,3] [,4]
# count 1 1 1 0
# average_time 3 3 3 NaN
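Note that nchar(ACL) measures the full match, including the bracketing A and L; if the average should count only the C's in between (as the question suggests), mean(nchar(ACL) - 2) would do it.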
Not sure if this is what you want. Here count is TRUE if the path satisfies your rule, and avgtime counts the number of C's between A and L.
# concatenate all alphabets in the data frame as a long string
s <- paste0(as.vector(t(path[-1])),collapse = "")
# divide the long string `s` into sub-string array `v` (the number of elements in the array equals to the number of rows of data frame)
v <- substring(s,seq(1,nchar(s)-3,4),seq(4,nchar(s),4))
# find the index where the sub-strings in `v` match the pattern `AC...L`
idx <- grep("AC.*?L",v)
# create data frame `df`
df <- data.frame(ID = path$ID,count = FALSE,avgtime = NA)
# assign the index in `count` to `TRUE` according to the matched search
df$count[idx] <- TRUE
# count the number of `C` by `nchar()` where the sub-string is in the form of `A...L`.
df$avgtime[idx] = nchar(gsub("[AL]","",gsub(".*?(A.*?L).*","\\1",v[idx])))
Using different data (not what you posted) as an example:
path <- structure(list(ID = structure(1:4, .Label = c("i_1", "i_2", "i_3",
"i_4"), class = "factor"), t_1 = structure(c(1L, 1L, 1L, 2L), .Label = c("A",
"C"), class = "factor"), t_2 = structure(c(2L, 1L, 2L, 1L), .Label = c("A",
"C"), class = "factor"), t_3 = structure(c(2L, 1L, 1L, 2L), .Label = c("C",
"L"), class = "factor"), t_4 = structure(c(1L, 2L, 2L, 2L), .Label = c("C",
"L"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
> path
ID t_1 t_2 t_3 t_4
1 i_1 A C L C
2 i_2 A A C L
3 i_3 A C C L
4 i_4 C A L L
we can get the result like below:
> df
ID count avgtime
1 i_1 TRUE 1
2 i_2 TRUE 1
3 i_3 TRUE 2
4 i_4 FALSE NA
I have a data frame. I'm trying to remove rows that have values in a column that match other rows that were conditionally removed. Let me provide a simple example for better explaining.
I tried using this previous post as a starting point:
Remove Rows From Data Frame where a Row match a String
>dat
A,B,C
4,3,Foo
2,3,Bar
1,2,Bar
7,5,Zap
First remove rows with "Foo" in column C:
dat[!grepl("Foo", dat$C),]
Now I want to remove any additional rows that have values in column B that match the values in rows with Foo. So in this example, any rows with B = 3 would be removed because row 1 has Foo, which was removed and has B=3.
>dat.new
1,2,Bar
7,5,Zap
Any ideas on how to do this would be appreciated.
We subset the 'B' values where 'C' is 'Foo', create a logical vector by checking those values against 'B', negate it (!), and add the condition that 'C' is not "Foo":
library(dplyr)
dat.new <- dat %>%
filter(!B %in% B[C == 'Foo'], C != 'Foo')
dat.new
# A B C
#1 1 2 Bar
#2 7 5 Zap
Or in base R with subset
subset(dat, !B %in% B[C == 'Foo'] & C != "Foo")
data
dat <- structure(list(A = c(4L, 2L, 1L, 7L), B = c(3L, 3L, 2L, 5L),
C = c("Foo", "Bar", "Bar", "Zap")), row.names = c(NA, -4L
), class = "data.frame")
I have a data.frame with log of sequences of events. Here, sequence 1 is composed of event A, then B, then C, each starting at a specific timestamp (in seconds).
df=data.frame(id=runif(10, 1e6, 1e7), sequence = c(1,1,1,2,2,3,3,3,4,4), event=c("A", "B", "C", "B", "C", "A", "B", "C", "B", "C"), starts_at=c(20,22,24,20,30,20,21,23,20,40))
What I want is to group my data.frame by type of sequence (there are dozens of types, length 2 to 6): A->B->C or B->C, and then to get some results on those types. Desired output would be:
#### sequence_type number.appearances mean.delay.between.events
#### 1 ABC 2 1.5 / 2
#### 2 BC 2 15
The last column, "mean delay", would be a string composed of the mean time differences between successive events in a sequence: in the ABC sequence, there are on average 1.5 seconds between A and B, and 2 between B and C.
I also thought of "spreading" each mean difference into new columns diff.1, diff.2, ..., but that seems complicated since sequences have different lengths. Though I'm open to different ways of presenting this information.
So far I've come up with:
library(dplyr)
df %>% group_by(sequence) %>% arrange(starts_at) %>%
  summarise(sequence_type = paste0(event, collapse="")) %>%
  group_by(sequence_type) %>% tally
I didn't find how to achieve the second part. Thanks for the help...
This might not be the elegant solution you would get with dplyr, but I think it is general enough that it would work with your real data.
First you just need to get the corresponding sequence type for each row of your data, that is ayuda_seq:
library(zoo)
df=data.frame(id=runif(14, 1e6, 1e7), sequence = c(1,1,1,2,2,3,3,3,4,4,5,5,5,5),
event=c("A", "B", "C", "B", "C", "A", "B", "C", "B", "C","A","B","C","D"),
starts_at=c(20,22,24,20,30,20,21,23,20,40,20,22,21,15))
ayuda_seq = sapply(df$sequence, function(x) paste0(df[df$sequence == x,3],collapse = ""))
and then you just loop through the unique sequences and generate the sub-sequences of each 2 consecutive events.
vec_means = NULL
for(x in unique(ayuda_seq)){
data_temp = df[ayuda_seq == x,]
diff_temp = diff(data_temp$starts_at)
temp_sub = apply(rollapply(data_temp[,3],FUN = paste0,width = 2),1,paste0,collapse = "")
mean_temp = aggregate(diff_temp,by = list(temp_sub),mean)
if(all(!duplicated(temp_sub))){
averages = paste0(mean_temp[,2],collapse = " / ")
} else{
averages = paste0(mean_temp[match(temp_sub[duplicated(temp_sub)],mean_temp[,1]),2],collapse = " / ")
}
vec_means = c(vec_means,averages)
}
df_res = data.frame(sequence_type = unique(ayuda_seq),
number.appearances = as.numeric(table(ayuda_seq)/nchar(unique(ayuda_seq))),
mean.delay.between.events = vec_means)
The variable temp_sub holds the different pair combinations within the original string you are looping over. In the case of "ABC" a spurious pair "CA" appears (it spans the boundary between two consecutive ABC sequences), which is not taken into consideration because it is unique.
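For instance, for the "ABCD" sequence added above, temp_sub comes out as c("AB", "BC", "CD"): the rolling pairs whose start-time differences aggregate() then averages.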
Not pretty, but it works
tmp <- df %>% group_by(sequence) %>%
  dplyr::arrange(sequence, starts_at) %>%
  dplyr::mutate(seq_row_num = dplyr::row_number(),
                lead_starts_at = dplyr::lead(starts_at, n = 1)) %>%
  base::as.data.frame()
tmp <- tmp %>% dplyr::group_by(sequence) %>%
  mutate(max_seq_len = max(seq_row_num)) %>%
  base::as.data.frame()
tmp$seq_len_id<- paste0(tmp$sequence, tmp$max_seq_len)
tmp$next_seq_val<- tmp$seq_row_num + 1
tmp$next_seq_val<- base::ifelse(tmp$next_seq_val >= tmp$max_seq_len, tmp$max_seq_len, tmp$next_seq_val)
tmp_seq_labels<- stats::aggregate(tmp$event, list(tmp$seq_len_id), paste, collapse='')
tmp<- base::merge(tmp, tmp_seq_labels, by.x="seq_len_id", by.y="Group.1")
colnames(tmp)[which(colnames(tmp)=="x")]<- "seq_group"
tmp$within_group_step<-"ZZ"
tmp$within_group_step<- base::ifelse(tmp$seq_row_num != tmp$max_seq_len, substr(tmp$seq_group, start = tmp$seq_row_num, stop =tmp$next_seq_val), tmp$within_group_step)
tmp$within_step_by_group_id<- paste0(tmp$seq_group, tmp$within_group_step)
tmp$time_diff<- 0
tmp$time_diff<- base::ifelse(!is.na(tmp$lead_starts_at), tmp$lead_starts_at - tmp$starts_at, tmp$time_diff)
res<- stats::aggregate(time_diff ~ within_step_by_group_id + seq_group + within_group_step, data=tmp, FUN=mean)
drops<- grep(pattern = "ZZ", x = res$within_step_by_group_id)
if(length(drops)>=1){
res<- res[-drops,]
}
colnames(res)<- c("Full_Group_Pattern", "Group_Pattern", "Sub_Group_Pattern", "Mean_Time_Difference")
res<- res %>% dplyr::group_by(Group_Pattern) %>%
dplyr::mutate(Number_of_Appearances=n()) %>% base::as.data.frame()
Here is the result: