Group index from column labeling the last element in each group - r

I'm trying to subset a data frame. The data frame is to be broken into subsets, where the last element in each subset has a "TRUE" value in the "bool" column. Consider the following data frame:
df <- data.frame(c(3,1,3,4,1,1,4), rnorm(7))
df <- cbind(df, df[,1] != 1)
names(df) <- c("ind", "var", "bool")
df
# ind var bool
# 1 3 0.02343906 TRUE
# 2 1 0.94786193 FALSE
# 3 3 0.50632766 TRUE
# 4 4 0.24655548 TRUE
# 5 1 -1.58103304 FALSE
# 6 1 0.73999468 FALSE
# 7 4 0.10929906 TRUE
Row 1 should be a subset, rows 2 and 3 should be a subset, row 4 a subset and then rows 5 through 7 a subset. The code I have below works (I can subset on the new column), but I was wondering if there was a more "R" way of doing it.
index = 1
for (i in 1:nrow(df))
{
if(df$bool[i])
{df$index[i] = index
index = index + 1
}
else
{df$index[i] = index
}
}
df
# ind var bool index
# 1 3 0.02343906 TRUE 1
# 2 1 0.94786193 FALSE 2
# 3 3 0.50632766 TRUE 2
# 4 4 0.24655548 TRUE 3
# 5 1 -1.58103304 FALSE 4
# 6 1 0.73999468 FALSE 4
# 7 4 0.10929906 TRUE 4

The first thought I would have would be to use the cumulative sum (cumsum) on the bool column to get the group indices -- this will increase the index value by 1 every time the bool value is TRUE:
df$index <- cumsum(df$bool)
df
# ind var bool index
# 1 3 -1.0712125 TRUE 1
# 2 1 0.4994369 FALSE 1
# 3 3 2.1335274 TRUE 2
# 4 4 -1.5950432 TRUE 3
# 5 1 0.5919880 FALSE 3
# 6 1 2.7039831 FALSE 3
# 7 4 -1.3526646 TRUE 4
This is not quite right because all the observations before the TRUE of each group are assigned to the previous group. We can fix that by adding 1 for all the observations with bool set to FALSE:
df$index <- cumsum(df$bool) + !df$bool
df
# ind var bool index
# 1 3 -1.0712125 TRUE 1
# 2 1 0.4994369 FALSE 2
# 3 3 2.1335274 TRUE 2
# 4 4 -1.5950432 TRUE 3
# 5 1 0.5919880 FALSE 4
# 6 1 2.7039831 FALSE 4
# 7 4 -1.3526646 TRUE 4
Splitting the data frame into a list of subsets can now be achieved efficiently with subsets <- split(df, df$index).

Related

How can I flag a row where a value appears in more than one group in R?

I would like to add a column that indicates whether a value appears in more than one group. Using the below example, value '4' appears in groups '1' and '2', so I would like to flag that value.
x = matrix(c(1,1,1,2,2,2,3,3,4,4,5,4), nrow = 6, ncol = 2, byrow = F)
x = data.frame(x)
x
# X1 X2
# 1 1 3
# 2 1 3
# 3 1 4
# 4 2 4
# 5 2 5
# 6 2 4
This would be the desired output:
# X1 X2 FLAG
# 1 1 3 False
# 2 1 3 False
# 3 1 4 True
# 4 2 4 True
# 5 2 5 False
# 6 2 4 True
We can create the flag by using n_distinct after grouping by 'X2'
library(dplyr)
x %>%
group_by(X2) %>%
mutate(FLAG = n_distinct(X1) > 1)
Here is a base R option using àve
transform(
x,
FLAG = ave(X1, X2, FUN = function(v) length(unique(v))) > 1
)
or aggregate + subset
transform(
x,
FLAG = X2 %in% subset(aggregate(. ~ X2, x, function(x) length(unique(x))), X1 > 1)$X2
)
which gives
X1 X2 FLAG
1 1 3 FALSE
2 1 3 FALSE
3 1 4 TRUE
4 2 4 TRUE
5 2 5 FALSE
6 2 4 TRUE
For completion here is data.table version :
library(data.table)
setDT(x)[, FLAG := uniqueN(X1) > 1, X2]
x
# X1 X2 FLAG
#1: 1 3 FALSE
#2: 1 3 FALSE
#3: 1 4 TRUE
#4: 2 4 TRUE
#5: 2 5 FALSE
#6: 2 4 TRUE

Understanding R's anyDuplicated

Quick question in understanding's R's anyDuplicated, when passed on a dataframe (lets say x y z columns with 1k observations) will if check if any of the rows has the exact same x y z values as another row in the same dataframe? Thanks
I would use duplicated and combine it from front to back.
mydf <- data.frame(x = c(1:3,1,1), y = c(3:5,3,3))
mydf
# x y
# 1 1 3
# 2 2 4
# 3 3 5
# 4 1 3
# 5 1 3
There three duplicated rows 1, 4, and 5. But 'duplicated' will only mark what is duplicated not the original value also.
duplicated(mydf)
#[1] FALSE FALSE FALSE TRUE TRUE
duplicated(mydf, fromLast = TRUE)
#[1] TRUE FALSE FALSE TRUE FALSE
Using from last looks from the end to front to include the original value. By the way, I will ask the R core team to add a unified function to do both.
myduplicates <- duplicated(mydf) | duplicated(mydf, fromLast = TRUE)
Saving the expression as a variable allows us to count and subset later.
sum(myduplicates)
#[1] 3
mydf[myduplicates,]
# x y
# 1 1 3
# 4 1 3
# 5 1 3
mydf[!myduplicates,]
# x y
# 2 2 4
# 3 3 5

counting lengths between alternating columns

I am trying to figure out how to count the number of rows from when one column says True to when the other column says True. I attempted to use run length encoding but couldnt figure out how to get the alternating values form each column.
set.seed(42)
s<-sample(c(0,1,2,3),500,replace=T)
isOverbought<-s==1
isOverSold<-s==0
head(cbind(isOverbought,isOverSold),20)
res<-rle(isOverSold)
tt<-res[res$values==0] #getting when Oversold is true
> head(cbind(isOverbought,isOverSold))
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] TRUE FALSE <-starting condition is overbought
[4,] FALSE FALSE
[5,] FALSE FALSE
[6,] FALSE FALSE
[7,] FALSE FALSE
[8,] FALSE TRUE <-is oversold. length from overbought to oversold = 5
[9,] FALSE FALSE
[10,] FALSE FALSE
[11,] TRUE FALSE <- is overbought. length from oversold to overbought = 3
[12,] FALSE FALSE
[13,] FALSE FALSE
[14,] TRUE FALSE
[15,] TRUE FALSE
[16,] FALSE FALSE
[17,] FALSE FALSE
[18,] FALSE TRUE <-is oversold. length from overbought to oversold = 7
[19,] TRUE FALSE <- is overbought. length from oversold to overbought = 1
[20,] FALSE FALSE
GOAL
overboughtTOoversold oversoldTOoverbought
5 3
7 1
This is sufficient to solve your problem.
## `a` to `b`
a2b <- function (a, b) {
x <- which(a) ## position of `TRUE` in `a`
y <- which(b) ## position of `TRUE` in `b`
z <- which(a | b) ## position of all `TRUE`
end <- match(y, z) ## match for end position
start <- c(1L, end[-length(end)] + 1L) ## start position
valid <- end > start ## remove cases with `end = start`
z[end[valid]] - z[start[valid]]
}
## cross `a` and `b`
axb <- function (a, b) {
if (any(a & b))
stop ("Invalid input! `a` and `b` can't have TRUE at the same time!")
x <- a2b(a, b); y <- a2b(b, a)
if (which(a)[1L] < which(b)[1L]) cbind(a2b = x, b2a = c(NA_integer_, y))
else cbind(a2b = c(NA_integer_, x), b2a = y)
}
For your isOverbought and isOverSold, we obtain:
result <- axb(isOverbought, isOverSold)
head(result)
# a2b b2a
#[1,] 5 NA
#[2,] 7 3
#[3,] 3 1
#[4,] 8 5
#[5,] 2 6
#[6,] 10 2
Since isOverbought has the first TRUE before isOverSold, the first element of the 2nd column is NA.
The assumption for this answer is that there is at least one overbought/oversold transition (either direction) and hence at least two rows in the data. This condition can easily be checked by counting the number of overbought and oversold conditions and making sure that both are greater than one.
The key is to remove the consecutive overbought and oversold conditions so that we only have alternating overbought and oversold conditions. One way to do this is:
## detect where we are overbought and oversold
i1 <- which(isOverbought)
i2 <- which(isOverSold)
## concatenate into one vector
i3 <- c(i1,i2)
## sort these and get the indices from the sort
i4 <- order(i3)
## at this point consecutive overbought or oversold conditions
## will be marked by a difference of 1 in i4 while alternating
## conditions will be marked by something other than 1. So
## filter those out to get i6. BTW, consecutive here does not mean
## consecutive rows in the data but consecutive occurrence of
## either overbought or oversold conditions without an intervening
## condition of the other. The assumption for at least one transition
## in the data is needed for this to work.
i5 <- diff(i4)
i6 <- i4[c(1,which(i5 != 1)+1)]
## then recover the alternating rows of overbought and oversold conditions in i7
i7 <- i3[i6]
## take the difference and format the output
## I need to credit #akrun for this part
i8 <- diff(i7)
## need to determine which is first
if (i1[1] < i2[1]) {
overboughtTOoversold <- i8[c(TRUE, FALSE)]
oversoldTOoverbought <- i8[c(FALSE, TRUE)]
} else {
overboughtTOoversold <- i8[c(FALSE, TRUE)]
oversoldTOoverbought <- i8[c(TRUE, FALSE)]
}
d1 <- cbind(overboughtTOoversold, oversoldTOoverbought)
print(head(d1))
## overboughtTOoversold oversoldTOoverbought
##[1,] 5 3
##[2,] 7 1
##[3,] 3 5
##[4,] 8 6
##[5,] 2 2
##[6,] 10 4
The cbind may generate a warning that the columns are not the same length. To get rid of that, just pad with NA at the end as appropriate.
A more compact version of the above is:
i3 <- c(which(isOverbought), which(isOverSold))
i4 <- order(i3)
i8 <- diff(i3[i4[c(1,which(diff(i4) != 1)+1)]])
if (which(isOverbought)[1] < which(isOverSold)[1]) {
overboughtTOoversold <- i8[c(TRUE, FALSE)]
oversoldTOoverbought <- i8[c(FALSE, TRUE)]
} else {
overboughtTOoversold <- i8[c(FALSE, TRUE)]
oversoldTOoverbought <- i8[c(TRUE, FALSE)]
}
d1 <- cbind(overboughtTOoversold, oversoldTOoverbought)
Here is a short version:
create a vector called mktState. Encode it with 1 if overbought is TRUE, -1 if oversold is TRUE and NA if both first 2 cols are FALSE.( You are interested only in days where the market state switches)
use na.locf to fill the NAs with the last observation carried forward
now use the rle function
mktState <- ifelse(df$overBought == TRUE,1,ifelse(df$overSold == TRUE,-1,NA))
mktState <- na.locf(mktState)
to get 'overbought' runs:
> rle(mktState)$lengths[rle(mktState)$values == 1]
[1] 5 7 3 8 2 10 7 3 1 2 4 2 5 6 3 11 4 1 5 2 4 6 1 1 8
[26] 7 3 1 1 1 1 3 2 3 1 6 1 1 1 3 2 4 2 1 6 8 8 1 5 15
[51] 2 5 4 2 1 1 3 4 7 1 7 11 1 3 4 2 4 1
and this will give you the 'oversold' runs:
> rle(mktState)$lengths[rle(mktState)$values == -1]
[1] 3 1 5 6 2 4 1 4 3 3 3 5 2 4 1 14 2 2 10 3 7 1 13 1 1
[26] 3 3 1 6 5 2 1 8 7 2 3 1 1 3 5 1 1 2 3 1 2 2 3 3 1
[51] 8 9 4 2 1 6 2 1 3 2 4 5 1 3 7 4 2 2
Here's a [somewhat long] tidyverse version:
library(dplyr)
library(tidyr)
# put vectors in a data.frame
data.frame(isOverbought, isOverSold) %>%
# evaluate each row separately
rowwise() %>%
# add column with name of event for any TRUE, else NA
mutate(change_type = ifelse(isOverbought | isOverSold, names(.)[c(isOverbought, isOverSold)], NA)) %>%
# reset grouping
ungroup() %>%
# replace NA values with last non-NA value
fill(change_type) %>%
# add a column of the cumulate number of changes in change_type
mutate(changes = data.table::rleid(change_type)) %>%
# count number of rows in each changes and change_type grouping
count(changes, change_type) %>%
# remove leading NAs
na.omit() %>%
# reset grouping
ungroup() %>%
# edit change into runs of two with integer division
mutate(changes = changes %/% 2) %>%
# spread to wide form
spread(change_type, n) %>%
# get rid of extra column
select(-changes)
## # A tibble: 68 x 2
## isOverbought isOverSold
## * <int> <int>
## 1 5 3
## 2 7 1
## 3 3 5
## 4 8 6
## 5 2 2
## 6 10 4
## 7 7 1
## 8 3 4
## 9 1 3
## 10 2 3
## # ... with 58 more rows

If row meets criteria, then TRUE else FALSE in R

I have nested data that looks like this:
ID Date Behavior
1 1 FALSE
1 2 FALSE
1 3 TRUE
2 3 FALSE
2 5 FALSE
2 6 TRUE
2 7 FALSE
3 1 FALSE
3 2 TRUE
I'd like to create a column called counter in which for each unique ID the counter adds one to the next row until the Behavior = TRUE
I am expecting this result:
ID Date Behavior counter
1 1 FALSE 1
1 2 FALSE 2
1 3 TRUE 3
2 3 FALSE 1
2 5 FALSE 2
2 6 TRUE 3
2 7 FALSE
3 1 FALSE 1
3 2 TRUE 2
Ultimately, I would like to pull the minimum counter in which the observation occurs for each unique ID. However, I'm having trouble developing a solution for this current counter issue.
Any and all help is greatly appreciated!
I'd like to create a counter within each array of unique IDs and from there, ultimately pull the row level info - the question is how long on average does it take to reach a TRUE
I sense there might an XY problem going on here. You can answer your latter question directly, like so:
> library(plyr)
> mean(daply(d, .(ID), function(grp)min(which(grp$Behavior))))
[1] 2.666667
(where d is your data frame.)
Here's a dplyr solution that finds the row number for each TRUE in each ID:
library(dplyr)
newdf <- yourdataframe %>%
group_by(ID) %>%
summarise(
ftrue = which(Behavior))
do.call(rbind, by(df, list(df$ID), function(x) {n = nrow(x); data.frame(x, Counter = c(1:(m<-which(x$Behavior)), rep(NA, n-m)))}))
ID Date Behavior Counter
1.1 1 1 FALSE 1
1.2 1 2 FALSE 2
1.3 1 3 TRUE 3
2.4 2 3 FALSE 1
2.5 2 5 FALSE 2
2.6 2 6 TRUE 3
2.7 2 7 FALSE NA
3.8 3 1 FALSE 1
3.9 3 2 TRUE 2
df = read.table(text = "ID Date Behavior
1 1 FALSE
1 2 FALSE
1 3 TRUE
2 3 FALSE
2 5 FALSE
2 6 TRUE
2 7 FALSE
3 1 FALSE
3 2 TRUE", header = T)

Identifying unique duplicates in vector in R

I am trying to identify duplicates based of a match of elements in two vectors. Using duplicate() provides a vector of all matches, however I would like to index which are matches with each other or not. Using the following code as an example:
x <- c(1,6,4,6,4,4)
y <- c(3,2,5,2,5,5)
frame <- data.frame(x,y)
matches <- duplicated(frame) | duplicated(frame, fromLast = TRUE)
matches
[1] FALSE TRUE TRUE TRUE TRUE TRUE
Ultimately, I would like to create a vector that identifies elements 2 and 4 are matches as well as 3,5,6. Any thoughts are greatly appreciated.
Another data.table answer, using the group counter .GRP to assign every distinct element a label:
d <- data.table(frame)
d[,z := .GRP, by = list(x,y)]
# x y z
# 1: 1 3 1
# 2: 6 2 2
# 3: 4 5 3
# 4: 6 2 2
# 5: 4 5 3
# 6: 4 5 3
How about this with plyr::ddply()
ddply(cbind(index=1:nrow(frame),frame),.(x,y),summarise,count=length(index),elems=paste0(index,collapse=","))
x y count elems
1 1 3 1 1
2 4 5 3 3,5,6
3 6 2 2 2,4
NB = the expression cbind(index=1:nrow(frame),frame) just adds an element index to each row
Using merge against the unique possibilities for each row, you can get a result:
labls <- data.frame(unique(frame),num=1:nrow(unique(frame)))
result <- merge(transform(frame,row = 1:nrow(frame)),labls,by=c("x","y"))
result[order(result$row),]
# x y row num
#1 1 3 1 1
#5 6 2 2 2
#2 4 5 3 3
#6 6 2 4 2
#3 4 5 5 3
#4 4 5 6 3
The result$num vector gives the groups.

Resources