Create new column to identify the first value >0 across different columns - r

I have the following data frame
df <- data.frame(col1 = c(0,0,1,1),col2 = c(1,0,0,3),col2 = c(1,0,0,3))
How can I identify the the first value of each value which is greater than 0.
The expected output is like this
df <- data.frame(col1 = c(0,0,1,1),col2 = c(1,0,0,3),col3 = c(1,0,0,3),col4 = c(1,0,1,1))
And I have tried the followings
for (i in 1:3){
df$col4 <- apply(df[,c(0:i)],1,sum)
if (df$col4>0)
break
}

We can use max.col() for this.
df[cbind(1:nrow(df), max.col(df > 0, "first"))]
# [1] 1 0 1 1

df$col4 <- apply(df, 1, function(x) x[which(x>0)[1]])
df[is.na(df$col4),'col4'] <- 0

Here is another idea using mapply,
unlist(mapply(`[`, split(df, 1:nrow(df)), max.col(df>0, ties.method = 'first')))
#1.col2 2.col1 3.col1 4.col1
# 1 0 1 1
Depending on what you need by 'count backward', you can either change ties.method to 'last', i.e.
unlist(mapply(`[`, split(df, 1:nrow(df)), max.col(df>0, ties.method = 'last')))
#1.col2.1 2.col2.1 3.col1 4.col2.1
# 1 0 1 3
Or reverse the data frame and leave ties.method to 'first', i.e.
unlist(mapply(`[`, split(rev(df), 1:nrow(df)), max.col(df>0, ties.method = 'first')))
# 1.col2 2.col2.1 3.col2.1 4.col2.1
# 1 0 0 3

Related

Assign() to specific indices of vectors, vectors specified by string names

I'm trying to assign values to specific indices of a long list of vectors (in a loop), where each vector is specified by a string name. The naive approach
testVector1 <- c(0, 0, 0)
vectorName <- "testVector1"
indexOfInterest <- 3
assign(x = paste0(vectorName, "[", indexOfInterest, "]"), value = 1)
doesn't work, instead it creates a new vector "testVector1[3]" (the goal was to change the value of testVector1 to c(0, 0, 1)).
I know the problem is solvable by overwriting the whole vector:
temporaryVector <- get(x = vectorName)
temporaryVector[indexOfInterest] <- 1
assign(x = vectorName, value = temporaryVector)
but I was hoping for a more direct approach.
Is there some alternative to assign() that solves this?
Similarly, is there a way to assign values to specific elements of columns in data frames, where both the data frames and columns are specified by string names?
If you must do this you can do it with eval(parse():
valueToAssign <- 1
stringToParse <- paste0(
vectorName, "[", indexOfInterest, "] <- ", valueToAssign
)
eval(parse(text = stringToParse))
testVector1
# [1] 0 0 1
But this is not recommended. Better to put the desired objects in a named list, e.g.:
testVector1 <- c(0, 0, 0)
dat <- data.frame(a = 1:5, b = 2:6)
l <- list(
testVector1 = testVector1,
dat = dat
)
Then you can assign to them by name or index:
vectorName <- "testVector1"
indexOfInterest <- 3
dfName <- "dat"
colName <- "a"
rowNum <- 3
valueToAssign <- 1
l[[vectorName]][indexOfInterest] <- valueToAssign
l[[dfName]][rowNum, colName] <- valueToAssign
l
# $testVector1
# [1] 0 0 1
# $dat
# a b
# 1 1 2
# 2 2 3
# 3 1 4
# 4 4 5
# 5 5 6

Creating group ids by comparing values of two variables across rows: in R

I have a dataframe with two variables (start,end). would like to create an identifier variable which grows in ascending order of start and, most importantly, is kept constant if the value of start coincides with end of any other row in the dataframe.
Below is a simple example of the data
toy_data <- data.frame(start = c(1,5,6,10,16),
end = c(10,9,11,15,17))
The output I would be looking for is the following:
output_data <- data.frame(start = c(1,10,5,6,16),
end = c(10,15,9,11,17),
NEW_VAR = c(1,1,2,3,4))
You could try adapting this answer to group by ranges that are adjacent to each other. Credit goes entirely to #r2evans.
In this case, you would use expand.grid to get combinations of start and end. Instead of labels you would have row numbers rn to reference.
In the end, you can number the groups based on which rows appear together in the list. The last few lines starting with enframe use tibble/tidyverse. To match the group numbers I resorted the results too.
I hope this might be helpful.
library(tidyverse)
toy_data <- data.frame(start = c(1,5,6,10,16),
end = c(10,9,11,15,17))
toy_data$rn = 1:nrow(toy_data)
eg <- expand.grid(a = seq_len(nrow(toy_data)), b = seq_len(nrow(toy_data)))
eg <- eg[eg$a < eg$b,]
together <- cbind(
setNames(toy_data[eg$a,], paste0(names(toy_data), "1")),
setNames(toy_data[eg$b,], paste0(names(toy_data), "2"))
)
together <- subset(together, end1 == start2)
groups <- split(together$rn2, together$rn1)
for (i in toy_data$rn) {
ind <- (i == names(groups)) | sapply(groups, `%in%`, x = i)
vals <- groups[ind]
groups <- c(
setNames(list(unique(c(i, names(vals), unlist(vals)))), i),
groups[!ind]
)
}
min_row <- as.numeric(sapply(groups, min))
ctr <- seq_along(groups)
lapply(ctr[order(match(min_row, ctr))], \(x) toy_data[toy_data$rn %in% groups[[x]], ]) %>%
enframe() %>%
unnest(col = value) %>%
select(-rn)
Output
name start end
<int> <dbl> <dbl>
1 1 1 10
2 1 10 15
3 2 5 9
4 3 6 11
5 4 16 17
The following function should give you the desired identifier variable NEW_VAR.
identifier <- \(df) {
x <- array(0L, dim = nrow(df))
count <- 0L
my_seq <- seq_len(nrow(df))
for (i in my_seq) {
if(!df[i,]$start %in% df$end) {
x[i] <- my_seq[i] + count
} else {
x[i] <- my_seq[i]-1L + count
count <- count - 1L
}
}
x
}
Examples
# your example
toy_data <- data.frame(start = c(1,10,5,6,16),
end = c(10,15,9,11,17))
toy_data$NEW_VAR <- identifier(toy_data)
# ---------------------
> toy_data$NEW_VAR
[1] 1 1 2 3 4
# other example
toy_data <- data.frame(start = c(1, 2, 2, 4, 16, 21, 18, 3),
end = c(16, 2, 21, 2, 2, 2, 3, 1))
toy_data$NEW_VAR <- identifier(toy_data)
# ---------------------
> toy_data$NEW_VAR
[1] 0 0 0 1 1 1 2 2

Subset data.frame based on lag between two columns

Suppose you want to subset a data.frame where the rule for keeping rows is based
on a lag beteen rows 'a' and 'b':
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
#output
a b
1 1 0
2 0 1
3 0 1
4 1 0
5 0 1
6 0 1
Essentially, if 'a' = 1 you want to keep that row as well as the subsequent run of rows in
'b' that have a value of 1. This capture continues until the next row with a = 0 & b = 0.
I've tried using nested 'ifelse()' statements, but I am stuck incorporate logical tests based on a lag issue.
Suggestions?
This is how I would do it. There are probably options out there that require maybe 1 or 2 lines less.
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
library(dplyr)
df %>%
mutate(grp = cumsum(a==1|a+b==0)) %>%
group_by(grp) %>%
filter(any(a == 1)) %>%
ungroup() %>%
select(a, b)
A solution without dplyr. Work with a flag:
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# create new empty df
new_df <- read.table(text = "", col.names = c("a", "b"))
a_okay = FALSE # initialize the flag
for (row_number in seq(1:nrow(df))) { # loop over each row of the original df
# if a is 1, we add the row to the new df and set the flag to TRUE
if (df[row_number, "a"] == 1) {
a_okay = TRUE
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# now we consider the rows where a is not 1
else {
# if b is 1 and we are still following an a == 1: add the row
if (df[row_number, "b"] == 1 & a_okay) {
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# if b is 0, we reset the flag
else {
a_okay = FALSE
}
}
}
Another base solution inspired by this post, #Wietse de Vries's answer and #Ben's comment.
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# identify groups
df$grp <- cumsum(df$a == 1 | df$b == 0)
# subset df by groups with first element of a == 1
df <- do.call(rbind, split(df, df$grp)[by(df, df$grp, function(x) {x$a[1] == 1})])
# remove grp
df$grp <- NULL

Access data.table columns through vector indexes?

i'm stucked with a problem but i can find no satisfying answers on the web. I would like to valorize a data.frame(also a data.table it's good for me) using start:end vectors. An example will clarify what i'm asking.
Suppose i have a data.framelike the following:
df <- data.frame(col_1 = rep(0, 3), col_2 = rep(0, 3), col_3 = rep(0, 3), col_4 = rep(0,3))
df
col_1 col_2 col_3 col_4
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
And suppose i have two vectors:
indexesStart <- c(1, 2, 1)
indexesEnd <- c(2, 4, 3)
I would like to valorize to 1 all values in the range indicated by the vectors by row. The output should be the following:
col_1 col_2 col_3 col_4
1 1 1 0 0
2 0 1 1 1
3 1 1 1 0
I tried something like this:
df[ , indexesStart:indexesEnd] <- 1
But it doesn't work, it just takes indexesStart[1]:indexesEnd[1] and repeat it for all rows.
I must avoid loop cycles because my real data frame has millions rows and it is too slow. Any help is appreciated (a data.table solution would be even better)
Thank you
This will do it:
df <- data.frame(col_1=rep(0,3),col_2=rep(0,3),col_3=rep(0,3),col_4=rep(0,3))
indexesStart <- c(1, 2, 1)
indexesEnd <- c(2, 4, 3)
for (i in 1:nrow(df)) df[i, indexesStart[i]:indexesEnd[i]] <- 1
df
Here is another technique using a twocolumn matrix as index:
I <- do.call(rbind, lapply(1:length(indexesStart), function(i) cbind(i, indexesStart[i]:indexesEnd[i])))
df[I] <- 1
In the second variant I hided the loop (and the hidden loop is in another place).
Try this, it avoids any looping or lapply and is vectorized. This takes advantage of the fact that a data.frame is really a list.
impute <- function(lst, start, end){ lst[start:end] <- 1; lst }
fill <- function(df, start, end){
cols <- names(df)
lst <- as.list(as.data.frame(t(df)))
res <- as.data.frame(t(Vectorize(impute)(lst, start, end)))
names(res) <- names(df)
row.names(res) <- row.names(df)
res
}
res <- fill(df, indexesStart, indexesEnd)
Takes around 5 seconds to do 1 million rows on my MacBook Pro.

Find the Most Recent Matching in an Array [R]

Imagine an array of numbers called A. At each level of A, you want to find the most recent item with a matching value. You could easily do this with a for loop as follows:
A = c(1, 1, 2, 2, 1, 2, 2)
for(i in 1:length(A)){
if(i > 1 & sum(A[1:i-1] == A[i]) > 0){
answer[i] = max(which(A[1:i-1] == A[i]))
}else{
answer[i] = NA
}
}
But I want vectorize this for loop (because I'll be applying this principle on a very large data set). I tried using sapply:
answer = sapply(A, FUN = function(x){max(which(A == x))})
As you can see, I need some way of reducing the array to only values that come before x. Any advice?
We can use seq_along to loop over the index of each element and then subset it and get the max index where the value last occured.
c(NA, sapply(seq_along(A)[-1], function(x) max(which(A[1:(x-1)] == A[x]))))
#[1] NA 1 -Inf 3 2 4 6
We can change the -Inf to NA if needed in that format
inds <- c(NA, sapply(seq_along(A)[-1], function(x) max(which(A[1:(x-1)] == A[x]))))
inds[is.infinite(inds)] <- NA
inds
#[1] NA 1 NA 3 2 4 6
The above method gives a warning, to remove the warning we can perform an additional check of the length
c(NA, sapply(seq_along(A)[-1], function(x) {
inds <- which(A[1:(x-1)] == A[x])
if (length(inds) > 0)
max(inds)
else
NA
}))
#[1] NA 1 NA 3 2 4 6
Here's an approach with dplyr which is more verbose, but easier for me to grok. We start with recording the row_number, make a group for each number we encounter, then record the prior matching row.
library(dplyr)
A2 <- A %>%
as_tibble() %>%
mutate(row = row_number()) %>%
group_by(value) %>%
mutate(last_match = lag(row)) %>%
ungroup()
You can do:
sapply(seq_along(A)-1, function(x)ifelse(any(a<-A[x+1]==A[sequence(x)]),max(which(a)),NA))
[1] NA 1 NA 3 2 4 6
Here's a function that I made (based upon Ronak's answer):
lastMatch = function(A){
uniqueItems = unique(A)
firstInstances = sapply(uniqueItems, function(x){min(which(A == x))}) #for NA
notFirstInstances = setdiff(seq(A),firstInstances)
lastMatch_notFirstInstances = sapply(notFirstInstances, function(x) max(which(A[1:(x-1)] == A[x])))
X = array(0, dim = c(0, length(A)))
X[firstInstances] = NA
X[notFirstInstances] = lastMatch_notFirstInstances
return(X)
}

Resources