Include starting point in cumsum function - R

I have this data.frame:
a b
[1,] 1 0
[2,] 2 0
[3,] 3 0
[4,] 4 0
[5,] 5 0
[6,] 6 1
[7,] 7 2
[8,] 8 3
[9,] 9 4
[10,] 10 5
I want to apply cumsum to column a only where the corresponding value in column b is different from 0.
I tried the code below, but it does not respect that starting condition:
df_cumsum <- cbind(c(1:10), c(0,0,0,0,0,1,2,3,4,5),
                   as.data.frame(ave(A[,1], A[,2] != 0, FUN = cumsum)))
Unfortunately, I obtain a cumsum over the whole column:
a b c
1 1 0 1
2 2 0 3
3 3 0 6
4 4 0 10
5 5 0 15
6 6 1 6
7 7 2 13
8 8 3 21
9 9 4 30
10 10 5 40
I would like to obtain:
a b c
1 1 0 0
2 2 0 0
3 3 0 0
4 4 0 0
5 5 0 0
6 6 1 6
7 7 2 13
8 8 3 21
9 9 4 30
10 10 5 40
Thanks for your help!

Assuming the input is df as shown reproducibly in the Note at the end, try this. It zeros out any a value for which b is 0.
transform(df, cum = cumsum((b > 0) * a))
giving:
a b cum
1 1 0 0
2 2 0 0
3 3 0 0
4 4 0 0
5 5 0 0
6 6 1 6
7 7 2 13
8 8 3 21
9 9 4 30
10 10 5 40
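For comparison, the same idea written with dplyr (a sketch; it assumes dplyr is installed and df is defined as in the Note below):
library(dplyr)
df %>% mutate(cum = cumsum((b > 0) * a))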
Note
We assume this input shown in reproducible form:
Lines <- "
a b
1 0
2 0
3 0
4 0
5 0
6 1
7 2
8 3
9 4
10 5"
df <- read.table(text = Lines, header = TRUE)

It would be better to create a logical index and use it to update:
i1 <- df1$b > 0
df1$c[i1] <- with(df1, cumsum(a[i1]))
Or in a single line
df1$c <- with(df1, cumsum(a * (b > 0)))
df1$c
#[1] 0 0 0 0 0 6 13 21 30 40
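In the above, df1 is assumed to hold the question's data, e.g.:
df1 <- data.frame(a = 1:10, b = c(0,0,0,0,0,1,2,3,4,5))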

I really like how clean the other answers are using a * (b > 0), but that syntax can sometimes be a bit confusing for newer programmers. As an alternative, you can use the vectorized ifelse function.
df <- data.frame(a=c(1:10), b=c(0,0,0,0,0,1,2,3,4,5))
# One way
df$c <- cumsum(ifelse(df$b>0,df$a,0))
# Another way
df$d <- with(df,cumsum(ifelse(b>0,a,0)))
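Both forms compute the same running total, so the two new columns should agree (a quick sanity check):
identical(df$c, df$d)
# [1] TRUE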

Related

Count consecutive numbers

I have a time series with a value of 0 or 1 for each date. For example:
date value
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 1
So I want to count the consecutive 1's: for dates 3-5 the count should be 3, and the counting should start again at date 7. If the count for a run is below 6, its 1's should be transformed to 0's.
library(dplyr)
data.frame(
  date = 1:8,
  value = c(0,0,1,1,1,0,1,1)
) %>%
  mutate(
    count = rle(value) %>%
      {list(.$lengths * .$values, .$lengths)} %>%
      {rep(x = .[[1]], times = .[[2]])},
    count_1 = ifelse(count < 6, 0, count)
  )
gives:
date value count count_1
1 1 0 0 0
2 2 0 0 0
3 3 1 3 0
4 4 1 3 0
5 5 1 3 0
6 6 0 0 0
7 7 1 2 0
8 8 1 2 0
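The same run-length counter can also be written without the magrittr brace blocks, which some may find easier to read (a sketch of an equivalent formulation):
library(dplyr)
data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1)) %>%
  mutate(
    count   = with(rle(value), rep(lengths * values, lengths)),
    count_1 = ifelse(count < 6, 0, count)
  )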
I would first create a grouping variable and then use this to aggregate the dataset.
d = data.frame("date" = 1:12,
               "value" = c(1,1,0,0,1,1,1,1,0,0,1,0))
d$group = 1
for (i in 2:dim(d)[1]) {
  if (d$value[i] == d$value[i-1]) {
    d$group[i] = d$group[i-1]
  } else {
    d$group[i] = d$group[i-1] + 1
  }
}
nd = data.frame("Group" = unique(d$group),
                "Start" = aggregate(d$date ~ d$group, FUN = min)[,2],
                "End"   = aggregate(d$date ~ d$group, FUN = max)[,2],
                "Count" = aggregate(d$value ~ d$group, FUN = sum)[,2])
The output for this data would be:
> d ## Input data
date value
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 1
8 8 1
9 9 0
10 10 0
11 11 1
12 12 0
> nd ## All groups
Group Start End Count
1 1 1 2 2
2 2 3 4 0
3 3 5 8 4
4 4 9 10 0
5 5 11 11 1
6 6 12 12 0
> nd[nd$Count>0,] ## Just the groups with 1 in them:
Group Start End Count
1 1 1 2 2
3 3 5 8 4
5 5 11 11 1
Another solution, which produces output closer to what you expected:
d = data.frame("date"=1:20,"value"=c(1,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0))
repl <- rle(d$value)
rep_lengths <- rep(repl$lengths, repl$lengths)
rep_lengths[rep_lengths < 6] <- 0
d$value <- rep_lengths
returns
> d
date value
1 1 0
2 2 0
3 3 0
4 4 0
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 7
14 14 7
15 15 7
16 16 7
17 17 7
18 18 7
19 19 7
20 20 0
You can use rle to count the consecutive runs and ifelse to set those shorter than 6 to 0:
y <- rle(x$value)
y[[2]] <- y[[1]] * y[[2]]
y[[2]] <- ifelse(y[[2]] < 6, 0, y[[2]])
inverse.rle(y)
#[1] 0 0 0 0 0 0 0 0
Data:
x <- data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1))
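For a quick check on data where a run is long enough to survive, the same steps can be applied to a longer vector (a sketch with made-up data):
x2 <- data.frame(date = 1:10, value = c(1,1,1,1,1,1,0,1,1,0))
y2 <- rle(x2$value)
y2$values <- y2$lengths * y2$values          # length of each run of 1's, 0 elsewhere
y2$values <- ifelse(y2$values < 6, 0, y2$values)
inverse.rle(y2)
# [1] 6 6 6 6 6 6 0 0 0 0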

How to do calculation based on previous row results in dplyr

I'm performing some calculations where the result of a row is the input to the next.
I'm using a for loop, which is quite slow. Is there a way I can use dplyr for these types of calculations? Example below:
library(dplyr)
df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
                 sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
                 ship  = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
  mutate(receipts = 0) %>%
  mutate(ending_on_hand = 0) %>%
  mutate(receipts = lag(ship, 2)) %>%
  mutate(receipts = if_else(is.na(receipts), 0, receipts))
> dfb
beginning_on_hand sales ship receipts ending_on_hand
10 10 10 0 0
0 9 9 0 0
0 4 4 10 0
0 7 7 9 0
0 3 3 4 0
0 7 7 7 0
0 2 2 3 0
0 6 6 7 0
0 1 1 2 0
0 5 5 6 0
0 7 7 1 0
0 1 1 5 0
for (i in 1:(nrow(dfb) - 2)) {
  dfb$ending_on_hand[i] <- dfb$beginning_on_hand[i] + dfb$receipts[i] - dfb$sales[i]
  dfb$beginning_on_hand[i+1] <- dfb$ending_on_hand[i]
}
> dfb
beginning_on_hand sales ship receipts ending_on_hand
1 10 10 10 0 0
2 0 9 9 0 -9
3 -9 4 4 10 -3
4 -3 7 7 9 -1
5 -1 3 3 4 0
6 0 7 7 7 0
7 0 2 2 3 1
8 1 6 6 7 2
9 2 1 1 2 3
10 3 5 5 6 4
11 4 7 7 1 0
12 0 1 1 5 0
I don't have a dplyr solution for this, but here is a data.table one.
library(dplyr)
library(data.table)

df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
                 sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
                 ship  = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
  mutate(ending_on_hand = 0) %>%
  mutate(receipts = lag(ship, 2)) %>%
  mutate(receipts = if_else(is.na(receipts), 0, receipts))
dfb <- data.table(dfb)
df.end <- dfb[, ending_on_hand := cumsum(beginning_on_hand + receipts - sales)][,
              beginning_on_hand := beginning_on_hand + lag(ending_on_hand, default = 0)]
> df.end
beginning_on_hand sales ship ending_on_hand receipts
1: 10 10 10 0 0
2: 0 9 9 -9 0
3: -9 4 4 -3 10
4: -3 7 7 -1 9
5: -1 3 3 0 4
6: 0 7 7 0 7
7: 0 2 2 1 3
8: 1 6 6 2 7
9: 2 1 1 3 2
10: 3 5 5 4 6
11: 4 7 7 -2 1
12: -2 1 1 2 5
To explain: data.table stores the data as a list of columns and prints it in a flat, table-like form, and it uses concise, SQL-like instructions inside [ ] to organize and process the data. The two functions of note here are cumsum and lag: cumsum computes the running total of all values up to and including the current row, and lag shifts a column so that each row sees the value from the previous row.
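A tiny illustration of those two helpers on a short vector (a sketch, run outside the data.table call):
x <- c(2, 5, 1)
cumsum(x)                        # running total: 2 7 8
dplyr::lag(x, default = 0)       # value from the previous position: 0 2 5
data.table::shift(x, fill = 0)   # data.table's own equivalent of lag: 0 2 5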

Using R to filter special rows

I have a question which has bothered me for a long time. I have a data frame as below:
ll <- data.frame(id = 1:10,
                 A = c(rep(0,5),3,4,5,0,2),
                 B = c(1,4,2,0,3,0,3,24,0,0),
                 C = c(0,3,4,5,0,4,0,6,0,5),
                 D = c(0,1,2,0,42,4,0,3,8,0))
> ll
id A B C D
1 1 0 1 0 0
2 2 0 4 3 1
3 3 0 2 4 2
4 4 0 0 5 0
5 5 0 3 0 42
6 6 3 0 4 4
7 7 4 3 0 0
8 8 5 24 6 3
9 9 0 0 0 8
10 10 2 0 5 0
I want to filter out the rows which have more than one "0", such as:
id A B C D
1 1 0 1 0 0
I want the final output to be:
id A B C D
2 2 0 4 3 1
3 3 0 2 4 2
6 6 3 0 4 4
8 8 5 24 6 3
You can just use rowSums:
> ll[rowSums(ll == 0) <= 1, ]
id A B C D
2 2 0 4 3 1
3 3 0 2 4 2
6 6 3 0 4 4
8 8 5 24 6 3
If there are any columns that shouldn't be included, you can drop them in the rowSums step. For example, I assume "id" would not be included. If that's the case, then you can do:
ll[rowSums(ll[-1] == 0) <= 1, ]
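If you prefer dplyr, the same filter can be expressed with across() (a sketch, assuming dplyr >= 1.0 is available):
library(dplyr)
ll %>% filter(rowSums(across(-id) == 0) <= 1)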

cumulative counter in dataframe R

I have a dataframe with many rows, but the structure looks like this:
year factor
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 1
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 1
18 0
19 0
20 0
I need to add a counter as a third column. It should count the consecutive cells that contain zero, resetting to zero each time the value 1 is encountered. The result should look like this:
year factor count
1 0 0
2 0 1
3 0 2
4 0 3
5 0 4
6 0 5
7 0 6
8 0 7
9 1 0
10 0 1
11 0 2
12 0 3
13 0 4
14 0 5
15 0 6
16 0 7
17 1 0
18 0 1
19 0 2
20 0 3
I would like to do this in a fast way, avoiding loops, since I have to repeat the operation for hundreds of files.
You can reproduce my dataframe by pasting it in place of the "..." here:
dt <- read.table(text = "...", header = TRUE)
Perhaps a solution like this with ave would work for you:
A <- cumsum(dt$factor)
ave(A, A, FUN = seq_along) - 1
# [1] 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3
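To store the counter back in the data frame, the two lines above can be combined (a small sketch, assuming dt is the data read in with read.table):
A <- cumsum(dt$factor)
dt$count <- ave(A, A, FUN = seq_along) - 1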
Original answer:
(Missed that the first value was supposed to be "0". Oops.)
x <- rle(dt$factor == 1)
y <- sequence(x$lengths)
y[dt$factor == 1] <- 0
y
# [1] 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 0 1 2 3

assigning new values based on the location in the sequence

Working in R.
The data tracks changes in brain activity over time. Column "mark" contains information about when a particular treatment begins and ends. For example, the first condition (mark == 1) begins in row 3 and ends in row 6. The second experimental condition (mark == 2) starts in row 9 and ends in row 12. Another batch of treatment one is repeated between rows 15 and 18.
ob.id <- c(1:20)
mark <- c(0,0,1,0,0,1,0,0,2,0,0,2,0,0,1,0,0,1,0,0)
condition <- c(0,0,1,1,1,1,0,0,2,2,2,2,0,0,1,1,1,1,0,0)
start <- data.frame(ob.id, mark)
result <- data.frame(ob.id, mark, condition)
print (start)
> print (start)
ob.id mark
1 1 0
2 2 0
3 3 1
4 4 0
5 5 0
6 6 1
7 7 0
8 8 0
9 9 2
10 10 0
11 11 0
12 12 2
13 13 0
14 14 0
15 15 1
16 16 0
17 17 0
18 18 1
19 19 0
20 20 0
I need to create a column with a dummy variable indicating the membership of each observation in the corresponding experimental condition, like this:
> print(result)
ob.id mark condition
1 1 0 0
2 2 0 0
3 3 1 1
4 4 0 1
5 5 0 1
6 6 1 1
7 7 0 0
8 8 0 0
9 9 2 2
10 10 0 2
11 11 0 2
12 12 2 2
13 13 0 0
14 14 0 0
15 15 1 1
16 16 0 1
17 17 0 1
18 18 1 1
19 19 0 0
20 20 0 0
Thanks for your help!
This is a fun little problem. The trick I use below is to first compute the rle of the mark vector, which simplifies the problem: in the resulting values vector, between any two non-zero runs there is a single 0 that may or may not need to be replaced (depending on the values surrounding it).
# example vector with some edge cases
v = c(0,0,1,0,0,0,1,2,0,0,2,0,0,1,0,0,0,0,1,2,0,2)
v.rle = rle(v)
v.rle
#Run Length Encoding
# lengths: int [1:14] 2 1 3 1 1 2 1 2 1 4 ...
# values : num [1:14] 0 1 0 1 2 0 2 0 1 0 ...
vals = rle(v)$values
# find the 0's that need to be replaced and replace by the previous value
idx = which(tail(head(vals,-1),-1) == 0 & (head(vals,-2) == tail(vals,-2)))
vals[idx + 1] <- vals[idx]
# finally go back to the original vector
v.rle$values = vals
inverse.rle(v.rle)
# [1] 0 0 1 1 1 1 1 2 2 2 2 0 0 1 1 1 1 1 1 2 2 2
Probably the least cumbersome thing to do is to put the above in a function and then apply it to the relevant column of your data.frame (as opposed to manipulating the vector explicitly).
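For instance, a sketch of such a helper (fill_condition is a hypothetical name, not part of the original answer):
fill_condition <- function(v) {
  v.rle <- rle(v)
  vals  <- v.rle$values
  # interior zero runs whose neighbouring runs carry the same mark get that mark
  idx <- which(tail(head(vals, -1), -1) == 0 &
                 (head(vals, -2) == tail(vals, -2)))
  vals[idx + 1] <- vals[idx]
  v.rle$values <- vals
  inverse.rle(v.rle)
}
start$condition <- fill_condition(start$mark)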
Another approach, based on #SimonO101's observation, involves constructing the right groups from the starting data (run the by part separately, piece by piece, to see how it works):
library(data.table)
dt = data.table(start)
dt[, result := mark[1],
   by = {tmp = rep(0, length(mark));          # start from all zeros
         tmp[which(mark != 0)[c(F, T)]] = 1;  # flag every second non-zero mark (the closing marks)
         cumsum(mark != 0) - tmp}]            # group id stays constant from each opening mark to its closing mark
dt
# ob.id mark result
# 1: 1 0 0
# 2: 2 0 0
# 3: 3 1 1
# 4: 4 0 1
# 5: 5 0 1
# 6: 6 1 1
# 7: 7 0 0
# 8: 8 0 0
# 9: 9 2 2
#10: 10 0 2
#11: 11 0 2
#12: 12 2 2
#13: 13 0 0
#14: 14 0 0
#15: 15 1 1
#16: 16 0 1
#17: 17 0 1
#18: 18 1 1
#19: 19 0 0
#20: 20 0 0
The latter approach will probably be more flexible.
Here is one way I could think of doing it:
# Find where experiments stop and start
ind <- which(result$mark != 0)
# [1]  3  6  9 12 15 18
# Make a matrix of the start and stop indices taking odd and even elements of the vector
idx <- cbind(head(ind, -1)[1:length(ind) %% 2 == 1],
             tail(ind, -1)[1:length(ind) %% 2 == 1])
#      [,1] [,2]
# [1,]    3    6
# [2,]    9   12
# [3,]   15   18
Edit
I realised making the above index matrix would be easier with just taking odd and even elements:
idx <- cbind( ind[ 1:length(ind) %% 2 == 1 ] , ind[ 1:length(ind) %% 2 != 1 ] )
# Make vector of row indices to turn to 1's
ones <- as.vector( apply( idx , 1 , function(x) c( x[1]:x[2] ) ) )
# Make your new column and turn appropriate rows to 1
result$condition <- 0
result$condition[ ones ] <- 1
result
# ob.id mark condition
#1 1 0 0
#2 2 0 0
#3 3 1 1
#4 4 1 1
#5 5 1 1
#6 6 1 1
#7 7 0 0
#8 8 0 0
#9 9 1 1
#10 10 1 1
#11 11 1 1
#12 12 1 1
#13 13 0 0
#14 14 0 0
#15 15 1 1
#16 16 1 1
#17 17 1 1
#18 18 1 1
#19 19 0 0
#20 20 0 0
Edit
@eddi pointed out I needed to put in the value of the experiment, not just 1. So this is another strategy which uses (gasp!) a for loop. This will only really be detrimental if you have many thousands of experiments (remember to pre-allocate your results vector):
ind <- matrix(which(start$mark != 0), ncol = 2, byrow = TRUE)
ind <- cbind(ind, start$mark[ind[, 1]])
#      [,1] [,2] [,3]
# [1,]    3    6    1
# [2,]    9   12    2
# [3,]   15   18    1
res <- integer(nrow(start))
for (i in 1:nrow(ind)) {
  res[ind[i, 1]:ind[i, 2]] <- ind[i, 3]
}
[1] 0 0 1 1 1 1 0 0 2 2 2 2 0 0 1 1 1 1 0 0
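To attach the result as the condition column (a short follow-up sketch using res from the loop above):
start$condition <- res
head(start)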
