Row positions relative to a specific condition in R - r

I have a dataset with "Athletes" playing "Matches" ("Match"==1) on random "Dates". For example:
df <- data.frame(matrix(nrow = 80, ncol = 5))
colnames(df) <- c("Athlete", "Date", "Match", "DaysAfter", "DaysBefore")
df[,"Athlete"] <- c(rep(1, 20), rep(2,20), rep(3, 20), rep(4, 20))
df[,"Date"] <- rep(1:20, 4)
df[,"Match"] <- c(0,0,0,0,1,0,0,1,0,0)
I want to make two variables:
df$DaysAfter <- # number of days after last "Match" (for each "Athlete").
df$DaysBefore <- # number of days before next "Match" (for each "Athlete").
PS! When "Match" == 1, then "DaysAfter" and "DaysBefore" should be 0.
When there are no matches before in "DaysAfter" and after in "DaysBefore", show NA (see example).
I want the dataset to look like this:
Ath Dat Mat DA DB
1 1 0 NA -4
1 2 0 NA -3
1 3 0 NA -2
1 4 0 NA -1
1 5 1 0 0
1 6 0 1 -2
1 7 0 2 -1
1 8 1 0 0
1 9 0 1 -4
1 10 0 2 -3
1 11 0 3 -2
1 12 0 4 -1
1 13 1 0 0
1 14 0 1 -2
1 15 0 2 -1
1 16 1 0 0
1 17 0 1 NA
1 18 0 2 NA
1 19 0 3 NA
1 20 0 4 NA
2 1 0 NA -4
2 2 0 NA -3
etc.
How can I achieve this?

We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'Athlete' and another grouping variable created based on the position of 1 in 'Match' (cumsum(Match == 1)), we create two columns -
1) DA - As we need NA for all the elements until the first 1 in 'Match', create a logical condition with if/else so that all the elements that are 0 in 'Match' will be multiplied by 'NA' (NA* any number returns NA). As we did the grouping by cumsum, only the first group have all elements as 0, so that part got solved. The else condition gets the sequence of rows and subtract 1 from it (`.seq_len(.N)-1).
2) DB - We multiply the 'Match' with the number of rows (.N) and subtract from the reverse sequence (.N:1). Once we get this done, the last part involves creating NA for the elements in the column after the last 1 in 'Match'. Grouped by 'Athlete', we get the row index (.I) of the sequence from the last 1 in 'Match' (next element) to the number of rows (.N), and assign (:=) the 'DB' to NA based on that index.
library(data.table)
df1 <- setDT(df)[, c("DA", "DB") := list(if(all(!Match)) NA*Match else
seq_len(.N)-1,Match*(.N) -(.N:1)) , by = .(cumsum(Match==1), Athlete)]
df1[df1[, .I[(max(which(Match==1))+1):.N] , by = Athlete]$V1, DB:= NA][]
# Athlete Date Match DA DB
# 1: 1 1 0 NA -4
# 2: 1 2 0 NA -3
# 3: 1 3 0 NA -2
# 4: 1 4 0 NA -1
# 5: 1 5 1 0 0
# 6: 1 6 0 1 -2
# 7: 1 7 0 2 -1
# 8: 1 8 1 0 0
# 9: 1 9 0 1 -6
#10: 1 10 0 2 -5
#11: 1 11 0 3 -4
#12: 1 12 0 4 -3
#13: 1 13 0 5 -2
#14: 1 14 0 6 -1
#15: 1 15 1 0 0
#16: 1 16 0 1 -2
#17: 1 17 0 2 -1
#18: 1 18 1 0 0
#19: 1 19 0 1 NA
#20: 1 20 0 2 NA
#21: 2 1 0 NA -4
#22: 2 2 0 NA -3
#23: 2 3 0 NA -2
#24: 2 4 0 NA -1
#25: 2 5 1 0 0
#26: 2 6 0 1 -2
#27: 2 7 0 2 -1
#28: 2 8 1 0 0
#29: 2 9 0 1 -6
#30: 2 10 0 2 -5
#31: 2 11 0 3 -4
#32: 2 12 0 4 -3
#33: 2 13 0 5 -2
#34: 2 14 0 6 -1
#35: 2 15 1 0 0
#36: 2 16 0 1 -2
#37: 2 17 0 2 -1
#38: 2 18 1 0 0
#39: 2 19 0 1 NA
#40: 2 20 0 2 NA
#41: 3 1 0 NA -4
#42: 3 2 0 NA -3
#43: 3 3 0 NA -2
#44: 3 4 0 NA -1
#45: 3 5 1 0 0
#46: 3 6 0 1 -2
#47: 3 7 0 2 -1
#48: 3 8 1 0 0
#49: 3 9 0 1 -6
#50: 3 10 0 2 -5
#51: 3 11 0 3 -4
#52: 3 12 0 4 -3
#53: 3 13 0 5 -2
#54: 3 14 0 6 -1
#55: 3 15 1 0 0
#56: 3 16 0 1 -2
#57: 3 17 0 2 -1
#58: 3 18 1 0 0
#59: 3 19 0 1 NA
#60: 3 20 0 2 NA
#61: 4 1 0 NA -4
#62: 4 2 0 NA -3
#63: 4 3 0 NA -2
#64: 4 4 0 NA -1
#65: 4 5 1 0 0
#66: 4 6 0 1 -2
#67: 4 7 0 2 -1
#68: 4 8 1 0 0
#69: 4 9 0 1 -6
#70: 4 10 0 2 -5
#71: 4 11 0 3 -4
#72: 4 12 0 4 -3
#73: 4 13 0 5 -2
#74: 4 14 0 6 -1
#75: 4 15 1 0 0
#76: 4 16 0 1 -2
#77: 4 17 0 2 -1
#78: 4 18 1 0 0
#79: 4 19 0 1 NA
#80: 4 20 0 2 NA

This code should work:
unique_list<-(unique(df$Athlete))
for(k in (1:length(unique_list))){
index<-c(1:dim(df)[1])[df$Athlete==unique_list[k]]
count=NA
for(j in index){
if(df$Mat[j]==1){
count=0
}else{
count=count+1
}
df$DaysAfter[j]=count
}
count=NA
for(j in index[c(length(index):1)]){
if(df$Mat[j]==1){
count=0
}else{
count=count-1
}
df$DaysBefore[j]=count
}
}

I once wrote the following function:
cumsum.r <- function (vals, restart)
{
if (!is.vector(vals) || !is.vector(restart))
stop("expect vectors")
if (length(vals) != length(restart))
stop("different length")
len = length(vals)
restart[1] = T
ind = which(restart)
ind = rep(ind, c(ind[-1], len + 1) - ind)
vals.c = cumsum(vals)
vals.c - vals.c[ind] + vals[ind]
}
It performs cumsum, but starts from zero whenever restart=TRUE.
For "days after", you need
new.ath <- c(TRUE, df$Ath[-1]==df$Ath[-length(df$Ath)])
restart <- df$Math==1 | new.ath
days.after <- cumsum.r(1-restart, restart)
for days.before you need
rr <- rev(restart)
days.before <- -rev(cumsum.r(1-rr, rr))
(This does not put NAs, but you can use this cumsum.r for NAs too.)

Related

How can I create a new variable which identifies rows where another variable changes sign?

I have a question regarding data preparation. I have the following data set (in long format; one row per measurement point, therefore several rows per person):
dd <- read.table(text=
"ID time
1 -4
1 -3
1 -2
1 -1
1 0
1 1
2 -3
2 -1
2 2
2 3
2 4
3 -3
3 -2
3 -1
4 -1
4 1
4 2
4 3
5 0
5 1
5 2
5 3
5 4", header=TRUE)
Now I would like to create a new variable that has a 1 in the row, in which a sign change on the time variable happens for the first time for this person, and a 0 in all other rows. If a person has only negative values on time, the should not be any 1 on the new variable. For a person that has only positive values on time, the first row should have a 1 on the new variable and all other rows should be coded with 0. For my example above the new data frame should look like this:
dd <- read.table(text=
"ID time new.var
1 -4 0
1 -3 0
1 -2 0
1 -1 0
1 0 1
1 1 0
2 -3 0
2 -1 0
2 2 1
2 3 0
2 4 0
3 -3 0
3 -2 0
3 -1 0
4 -1 0
4 1 1
4 2 0
4 3 0
5 0 1
5 1 0
5 2 0
5 3 0
5 4 0", header=TRUE)
Does anyone know how to do this? I thought about using dplyr and group_by, however I am pretty new to R and did not make it. Any help is much appreciated!
There are 2 different operations you want done to create new.var, so you need to do them in 2 steps. I'll break this into 2 separate mutate calls for simplicity, but you can put both of them into the same mutate
First, we group by ID and then find the rows where the sign changes. We need to use time >= 0 instead of sign as recommended in this answer: R identifying a row prior to a change in sign because you want a sign change to be counted only when going from -1 <-> 0, not from 0 <-> 1:
library(tidyverse)
dd2 <- dd %>%
group_by(ID) %>%
mutate(new.var = as.numeric((time >= 0) != (lag(time) >= 0)))
dd2
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 NA
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 NA
8 2 -1 0
9 2 2 1
10 2 3 0
# … with 13 more rows
Then we use case_when to modify the first row based on your desired rules. Due to the way lag works, the first row will always have NA (since there is no previous row to look at) which makes it a good way to pick out that first row to change it based on the time values in that group:
dd3 <- dd2 %>%
mutate(new.var = case_when(
!is.na(new.var) ~ new.var,
all(time >= 0) ~ 1,
TRUE ~ 0)
)
print(dd3, n = 100) #n=100 because tibbles are truncated to 10 rows by print
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 0
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 0
8 2 -1 0
9 2 2 1
10 2 3 0
11 2 4 0
12 3 -3 0
13 3 -2 0
14 3 -1 0
15 4 -1 0
16 4 1 1
17 4 2 0
18 4 3 0
19 5 0 1
20 5 1 0
21 5 2 0
22 5 3 0
23 5 4 0
You can try this:
library(dplyr)
dd %>% left_join(dd %>% group_by(ID) %>% summarise(index=min(which(time>=0)))) %>%
group_by(ID) %>% mutate(new.var=ifelse(row_number(ID)==index,1,0)) %>% select(-index)-> DF
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 0
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 0
8 2 -1 0
9 2 2 1
10 2 3 0
The following ave instruction does what the question asks for.
dd$new.var <- with(dd, ave(time, ID, FUN = function(x){
y <- integer(length(x))
if(any(x >= 0)) y[which.max(x[1]*x <= 0)] <- 1L
y
}))
dd
# ID time new.var
#1 1 -4 0
#2 1 -3 0
#3 1 -2 0
#4 1 -1 0
#5 1 0 1
#6 1 1 0
#7 2 -3 0
#8 2 -1 0
#9 2 2 1
#10 2 3 0
#11 2 4 0
#12 3 -3 0
#13 3 -2 0
#14 3 -1 0
#15 4 -1 0
#16 4 1 1
#17 4 2 0
#18 4 3 0
#19 5 0 1
#20 5 1 0
#21 5 2 0
#22 5 3 0
#23 5 4 0
If the expected output is renamed dd2 then
identical(dd, dd2)
#[1] TRUE

formatting table/matrix in R

I am trying to use a package where the table they've used is in a certain format, I am very new to R and don't know how to get my data in this same format to be able to use the package.
Their table looks like this:
Recipient
Actor 1 10 11 12 2 3 4 5 6 7 8 9
1 0 0 0 1 3 1 1 2 3 0 2 6
10 1 0 0 1 0 0 0 0 0 0 0 0
11 13 5 0 5 3 8 0 1 3 2 2 9
12 0 0 2 0 1 1 1 3 1 1 3 0
2 0 0 2 0 0 1 0 0 0 2 2 1
3 9 9 0 5 16 0 2 8 21 45 13 6
4 21 28 64 22 40 79 0 16 53 76 43 38
5 2 0 0 0 0 0 1 0 3 0 0 1
6 11 22 4 21 13 9 2 3 0 4 39 8
7 5 32 11 9 16 1 0 4 33 0 17 22
8 4 0 2 0 1 11 0 0 0 1 0 1
9 0 0 3 1 0 0 1 0 0 0 0 0
Where mine at the moment is:
X0 X1 X2 X3 X4 X5
0 0 2 3 3 0 0
1 1 0 4 2 0 0
2 0 0 0 0 0 0
3 0 2 2 0 1 0
4 0 0 3 2 0 2
5 0 0 3 3 1 0
I would like to add the recipient and actor to mine, as well as change to row and column names to 1, ..., 6.
Also my data is listed under Data in my Workspace and it says:
'num' [1:6,1:6] 0 1 ...
Whereas the example data in the workspace is shown in Values as:
'table' num [1:12,1:12] 0 1 13 ...
Please let me know if you have suggestion to get my data in the same type and style as theirs, all help is greatly appreciated!
OK, so you have a matrix like so:
m <- matrix(c(1:9), 3)
rownames(m) <- 0:2
colnames(m) <- paste0("X", 0:2)
# X0 X1 X2
#0 1 4 7
#1 2 5 8
#2 3 6 9
First you need to remove the Xs and turn it into a table:
colnames(m) <- sub("X", "", colnames(m))
m <- as.table(m)
# 0 1 2
#0 1 4 7
#1 2 5 8
#2 3 6 9
Then you can set the dimension names:
names(dimnames(m)) <- c("Actor", "Recipient")
# Recipient
#Actor 0 1 2
# 0 1 4 7
# 1 2 5 8
# 2 3 6 9
However, usually you would create the contingency table from raw data using the table function, which would automatically return a table object. So, maybe you should fix the step creating your matrix?

Cumulative count of blocks of 1 with 0 separators in a binary vector in R

I have a data frame with a binary vector that I want to do a cumulative count of. However I would like to count the 'groups of 1's' rather than each individual 1 and create a new vector of this count while retaining the 0 separating values.
i.e.
df1 <- data.frame(c(0,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,1,1,1)
n bin
1 0
2 1
3 1
4 1
5 1
6 0
7 0
8 0
9 1
10 1
11 1
12 1
13 1
14 0
15 0
16 0
17 1
18 1
19 1
becomes
n bin cumul
1 0 0
2 1 1
3 1 1
4 1 1
5 1 1
6 0 0
7 0 0
8 0 0
9 1 2
10 1 2
11 1 2
12 1 2
13 1 2
14 0 0
15 0 0
16 0 0
17 1 3
18 1 3
19 1 3
how do I go about this?
You can use the rleid function from package data.table:
df1 <- data.frame(bin = c(0,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,1,1,1))
library(data.table)
setDT(df1)
df1[, cumul := rleid(bin)]
df1[bin == 0, cumul := 0]
df1[bin == 1, cumul := rleid(cumul)]
# bin cumul
# 1: 0 0
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 1 1
# 6: 0 0
# 7: 0 0
# 8: 0 0
# 9: 1 2
#10: 1 2
#11: 1 2
#12: 1 2
#13: 1 2
#14: 0 0
#15: 0 0
#16: 0 0
#17: 1 3
#18: 1 3
#19: 1 3
Although somehow manual:
l <- rle(df1$c1)$lengths
v <- rle(df1$c1)$values
v2 <- cumsum(v)
v2[duplicated(v2)] <- 0
df1$cumul <- rep(v2, times = l)
df1
c1 cumul
1 0 0
2 1 1
3 1 1
4 1 1
5 1 1
6 0 0
7 0 0
8 0 0
9 1 2
10 1 2
11 1 2
12 1 2
13 1 2
14 0 0
15 0 0
16 0 0
17 1 3
18 1 3
19 1 3
Yet another
x<-c(0,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,1,1,1)
d<-cumsum(diff(c(0,x))>0)
d[x==0]<-0
cbind(x,d)
x d
[1,] 0 0
[2,] 1 1
[3,] 1 1
[4,] 1 1
[5,] 1 1
[6,] 0 0
[7,] 0 0
[8,] 0 0
[9,] 1 2
[10,] 1 2
[11,] 1 2
[12,] 1 2
[13,] 1 2
[14,] 0 0
[15,] 0 0
[16,] 0 0
[17,] 1 3
[18,] 1 3
[19,] 1 3

tagging windows around events within data.frame

I have a data.frame with a factor identifying events
year event
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 1
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 1
18 0
19 0
20 0
And I would need a counter-type identifying a given window around the events. The result should look like this (for a window that is, for example, 3 periods around the event):
year event window
1 0
2 0
3 0
4 0
5 0
6 0 -3
7 0 -2
8 0 -1
9 1 0
10 0 1
11 0 2
12 0 3
13 0
14 0 -3
15 0 -2
16 0 -1
17 1 0
18 0 1
19 0 2
20 0 3
Any guidance on how to implement this within a function would be appreciated. You can copy the data. frame, pasting the block above in "..." here:
dt <- read.table( text="...", , header = TRUE )
Assuming there is no overlapping, you can use on of my favourite base functions, filter:
DF <- read.table(text="year event
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 1
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 1
18 0
19 0
20 0", header=TRUE)
DF$window <- head(filter(c(rep(0, 3), DF$event, rep(0, 3)),
filter=-3:3)[-(1:3)], -3)
DF$window[DF$window == 0 & DF$event==0] <- NA
# year event window
# 1 1 0 NA
# 2 2 0 NA
# 3 3 0 NA
# 4 4 0 NA
# 5 5 0 NA
# 6 6 0 -3
# 7 7 0 -2
# 8 8 0 -1
# 9 9 1 0
# 10 10 0 1
# 11 11 0 2
# 12 12 0 3
# 13 13 0 NA
# 14 14 0 -3
# 15 15 0 -2
# 16 16 0 -1
# 17 17 1 0
# 18 18 0 1
# 19 19 0 2
# 20 20 0 3

assigning new values based on the location in the sequence

Working in R.
The data tracks changes in brain activity over time. Column "mark" contains information when a particular treatment begins and ends. For examples, the first condition (mark==1) begins in row 3 and ends in row 6. The second experimental condition (mark==2) starts in row 9 and ends in 12. Another batch of treatment one is repeated between rows 15 and 18.
ob.id <- c(1:20)
mark <- c(0,0,1,0,0,1,0,0,2,0,0,2,0,0,1,0,0,1,0,0)
condition<-c(0,0,1,1,1,1,0,0,2,2,2,2,0,0,1, 1,1,1,0,0)
start <- data.frame(ob.id,mark)
result<-data.frame(ob.id,mark,condition)
print (start)
> print (start)
ob.id mark
1 1 0
2 2 0
3 3 1
4 4 0
5 5 0
6 6 1
7 7 0
8 8 0
9 9 2
10 10 0
11 11 0
12 12 2
13 13 0
14 14 0
15 15 1
16 16 0
17 17 0
18 18 1
19 19 0
20 20 0
I need to create a column that would have a dummy variable indicating the membership of an observation in corresponding experimental condition, like this:
> print(result)
ob.id mark condition
1 1 0 0
2 2 0 0
3 3 1 1
4 4 0 1
5 5 0 1
6 6 1 1
7 7 0 0
8 8 0 0
9 9 2 2
10 10 0 2
11 11 0 2
12 12 2 2
13 13 0 0
14 14 0 0
15 15 1 1
16 16 0 1
17 17 0 1
18 18 1 1
19 19 0 0
20 20 0 0
Thanks for your help!
This is a fun little problem. The trick I use below is to first calculate the rle of the mark vector, which makes the problem simpler, as the resulting values vector will always have just one 0 that may or may not need to be replaced (depending on the surrounding values).
# example vector with some edge cases
v = c(0,0,1,0,0,0,1,2,0,0,2,0,0,1,0,0,0,0,1,2,0,2)
v.rle = rle(v)
v.rle
#Run Length Encoding
# lengths: int [1:14] 2 1 3 1 1 2 1 2 1 4 ...
# values : num [1:14] 0 1 0 1 2 0 2 0 1 0 ...
vals = rle(v)$values
# find the 0's that need to be replaced and replace by the previous value
idx = which(tail(head(vals,-1),-1) == 0 & (head(vals,-2) == tail(vals,-2)))
vals[idx + 1] <- vals[idx]
# finally go back to the original vector
v.rle$values = vals
inverse.rle(v.rle)
# [1] 0 0 1 1 1 1 1 2 2 2 2 0 0 1 1 1 1 1 1 2 2 2
Probably the least cumbersome thing to do is to put the above in a function and then apply that to your data.frame vector (as opposed to manipulating the vector explicitly).
Another approach, based on #SimonO101's observation, involves constructing the right groups from the starting data (run the by part separately, piece by piece, to see how it works):
library(data.table)
dt = data.table(start)
dt[, result := mark[1],
by = {tmp = rep(0, length(mark));
tmp[which(mark != 0)[c(F,T)]] = 1;
cumsum(mark != 0) - tmp}]
dt
# ob.id mark result
# 1: 1 0 0
# 2: 2 0 0
# 3: 3 1 1
# 4: 4 0 1
# 5: 5 0 1
# 6: 6 1 1
# 7: 7 0 0
# 8: 8 0 0
# 9: 9 2 2
#10: 10 0 2
#11: 11 0 2
#12: 12 2 2
#13: 13 0 0
#14: 14 0 0
#15: 15 1 1
#16: 16 0 1
#17: 17 0 1
#18: 18 1 1
#19: 19 0 0
#20: 20 0 0
The latter approach will probably be more flexible.
Here is one way I could think of doing it:
# Find where experiments stop and start
ind <- which( result$mark != 0 )
[1] 3 6 9 12 15 18
# Make a matrix of the start and stop indices taking odd and even elements of the vector
idx <- cbind( head(ind , -1)[ 1:length(ind) %% 2 == 1 ] ,tail( ind , -1)[ 1:length(ind) %% 2 == 1 ] )
[,1] [,2]
[1,] 3 6
[2,] 9 12
[3,] 15 18
edit
I realised making the above index matrix would be easier with just taking odd and even elements:
idx <- cbind( ind[ 1:length(ind) %% 2 == 1 ] , ind[ 1:length(ind) %% 2 != 1 ] )
# Make vector of row indices to turn to 1's
ones <- as.vector( apply( idx , 1 , function(x) c( x[1]:x[2] ) ) )
# Make your new column and turn appropriate rows to 1
result$condition <- 0
result$condition[ ones ] <- 1
result
# ob.id mark condition
#1 1 0 0
#2 2 0 0
#3 3 1 1
#4 4 1 1
#5 5 1 1
#6 6 1 1
#7 7 0 0
#8 8 0 0
#9 9 1 1
#10 10 1 1
#11 11 1 1
#12 12 1 1
#13 13 0 0
#14 14 0 0
#15 15 1 1
#16 16 1 1
#17 17 1 1
#18 18 1 1
#19 19 0 0
#20 20 0 0
Edit
#eddi pointed out I needed to put the value of the experiment in, not just one. So this is another strategy which uses gasp(!) a for loop. This will only be really detrimental if you have millions thousands of experiments (remember to pre-allocate your results vector):
ind <- matrix( which( start$mark != 0 ) , ncol = 2 , byrow = TRUE )
ind <- cbind( ind , start$mark[ ind[ , 1 ] ] )
# [,1] [,2] [,3]
#[1,] 3 6 1
#[2,] 9 12 2
#[3,] 15 18 1
res <- integer( nrow( start ) )
for( i in 1:nrow(ind) ){
res[ ind[i,1]:ind[i,2] ] <- ind[i,3]
}
[1] 0 0 1 1 1 1 0 0 2 2 2 2 0 0 1 1 1 1 0 0

Resources