Cumulative count for a column using R - r

I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))

You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
data = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
data[-1] = ave(data[-1], data$id, FUN=cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is If the count is zero, don't do cumulative sum and leave at zero or ifelse(x == 0, 0, cumsum(x)) (as per #r2evans). However, this construct doesn't work when applied for the data.frame. A more complex helper function is required:
data[-1] = ave(data[-1], data$id, FUN=function(x){
y = cumsum(x)
y[x == 0] = 0
y
})
We can now compare it with the requested (renamed) data:
result = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2 = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
identical(data, result)

Base R,
ave(df$drug_2, df$id, FUN = function(z) ifelse(z == 0, z, cumsum(z)))
# [1] 0 1 2 3 4 0 5 0 0 1 0 2

Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with("drug"),
~ifelse(.x == 0, 0, cumsum(.x)))) %>%
ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2

Base R solution:
# Resolve the names of vectors we want to cumulatively sum:
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of vectors we want to keep:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
res <- setNames(
cbind(
df[,not_drug_vec_names],
replace(
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)

If you have binary values (1/0) in drug columns, you can multiply the cumulative sum with itself to get 0 for 0 values.
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('drug'), ~cumsum(.) * .)) %>%
ungroup
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2

Related

Create column conditioning the behavior of rows in the dataset

I would like to do something very specific. I have a vast set of data, which, in summary, looks more or less like this, with values 0, 1 and 2:
I need to create a situation variable so that it contains the value 0, 1 and 2.
The value 0 for cases that contain only 0's and 1's in the entire line.
The value 1 for the case where the value 2 appears, but at some point 1 appears before it.
The value 2 for the case where the value 2 appears, but at some point 0 appears before it.
So it's something close to:
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1), X2 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 2), X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0,
0), X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0), X5 = c(2, 1, 1, 0,
2, 1, 1, 0, 0, 0), X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0), X7 = c(2,
1, 1, 1, 2, 1, 1, 2, 0, 0), X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2,
0)), class = "data.frame", row.names = c(NA, 10L))
I wrote a score function and applied it over all the rows of your dataframe.
score <- function(x) {
a <- which(x == 2)
ifelse(length(a) > 0, ifelse(a[1] >=2, 2 - x[a[1] - 1], 1), 0)
}
df <- structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1),
X2 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 2),
X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0, 0),
X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0),
X5 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X7 = c(2, 1, 1, 1, 2, 1, 1, 2, 0, 0),
X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2, 0)),
class = "data.frame", row.names = c(NA, 10L))
df$situation <- sapply(1:nrow(df), function(i) score(as.numeric(df[i,])))
df
Here's a tidyverse approach.
I'll first concatenate all columns together, then use grepl() to look for 12 or 02.
library(tidyverse)
df %>% rowwise() %>%
mutate(concat = paste(c_across(everything()), collapse = "")) %>%
ungroup() %>%
mutate(situation = case_when(
!grepl(2, concat) ~ 0,
grepl("12", concat) ~ 1,
grepl("02", concat) ~ 2
)) %>%
select(-concat)
Output
# A tibble: 10 x 9
X1 X2 X3 X4 X5 X6 X7 X8 situation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 2 2 2 0 2
2 1 1 1 1 1 1 1 1 0
3 1 1 1 1 1 1 1 1 0
4 1 1 1 0 0 0 1 1 0
5 1 0 1 1 2 2 2 2 1
6 1 0 0 1 1 1 1 1 0
7 1 0 0 0 1 1 1 2 1
8 1 0 1 0 0 0 2 2 2
9 0 0 0 0 0 0 0 2 2
10 1 2 0 0 0 0 0 0 1
Note that this solution assumes that:
2 will not appear in the first column
1 or 2 in the situation is defined by the number immediately before 2 in your dataset
There will not be a case of 12 and 02 happening in the same row

R: Create a variable that uses value from previous row in a for loop

I am trying to create a variable that takes on a value for a variable (z) when two rows differ on another variable (x). So if row numbers 1 and 2 differs for x (starting on row #2), I would like z to take the value of 1, otherwise 0.
I have tried with different if and if-else sentences based on this question (For Loop that References the Previous Row in R), but it does not give me the desired results.
df <-
data.frame(
x = c(1, 1, 2, 0, 0, 0, 0, 1, 1, 2),
y = c(1, 1, 2, 0, 0, 0, 0, 1, 1, 2),
z = c(0, 1, 2, 0, 0, 0, 0, 1, 1, 2)
)
for (i in 2:length(df)) {
df$z <- ifelse(df$x[i] != df$x[i - 1], 1, 0)
}
for (i in 2:length(df)) {
if (df$x[i] != df$x[i - 1]) {
df$z == 1
} else{
df$z == 0
}
}
My expected results are:
df_expected <-
data.frame(
x = c(1, 1, 2, 0, 0, 0, 0, 1, 1, 2),
y = c(1, 1, 2, 0, 0, 0, 0, 1, 1, 2),
z = c(NA, 1, 1, 1, 0, 0, 0, 1, 0, 1)
)
Thanks a lot in advance!
Edit
If you need to use a for-loop, you could use
df$z <- 0
for (i in 2:nrow(df)) {
df[i, "z"] <- +(df[i, "x"] != df[i - 1, "x"])
}
The problem with your code is:
df$z == 1
doesn't assign anything, is a logical comparison.
You could use
library(dplyr)
df %>%
mutate(z = +(x != lag(x, default = first(x))))
This returns
x y z
1 1 1 0
2 1 1 0
3 2 2 1
4 0 0 1
5 0 0 0
6 0 0 0
7 0 0 0
8 1 1 1
9 1 1 0
10 2 2 1
Using data.table
library(data.table)
setDT(df)[, z := as.integer(x != shift(x, fill = first(x)))]

How to create multiple data frame from one data frame with multiple condition in R

I would like to create four data sets from the following given data frame by multiple conditions in x1 and x2
mydata=structure(list(y = c(-3, 24, 4, 5, 3, -3, -3, 24, 5, 4, 8, 7,
9, 2, 4, 8, 7, 3, 8, 12, 9, 10, 12, 11, 2),
x1 = c(0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
0, 1, 0, 1, 1, 0, 0, 1, 1, 1
),
x2 = c(1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 1, 1, 1, 0)), class = "data.frame",
row.names = c(NA,25L))
The first data set is mydata00 which is constructed with these conditions x1=0 and x2=0,
mydata00=filter(mydata, c(mydata$x1==0 & mydata$x2==0))
> mydata00
y x1 x2
1 -3 0 0
2 -3 0 0
3 8 0 0
4 3 0 0
5 9 0 0
Now, I need only the unique values of y and corresponding x1 and x2. Finally, I would like to sort y. So my final data set must look like
y x1 x2
1 -3 0 0
2 3 0 0
3 8 0 0
4 9 0 0
I would like to do the job for mydata11, mydata10, mydata01, where ,
mydata11=filter(mydata, c(mydata$x1==1 & mydata$x2==1))
mydata10=filter(mydata, c(mydata$x1==1 & mydata$x2==0))
mydata01=filter(mydata, c(mydata$x1==0 & mydata$x2==1))
Can I use any for loop or builtin functionn to create these data sets?
Any help is appreciated.
We can split the data based on unique values of x1 and x2 and get unique rows in each list after ordering it by y.
temp <- lapply(split(mydata, list(mydata$x1, mydata$x2)), function(x)
unique(x[order(x$y), ]))
temp
#$`0.0`
# y x1 x2
#6 -3 0 0
#18 3 0 0
#16 8 0 0
#21 9 0 0
#$`1.0`
# y x1 x2
#14 2 1 0
#5 3 1 0
#10 4 1 0
#4 5 1 0
#...
If we need data as a separate dataframe, we can name them appropriately and use list2env.
names(temp) <- paste0("mydata", names(temp))
list2env(temp, .GlobalEnv)
tidyverse way of doing this would be :
library(tidyverse)
mydata %>% group_split(x1, x2) %>% map(~.x %>% arrange(y) %>% distinct)

Counts of occurrences for column combinations

I have a dataset with three variables (X1,X2,X3) and these variables only take the value of 0 or 1.
The dataset is
dput(data)
structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0), .Dim = c(10L, 3L), .Dimnames = list(
NULL, c("x1", "x2", "x3")))
In the example above each row is an observation and each column is a variable.
I need to know the frequency of
(X1=1), (X2=1), (X3=1), (X1=1,X2=1), (X1=1,X3=1), (X2=1,X3=1), (X1=1,X2=1,X3=1)
I tried
table(rowSums(data !=0))
But this only give me the frequencies of one, two or three variables happens.
You could do:
comb <- sapply(1:3, combn, x = 3)
find <- function(colComb) sum(rowSums(data.frame(df[, colComb])) == length(colComb))
list <- sapply(comb, function(colComb) apply(colComb, 2, find))
names(list) <- sapply(comb, function(colComb) paste(apply(colComb, 2, paste, collapse = "&"), collapse = "|"))
$`1|2|3`
[1] 10 9 4
$`1&2|1&3|2&3`
[1] 9 4 3
$`1&2&3`
[1] 3
As suggested by user2957945 the short version:
lapply(1:3, function(x) combn(3, x, FUN=function(y) sum(Reduce("&", as.data.frame(df[,y])))))
You could use xtabs which is meant for three-way tables:
s <- structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0), .Dim = c(10L, 3L), .Dimnames = list(
NULL, c("x1", "x2", "x3")))
mytab <- xtabs(~x1+x2+x3, data = s)
mytab
, , x3 = 0
x2
x1 0 1
1 0 6
, , x3 = 1
x2
x1 0 1
1 1 3
If you would like it to look better, follow this up with ftable
ftable(mytab)
x3 0 1
x1 x2
1 0 0 1
1 6 3
Please note though, your example only has one value for x1.

Main diagonal into anti-diagonal

I need to transform main diagonal
{matrix(
1 1 1 1,
0 2 2 2,
0 0 3 3,
0 0 0 4)
}
into:
{matrix(
0 0 0 1,
0 0 1 2,
0 1 2 3,
1 2 3 4)
}
I tried all operators I could find t(), arev(), flipud(), apply(x,2,rev) and so on. Without a positive result. Hope you can help me.
Does this work for you? Takes each column and 'rotates' (for lack of a better word) x places, where x is the column index.
res <- sapply(1:ncol(input),function(x){
#get relevant column
base <- input[,x]
n <- length(base)
indices <- 1:n
#reshuffle indices: first above x, then below x
out <- base[c(indices[indices>x],indices[indices<=x])]
out
})
all(res==output)
[1] TRUE
data used:
input <- structure(c(1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 4), .Dim = c(4L,
4L))
output <- structure(c(0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4), .Dim = c(4L,
4L))

Resources