Related
I would like to do something very specific. I have a vast set of data, which, in summary, looks more or less like this, with values 0, 1 and 2:
I need to create a situation variable so that it contains the value 0, 1 and 2.
The value 0 for cases that contain only 0's and 1's in the entire line.
The value 1 for the case where the value 2 appears, but at some point 1 appears before it.
The value 2 for the case where the value 2 appears, but at some point 0 appears before it.
So it's something close to:
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1), X2 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 2), X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0,
0), X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0), X5 = c(2, 1, 1, 0,
2, 1, 1, 0, 0, 0), X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0), X7 = c(2,
1, 1, 1, 2, 1, 1, 2, 0, 0), X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2,
0)), class = "data.frame", row.names = c(NA, 10L))
I wrote a score function and applied it over all the rows of your dataframe.
score <- function(x) {
a <- which(x == 2)
ifelse(length(a) > 0, ifelse(a[1] >=2, 2 - x[a[1] - 1], 1), 0)
}
df <- structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1),
X2 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 2),
X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0, 0),
X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0),
X5 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X7 = c(2, 1, 1, 1, 2, 1, 1, 2, 0, 0),
X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2, 0)),
class = "data.frame", row.names = c(NA, 10L))
df$situation <- sapply(1:nrow(df), function(i) score(as.numeric(df[i,])))
df
Here's a tidyverse approach.
I'll first concatenate all columns together, then use grepl() to look for 12 or 02.
library(tidyverse)
df %>% rowwise() %>%
mutate(concat = paste(c_across(everything()), collapse = "")) %>%
ungroup() %>%
mutate(situation = case_when(
!grepl(2, concat) ~ 0,
grepl("12", concat) ~ 1,
grepl("02", concat) ~ 2
)) %>%
select(-concat)
Output
# A tibble: 10 x 9
X1 X2 X3 X4 X5 X6 X7 X8 situation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 2 2 2 0 2
2 1 1 1 1 1 1 1 1 0
3 1 1 1 1 1 1 1 1 0
4 1 1 1 0 0 0 1 1 0
5 1 0 1 1 2 2 2 2 1
6 1 0 0 1 1 1 1 1 0
7 1 0 0 0 1 1 1 2 1
8 1 0 1 0 0 0 2 2 2
9 0 0 0 0 0 0 0 2 2
10 1 2 0 0 0 0 0 0 1
Note that this solution assumes that:
2 will not appear in the first column
1 or 2 in the situation is defined by the number immediately before 2 in your dataset
There will not be a case of 12 and 02 happening in the same row
I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
data = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
data[-1] = ave(data[-1], data$id, FUN=cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is If the count is zero, don't do cumulative sum and leave at zero or ifelse(x == 0, 0, cumsum(x)) (as per #r2evans). However, this construct doesn't work when applied for the data.frame. A more complex helper function is required:
data[-1] = ave(data[-1], data$id, FUN=function(x){
y = cumsum(x)
y[x == 0] = 0
y
})
We can now compare it with the requested (renamed) data:
result = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2 = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
identical(data, result)
Base R,
ave(df$drug_2, df$id, FUN = function(z) ifelse(z == 0, z, cumsum(z)))
# [1] 0 1 2 3 4 0 5 0 0 1 0 2
Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with("drug"),
~ifelse(.x == 0, 0, cumsum(.x)))) %>%
ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2
Base R solution:
# Resolve the names of vectors we want to cumulatively sum:
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of vectors we want to keep:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
res <- setNames(
cbind(
df[,not_drug_vec_names],
replace(
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)
If you have binary values (1/0) in drug columns, you can multiply the cumulative sum with itself to get 0 for 0 values.
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('drug'), ~cumsum(.) * .)) %>%
ungroup
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2
I'm trying to compare to matrices. When the values aren't equivalent then I want to use the value from mat2 so long as it is greater than 0; if it is zero, then I want the value from mat1. As the code is currently, it appears to constantly return the value of mat1.
Here is my attempt:
mat.data1 <- c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1)
mat1 <- matrix(data = mat.data1, nrow = 5, ncol = 5, byrow = TRUE)
mat.data2 <- c(0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 0, 2, 1, 0, 1)
mat2 <- matrix(data = mat.data2, nrow = 5, ncol = 5, byrow = TRUE)
mat3 = if(mat1 == mat2){mat1} else {if(mat2>0){mat2} else {mat1}}
the expected output should be
1 0 1 1 1
0 1 2 1 1
1 1 2 2 0
1 1 1 2 2
1 1 1 0 1
Here is one potential way to do it.
mat.data1 <- c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1)
mat1 <- matrix(data = mat.data1, nrow = 5, ncol = 5, byrow = TRUE)
mat.data2 <- c(0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 0, 2, 1, 0, 1)
mat2 <- matrix(data = mat.data2, nrow = 5, ncol = 5, byrow = TRUE)
mat3 <- mat1
to_change <- which(mat2 != mat1 & mat2 > 0)
mat3[to_change] <- mat2[to_change]
This specific use of which essentially asks for the locations in mat2 that are not equal to that in mat1 AND where mat2 is greater than zero. You can then just do a subset and place those values in mat3.
This output is then:
> mat3
[,1] [,2] [,3] [,4] [,5]
[1,] 1 0 1 1 1
[2,] 0 1 2 1 1
[3,] 1 1 2 2 0
[4,] 1 1 1 2 2
[5,] 1 2 1 0 1
We can use coalesce
library(dplyr)
out <- coalesce(replace(mat2, !mat2, NA), replace(mat1, !mat1, NA))
replace(out, is.na(out), 0)
Or as #Axeman mentioned
coalesce(out, 0)
I wish to number the non-zero elements in a matrix by row. Here is a small data set and the desired result. I would prefer a solution in base R.
my.data <- matrix(c(10, 0, 0, 0, 0,
0, 3, 9, 0, 1,
2, 12, 0, 0, 0,
5, 5, 5, 0, 5,
0, 0, 0, 0, 0), nrow = 5, byrow = TRUE)
desired.result <- matrix(c( 1, 0, 0, 0, 0,
0, 1, 2, 0, 3,
1, 2, 0, 0, 0,
1, 2, 3, 0, 4,
0, 0, 0, 0, 0), nrow = 5, byrow = TRUE)
Another couple options:
# create new matrix with multiplication
t(apply(my.data != 0, 1, cumsum)) * (my.data != 0)
# alternative:
# replace elements in original matrix
my.data[my.data != 0] = t(apply(my.data != 0, 1, cumsum))[my.data != 0]
my.data
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 0 0 0 0
# [2,] 0 1 2 0 3
# [3,] 1 2 0 0 0
# [4,] 1 2 3 0 4
# [5,] 0 0 0 0 0
Here's a relatively naive base R method:
t(apply(my.data, 1, function(x) {
x[x != 0] <- seq_len(sum(x != 0))
x
}))
[,1] [,2] [,3] [,4] [,5]
[1,] 1 0 0 0 0
[2,] 0 1 2 0 3
[3,] 1 2 0 0 0
[4,] 1 2 3 0 4
[5,] 0 0 0 0 0
I have a data-frame which looks like this
a <- as.data.frame(c(1,0,0, 0,1,1,1,1,1,0,0))
I want to find the column indexes where for every row when it was 1 for the first time and last time
eg: for row a it is 2,10
You could do:
x <- df==1
rbind(max.col(x, "first"), max.col(x, "last"))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#[1,] 2 1 1 1 2 3 3 2 2 4
#[2,] 2 6 6 4 4 5 6 6 2 6
OR with apply:
apply(df, 1, function(x) c(min(which(x==1)),max(which(x==1))))
data
df <- structure(list(a = c(0, 1, 1, 1, 0, 0, 0, 0, 0, 0), b = c(1,
0, 0, 0, 1, 0, 0, 1, 1, 0), c = c(0, 1, 0, 1, 0, 1, 1, 1, 0,
0), d = c(0, 1, 0, 1, 1, 0, 1, 0, 0, 1), e = c(0, 0, 0, 0, 0,
1, 0, 1, 0, 1), f = c(0, 1, 1, 0, 0, 0, 1, 1, 0, 1)), .Names = c("a",
"b", "c", "d", "e", "f"), row.names = c(NA, -10L), class = "data.frame")