Create column conditioning the behavior of rows in the dataset - r

I would like to do something very specific. I have a vast set of data, which, in summary, looks more or less like this, with values 0, 1 and 2:
I need to create a situation variable so that it contains the value 0, 1 and 2.
The value 0 for cases that contain only 0's and 1's in the entire line.
The value 1 for the case where the value 2 appears, but at some point 1 appears before it.
The value 2 for the case where the value 2 appears, but at some point 0 appears before it.
So it's something close to:
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1), X2 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 2), X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0,
0), X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0), X5 = c(2, 1, 1, 0,
2, 1, 1, 0, 0, 0), X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0), X7 = c(2,
1, 1, 1, 2, 1, 1, 2, 0, 0), X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2,
0)), class = "data.frame", row.names = c(NA, 10L))

I wrote a score function and applied it over all the rows of your dataframe.
score <- function(x) {
a <- which(x == 2)
ifelse(length(a) > 0, ifelse(a[1] >=2, 2 - x[a[1] - 1], 1), 0)
}
df <- structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1),
X2 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 2),
X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0, 0),
X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0),
X5 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X7 = c(2, 1, 1, 1, 2, 1, 1, 2, 0, 0),
X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2, 0)),
class = "data.frame", row.names = c(NA, 10L))
df$situation <- sapply(1:nrow(df), function(i) score(as.numeric(df[i,])))
df

Here's a tidyverse approach.
I'll first concatenate all columns together, then use grepl() to look for 12 or 02.
library(tidyverse)
df %>% rowwise() %>%
mutate(concat = paste(c_across(everything()), collapse = "")) %>%
ungroup() %>%
mutate(situation = case_when(
!grepl(2, concat) ~ 0,
grepl("12", concat) ~ 1,
grepl("02", concat) ~ 2
)) %>%
select(-concat)
Output
# A tibble: 10 x 9
X1 X2 X3 X4 X5 X6 X7 X8 situation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 2 2 2 0 2
2 1 1 1 1 1 1 1 1 0
3 1 1 1 1 1 1 1 1 0
4 1 1 1 0 0 0 1 1 0
5 1 0 1 1 2 2 2 2 1
6 1 0 0 1 1 1 1 1 0
7 1 0 0 0 1 1 1 2 1
8 1 0 1 0 0 0 2 2 2
9 0 0 0 0 0 0 0 2 2
10 1 2 0 0 0 0 0 0 1
Note that this solution assumes that:
2 will not appear in the first column
1 or 2 in the situation is defined by the number immediately before 2 in your dataset
There will not be a case of 12 and 02 happening in the same row

Related

Create new columns using across() and if_else()

I have survey data that has a binary 1, 0 (indicating peak or off-peak) variable with the related peak or off-peak numbers in two separate columns.
structure(list(q9_jul_2019 = c(1, 0, 1, 0, 1, 0), q9_aug_2019 = c(1,
0, 1, 0, 1, 0), q9_sep_2019 = c(1, 0, 1, 0, 1, 0), q9_oct_2019 = c(0,
0, 1, 0, 1, 0), q9_nov_2019 = c(0, 0, 1, 0, 1, 0), q9_dec_2019 = c(0,
0, 1, 0, 0, 0), q9_jan_2020 = c(0, 0, 1, 0, 0, 0), q9_feb_2020 = c(0,
1, 0, 1, 0, 0), q9_mar_2020 = c(1, 1, 0, 1, 0, 0), q9_apr_2020 = c(1,
1, 1, 1, 0, 1), q9_may_2020 = c(0, 1, 0, 0, 0, 0), q9_jun_2020 = c(0,
0, 0, 0, 0, 0), q15 = c(1, 10, 30, 0, 2, 0), q22 = c(0, 10, 6,
0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")
I have created new monthly columns that have the associated visitation numbers in that column but I'm sure there must be a neater way to do it using across(). I haven't been able to make it work though, so at the moment I'm stuck at the following:
survey <- survey %>%
mutate(visitation_jul_19 = if_else(q9_jul_2019 == 1, q15, q22),
visitation_aug_19 = if_else(q9_aug_2019 == 1, q15, q22),
visitation_sep_19 = if_else(q9_sep_2019 == 1, q15, q22),
visitation_oct_19 = if_else(q9_oct_2019 == 1, q15, q22),
visitation_nov_19 = if_else(q9_nov_2019 == 1, q15, q22),
visitation_dec_19 = if_else(q9_dec_2019 == 1, q15, q22),
visitation_jan_20 = if_else(q9_jan_2020 == 1, q15, q22),
visitation_feb_20 = if_else(q9_feb_2020 == 1, q15, q22),
visitation_mar_20 = if_else(q9_mar_2020 == 1, q15, q22),
visitation_apr_20 = if_else(q9_apr_2020 == 1, q15, q22),
visitation_may_20 = if_else(q9_may_2020 == 1, q15, q22),
visitation_jun_20 = if_else(q9_jun_2020 == 1, q15, q22))
You may try
library(dplyr)
survey %>%
mutate(across(q9_jul_2019:q9_jun_2020, ~ ifelse(.x == 1, q15, q22)))
q9_jul_2019 q9_aug_2019 q9_sep_2019 q9_oct_2019 q9_nov_2019 q9_dec_2019 q9_jan_2020 q9_feb_2020 q9_mar_2020 q9_apr_2020
1 1 1 1 0 0 0 0 0 1 1
2 10 10 10 10 10 10 10 10 10 10
3 30 30 30 30 30 30 30 6 6 30
4 0 0 0 0 0 0 0 0 0 0
5 2 2 2 2 2 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0
q9_may_2020 q9_jun_2020 q15 q22
1 0 0 1 0
2 10 10 10 10
3 6 6 30 6
4 0 0 0 0
5 0 0 2 0
6 0 0 0 0

Cumulative count for a column using R

I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
data = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
data[-1] = ave(data[-1], data$id, FUN=cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is If the count is zero, don't do cumulative sum and leave at zero or ifelse(x == 0, 0, cumsum(x)) (as per #r2evans). However, this construct doesn't work when applied for the data.frame. A more complex helper function is required:
data[-1] = ave(data[-1], data$id, FUN=function(x){
y = cumsum(x)
y[x == 0] = 0
y
})
We can now compare it with the requested (renamed) data:
result = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2 = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
identical(data, result)
Base R,
ave(df$drug_2, df$id, FUN = function(z) ifelse(z == 0, z, cumsum(z)))
# [1] 0 1 2 3 4 0 5 0 0 1 0 2
Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with("drug"),
~ifelse(.x == 0, 0, cumsum(.x)))) %>%
ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2
Base R solution:
# Resolve the names of vectors we want to cumulatively sum:
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of vectors we want to keep:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
res <- setNames(
cbind(
df[,not_drug_vec_names],
replace(
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)
If you have binary values (1/0) in drug columns, you can multiply the cumulative sum with itself to get 0 for 0 values.
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('drug'), ~cumsum(.) * .)) %>%
ungroup
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2

Create a new variable based on other columns values

I have a paneldata dataframe structure, something like this:
df <- data.frame("id" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
"Status_2014" = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),
"Status_2015" = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0),
"Status_2016" = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
I want to generate a new dummy variable, that takes the value 1, if the rows contains 1 in any of the three columns or otherwise 0 if not. It should end up like this:
df <- data.frame("id" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
"Status_2014" = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),
"Status_2015" = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0),
"Status_2016" = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
"Final_status" = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0))
Can anyone help me achieve this?
We can use if_any on the columns that starts_with 'Status', to check for any 1 value in a row and it returns TRUE if there is one or else FALSE which is coerced to binary with as.integer/+
library(dplyr)
df %>%
mutate(Final_status = +(if_any(starts_with('Status'), ~ . ==1)))
-outptu
id Status_2014 Status_2015 Status_2016 Final_status
1 1 1 0 0 1
2 1 1 0 0 1
3 1 1 0 0 1
4 1 1 0 0 1
5 2 0 1 0 1
6 2 0 1 0 1
7 2 0 1 0 1
8 2 0 1 0 1
9 3 0 0 0 0
10 3 0 0 0 0
11 3 0 0 0 0
12 3 0 0 0 0
Or using rowSums from base R
df$Final_status <- +(rowSums(df[-1] > 0) > 0)
You write an if condition to define the variable as 1 or 0, and inside this condition the most straight forward ways would be a dplyr pipe.
I don't have the dplyr syntax in my head, to long not used, but dplyr is what you want.
https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
best greetings

How to create multiple data frame from one data frame with multiple condition in R

I would like to create four data sets from the following given data frame by multiple conditions in x1 and x2
mydata=structure(list(y = c(-3, 24, 4, 5, 3, -3, -3, 24, 5, 4, 8, 7,
9, 2, 4, 8, 7, 3, 8, 12, 9, 10, 12, 11, 2),
x1 = c(0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
0, 1, 0, 1, 1, 0, 0, 1, 1, 1
),
x2 = c(1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 1, 1, 1, 0)), class = "data.frame",
row.names = c(NA,25L))
The first data set is mydata00 which is constructed with these conditions x1=0 and x2=0,
mydata00=filter(mydata, c(mydata$x1==0 & mydata$x2==0))
> mydata00
y x1 x2
1 -3 0 0
2 -3 0 0
3 8 0 0
4 3 0 0
5 9 0 0
Now, I need only the unique values of y and corresponding x1 and x2. Finally, I would like to sort y. So my final data set must look like
y x1 x2
1 -3 0 0
2 3 0 0
3 8 0 0
4 9 0 0
I would like to do the job for mydata11, mydata10, mydata01, where ,
mydata11=filter(mydata, c(mydata$x1==1 & mydata$x2==1))
mydata10=filter(mydata, c(mydata$x1==1 & mydata$x2==0))
mydata01=filter(mydata, c(mydata$x1==0 & mydata$x2==1))
Can I use any for loop or builtin functionn to create these data sets?
Any help is appreciated.
We can split the data based on unique values of x1 and x2 and get unique rows in each list after ordering it by y.
temp <- lapply(split(mydata, list(mydata$x1, mydata$x2)), function(x)
unique(x[order(x$y), ]))
temp
#$`0.0`
# y x1 x2
#6 -3 0 0
#18 3 0 0
#16 8 0 0
#21 9 0 0
#$`1.0`
# y x1 x2
#14 2 1 0
#5 3 1 0
#10 4 1 0
#4 5 1 0
#...
If we need data as a separate dataframe, we can name them appropriately and use list2env.
names(temp) <- paste0("mydata", names(temp))
list2env(temp, .GlobalEnv)
tidyverse way of doing this would be :
library(tidyverse)
mydata %>% group_split(x1, x2) %>% map(~.x %>% arrange(y) %>% distinct)

Counts of occurrences for column combinations

I have a dataset with three variables (X1,X2,X3) and these variables only take the value of 0 or 1.
The dataset is
dput(data)
structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0), .Dim = c(10L, 3L), .Dimnames = list(
NULL, c("x1", "x2", "x3")))
In the example above each row is an observation and each column is a variable.
I need to know the frequency of
(X1=1), (X2=1), (X3=1), (X1=1,X2=1), (X1=1,X3=1), (X2=1,X3=1), (X1=1,X2=1,X3=1)
I tried
table(rowSums(data !=0))
But this only give me the frequencies of one, two or three variables happens.
You could do:
comb <- sapply(1:3, combn, x = 3)
find <- function(colComb) sum(rowSums(data.frame(df[, colComb])) == length(colComb))
list <- sapply(comb, function(colComb) apply(colComb, 2, find))
names(list) <- sapply(comb, function(colComb) paste(apply(colComb, 2, paste, collapse = "&"), collapse = "|"))
$`1|2|3`
[1] 10 9 4
$`1&2|1&3|2&3`
[1] 9 4 3
$`1&2&3`
[1] 3
As suggested by user2957945 the short version:
lapply(1:3, function(x) combn(3, x, FUN=function(y) sum(Reduce("&", as.data.frame(df[,y])))))
You could use xtabs which is meant for three-way tables:
s <- structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0), .Dim = c(10L, 3L), .Dimnames = list(
NULL, c("x1", "x2", "x3")))
mytab <- xtabs(~x1+x2+x3, data = s)
mytab
, , x3 = 0
x2
x1 0 1
1 0 6
, , x3 = 1
x2
x1 0 1
1 1 3
If you would like it to look better, follow this up with ftable
ftable(mytab)
x3 0 1
x1 x2
1 0 0 1
1 6 3
Please note though, your example only has one value for x1.

Resources