Rowwise name of column where first non-zero value appears - r

I've got a bunch of columns all starting with the prefix wtp_ that occur in the midst of a wide dataframe (with several columns before and after the wtp_ columns). Mini example:
df <- tribble(~id, ~complete, ~wtp_20,~wtp_40,~wtp_60,~wtp_80,~wtp_100, ~sex,
1, 1, 0,0,1,1,1, "F",
2, 0, 0,0,0,1,1, "F",
3, 0, 0,0,0,0,1, "M",
4, 1, 1,1,1,1,1, "M",
5, 1, 0,0,0,0,0, "M",
6, 0, 0,1,1,1,1, "F"); df
What I'm looking for: I need to create a new variable (min_wtp) that returns the name of the column the first time that one of the wtp_ columns switches from 0 to 1. In other words, I need a solution to create the following:
df_needed <- tribble(~id, ~complete, ~wtp_20,~wtp_40,~wtp_60,~wtp_80,~wtp_100, ~sex, ~min_wtp,
1, 1, 0,0,1,1,1, "F", "wtp_60",
2, 0, 0,0,0,1,1, "F", "wtp_80",
3, 0, 0,0,0,0,1, "M", "wtp_100",
4, 1, 1,1,1,1,1, "M", "wtp_20",
5, 1, 0,0,0,0,0, "M", "NA",
6, 0, 0,1,1,1,1, "F", "wtp_40"); df_needed
Please note the following complications:
-Some people (like id==5) never change to 1 while others (like id==4) are 1 all along.
-There are some irrelevant columns occurring before the wtp_ columns that have 0s and 1s in them which should be ignored in the construction of min_wtp.
-There are way more columns (including wtp_ columns) than the minimal example I included above.
I've tried playing with which and colnames functions in combination with select(starts_with("wtp_")) but have been unsuccessful.
If anyone has a dplyr solution, that would be preferred.

We can use apply to get, for each row, the index of the first column that satisfies your conditions. Then we use that index to retrieve the column name.
# For each row of the wtp_ columns, return the name of the first column
# holding a 1; NA for rows that are all zero (e.g. id == 5).
# The anchored pattern "^wtp_" avoids picking up unrelated columns whose
# names merely contain "wtp" somewhere.
df$min_wtp <- apply(df[, grepl("^wtp_", names(df))], 1, function(x) {
  # which(x > 0)[1] is NA_integer_ when nothing matches, so names(x)[...]
  # becomes NA directly -- unlike min(which(x > 0)), which returns Inf and
  # emits a "no non-missing arguments to min" warning for all-zero rows.
  names(x)[which(x > 0)[1]]
})
df
id complete wtp_20 wtp_40 wtp_60 wtp_80 wtp_100 sex min_wtp
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1 1 0 0 1 1 1 F wtp_60
2 2 0 0 0 0 1 1 F wtp_80
3 3 0 0 0 0 0 1 M wtp_100
4 4 1 1 1 1 1 1 M wtp_20
5 5 1 0 0 0 0 0 M NA
6 6 0 0 1 1 1 1 F wtp_40

It would be much easier if you get the data in long format :
library(dplyr)
# Reshape the wtp_ columns to long format so the first 0 -> 1 switch can
# be located with ordinary vector functions, then join the answer back.
df %>%
# one row per (id, wtp_ column); creates the columns "name" and "value"
tidyr::pivot_longer(cols = starts_with('wtp')) %>%
group_by(id) %>%
# first position where value is 1 and the previous value (default 0 at the
# start) was 0; [1] yields NA for ids that never switch (e.g. id == 5)
summarise(min_wtp = name[which(value == 1 &
lag(value, default = 0) == 0)[1]]) %>%
# attach min_wtp onto the original wide data frame
left_join(df, by = 'id')
# A tibble: 6 x 9
# id min_wtp complete wtp_20 wtp_40 wtp_60 wtp_80 wtp_100 sex
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 wtp_60 1 0 0 1 1 1 F
#2 2 wtp_80 0 0 0 0 1 1 F
#3 3 wtp_100 0 0 0 0 0 1 M
#4 4 wtp_20 1 1 1 1 1 1 M
#5 5 NA 1 0 0 0 0 0 M
#6 6 wtp_40 0 0 1 1 1 1 F
Without reshaping the data you can use rowwise with c_across :
apply_fun <- function(x) {
  # Position of the first 1 that is preceded by a 0 (the vector is treated
  # as if a 0 came before its first element); NA when no switch occurs.
  switched <- x == 1 & lag(x, default = 0) == 0
  which(switched)[1]
}
# Names of the wtp_ columns, in their left-to-right order
cols <- grep('^wtp', names(df), value = TRUE)
df %>%
  rowwise() %>%
  # all_of(): passing a bare character vector straight into tidyselect
  # (c_across(cols)) is deprecated in recent dplyr; all_of() is explicit
  # and warning-free. apply_fun() returns the index of the first 0 -> 1
  # switch (NA when none), which indexes into cols.
  mutate(min_wtp = cols[apply_fun(c_across(all_of(cols)))]) %>%
  ungroup()  # drop the rowwise grouping once the column is built

If it never goes backwards from 1 to 0, then you can find the change point very quickly with some basic sums:
# Assumes each row is monotone (never switches back from 1 to 0).
is_wtp <- startsWith(names(df), "wtp_")
# A row containing k ones must have its first 1 at position n_wtp - k + 1;
# an all-zero row indexes one past the last wtp_ name and so returns NA.
names(df)[is_wtp][sum(is_wtp) - rowSums(df[is_wtp]) + 1]
#[1] "wtp_60" "wtp_80" "wtp_100" "wtp_20" NA "wtp_40"

Related

"N-1" Cumulative Averages

I have the following data:
library(dplyr)
my_data = data.frame(patient_id = c(1,1,1,1, 2,2,2),
age = c(43, 43, 44, 44, 21, 21, 21),
gender = c("M", "M", "M", "M", "F", "F", "F"),
appointment_number = c(1,2,3,4,1,2,3),
missed = c(0, 0, 1, 1, 1, 1, 1))
My Question: Grouped by each ID, I want to create two variables:
The first variable takes the value of the previous appointment value
The second variable takes the "n-1" cumulative average of the previous appointment values (e.g. If patient_id = 1 has 8 rows, the cumulative average at this row would be the cumulative average of the first 7 rows)
Here is my attempt to do this:
my_data_final <- my_data %>%
group_by(patient_id) %>%
mutate(cummean = cumsum(missed)/(row_number() - 1)) %>%
mutate(previous_apt = lag(missed))
This results in the cummean variable being greater than 1, even though the variable in question can only be 1 or 0:
# A tibble: 7 x 7
# Groups: patient_id [2]
patient_id age gender appointment_number missed cummean previous_apt
<dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 43 M 1 0 NaN NA
2 1 43 M 2 0 0 0
3 1 44 M 3 1 0.5 0
4 1 44 M 4 1 0.667 1
5 2 21 F 1 1 Inf NA
6 2 21 F 2 1 2 1
7 2 21 F 3 1 1.5 1
Can someone please show me how to fix this?
Thanks!
Note: I tried to resolve this - is this correct?
my_data %>%
group_by(patient_id) %>%
mutate(previous_apt = lag(missed)) %>%
mutate(cummean = (cumsum(missed) - missed) / (row_number() - 1)) %>% mutate(previous_apt_2 = lag(missed, 2))

Assigning values to a column based on values of other columns in the same dataframe in R

I have a dataframe with 3 columns and I want to assign values to a fourth column of this dataframe if a condition based on the other columns of the same row is met. In this example I want to assign 1 to df[,4] for each row where df[,3]>=2.
An example of what I want as the output is:
Any help is appreciated.
Thank you,
library(tidyverse)
data <-
tribble(
~ID, ~time1, ~time2,
'jkjkdf', 1, 1,
'kjkj', 1, 0,
'fgf', 1, 1,
'jhkj', 0, 1,
'hgd', 0,0
)
mutate(data, label = if_else(time1 + time2 >= 2, 1, 0))
#> # A tibble: 5 x 4
#> ID time1 time2 label
#> <chr> <dbl> <dbl> <dbl>
#> 1 jkjkdf 1 1 1
#> 2 kjkj 1 0 0
#> 3 fgf 1 1 1
#> 4 jhkj 0 1 0
#> 5 hgd 0 0 0
#or with n time columns
# rowSums() operates on all rows at once, so the rowwise() step is not
# needed; this stays vectorized (and fast) for any number of time columns
data %>%
  mutate(label = if_else(rowSums(across(starts_with('time'))) >= 2, 1, 0))
#> # A tibble: 5 x 4
#>   ID     time1 time2 label
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 jkjkdf     1     1     1
#> 2 kjkj       1     0     0
#> 3 fgf        1     1     1
#> 4 jhkj       0     1     0
#> 5 hgd        0     0     0
Created on 2021-06-06 by the reprex package (v2.0.0)
Do you want to assign 1 if both time1 and time2 are 1 ?
If there are only two columns you can do -
df$label <- as.integer(df$time1 == 1 & df$time2 == 1)
If there are many such time columns we can take help of rowSums -
# Anchor the pattern so only columns whose names start with "time" are
# used ("time" alone would also match names merely containing it)
cols <- grep('^time', names(df))
# label is 1 only when every time column in the row equals 1
df$label <- as.integer(rowSums(df[cols] == 1) == length(cols))
df
# a time1 time2 label
#1 a 1 1 1
#2 b 1 0 0
#3 c 1 1 1
#4 d 0 1 0
#5 e 0 0 0
data
Images are not the right way to share data, provide them in a reproducible format.
df <- data.frame(a = letters[1:5],
time1 = c(1, 1, 1, 0, 0),
time2 = c(1, 0, 1, 1, 0))
We could do this in a vectorized way using tidyverse methods - select the columns that starts_with 'time' in column name, reduce it to a single vector by adding (+) the corresponding elements, use the aliases from magrittr to convert it to binary for creating the 'label' column. Finally, the object should be assigned (<-) to original data if we want the original object to be changed
library(dplyr)
library(purrr)
library(magrittr)
# Sum the time columns elementwise with reduce(`+`), test >= 2, then turn
# the logical into 0/1 -- all written with magrittr's function aliases.
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1.0; pick() is the
# modern replacement -- confirm which dplyr version is in use.
df %>%
mutate(label = select(cur_data(), starts_with('time')) %>%
reduce(`+`) %>%
is_weakly_greater_than(2) %>%
multiply_by(1))
a time1 time2 label
1 a 1 1 1
2 b 1 0 0
3 c 1 1 1
4 d 0 1 0
5 e 0 0 0
data
df <- structure(list(a = c("a", "b", "c", "d", "e"), time1 = c(1, 1,
1, 0, 0), time2 = c(1, 0, 1, 1, 0)), class = "data.frame", row.names = c(NA,
-5L))

Create new columns based on comma-separated values in another column in R [duplicate]

This question already has answers here:
Convert column with pipe delimited data into dummy variables [duplicate]
(4 answers)
Closed 2 years ago.
I have some data similar to that below.
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
df
# id tags
# 1 1 A,B,AB,C
# 2 2 C
# 3 3 AB,E
# 4 4 <NA>
# 5 5 B,C
I'd like to create a new dummy variable column for each tag in the "tags" column, resulting in a dataframe like the following:
correct_df <- data.frame(id = 1:5,
tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"),
A = c(1, 0, 0, 0, 0),
B = c(1, 0, 0, 0, 1),
C = c(1, 1, 0, 0, 1),
E = c(0, 0, 1, 0, 0),
AB = c(1, 0, 1, 0, 0)
)
correct_df
# id tags A B C E AB
# 1 1 A,B,AB,C 1 1 1 0 1
# 2 2 C 0 0 1 0 0
# 3 3 AB,E 0 0 0 1 1
# 4 4 <NA> 0 0 0 0 0
# 5 5 B,C 0 1 1 0 0
One of the challenges is ensuring that the "A" column has 1 only for the "A" tag, so that it doesn't has 1 for the "AB" tag, for example. The following won't work for this reason, since "A" gets 1 for the "AB" tag:
df <- df %>%
mutate(A = ifelse(grepl("A", tags, fixed = T), 1, 0))
df
# id tags A
# 1 1 A,B,AB,C 1
# 2 2 C 0
# 3 3 AB,E 1 < Incorrect
# 4 4 <NA> 0
# 5 5 B,C 0
Another challenge is doing this programmatically. I can probably deal with a solution that manually creates a column for each tag, but a solution that doesn't assume which tag columns need to be created beforehand is best, since there can potentially be many different tags. Is there some relatively simple solution that I'm overlooking?
Does this work:
> library(tidyr)
> library(dplyr)
> df %>% separate_rows(tags) %>% mutate(A = case_when(tags == 'A' ~ 1, TRUE ~ 0),
+ B = case_when(tags == 'B' ~ 1, TRUE ~ 0),
+ C = case_when(tags == 'C' ~ 1, TRUE ~ 0),
+ E = case_when(tags == 'E' ~ 1, TRUE ~ 0),
+ AB = case_when(tags == 'AB' ~ 1, TRUE ~ 0)) %>%
+ group_by(id) %>% mutate(tags = toString(tags)) %>% group_by(id, tags) %>% summarise(across(A:AB, sum))
`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 5 x 7
# Groups: id [5]
id tags A B C E AB
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A, B, AB, C 1 1 1 0 1
2 2 C 0 0 1 0 0
3 3 AB, E 0 0 0 1 1
4 4 NA 0 0 0 0 0
5 5 B, C 0 1 1 0 0
>
Here's a solution:
library(dplyr)
library(stringr)
library(magrittr)
library(tidyr)
# Data
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
# Copy the tags so the original column survives the reshape, split into
# one tag per row, mark each occurrence with a 1, then spread the tags
# into columns, filling absent (id, tag) combinations with 0.
df <- df %>%
  mutate(t2 = tags) %>%
  separate_rows(t2, sep = ",") %>%
  mutate(pa = 1) %>%
  pivot_wider(names_from = t2, values_from = pa, values_fill = 0)
df
# # A tibble: 5 x 8
# id tags A B AB C E `NA`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 A,B,AB,C 1 1 1 1 0 0
# 2 2 C 0 0 0 1 0 0
# 3 3 AB,E 0 0 1 0 1 0
# 4 4 NA 0 0 0 0 0 1
# 5 5 B,C 0 1 0 1 0 0
Edit: updated the code to enable it to retain the tags column. Sorry.

Create a random binary variable for a subset of observations assigning 1 to a specific proportion of rows

I have a dataframe...
df <- tibble(
id = 1:10,
family = c("a","a","b","b","c", "d", "e", "f", "g", "h")
)
Families will only contain 2 members at most (so they're either individuals or pairs).
For individuals (families with only one row, i.e. id = 5:10), I want to create a column called 'random' that randomly assigns 50% of the entries as 1 and the rest as 0. All other rows (those belonging to families with 2 members) should also equal 0.
By the end, the data should look like the following (depending on which 50% of rows are assigned 1)...
df <- tibble(
id = 1:10,
family = c("a","a","b","b","c", "d", "e", "f", "g", "h"),
random = c(0, 0, 0, 0, 1, 0, 1, 1, 0, 0)
)
I am mostly using Tidyverse and would like to include it within a pipe.
I am currently trying something along the lines of...
df %>%
group_by(family) %>%
mutate(random = if(n() == 1) *not sure what goes here* else 0)
We can assign 0 if number of rows in a family is greater than 1 else select a random value between 0 and 1.
library(dplyr)
df %>%
group_by(family) %>%
# n() is the family size: pairs are fixed at 0, each singleton gets an
# independent fair coin flip. Because the draws are independent, exactly
# half of the singletons being 1 is not guaranteed; a fixed 50/50 split
# needs a different approach (sampling the singleton rows directly).
mutate(random = if(n() > 1) 0 else sample(0:1, 1))
# id family random
# <int> <chr> <dbl>
# 1 1 a 0
# 2 2 a 0
# 3 3 b 0
# 4 4 b 0
# 5 5 c 1
# 6 6 d 1
# 7 7 e 0
# 8 8 f 0
# 9 9 g 0
#10 10 h 0
If we want a fixed number of 1's and 0's for groups with 1 value we can use
df %>%
# n = number of rows sharing this family (2 for pairs, 1 for singletons)
add_count(family) %>%
# first zero out the pairs; then among the remaining 1s (the singletons),
# randomly pick half of the positions and flip them to 0, leaving an
# exact 50% of singletons marked 1
mutate(n = replace(n, n > 1, 0),
n = replace(n, {inds = which(n == 1);sample(inds, length(inds)/2)}, 0))
# A tibble: 10 x 3
# id family n
# <int> <chr> <dbl>
# 1 1 a 0
# 2 2 a 0
# 3 3 b 0
# 4 4 b 0
# 5 5 c 1
# 6 6 d 0
# 7 7 e 0
# 8 8 f 1
# 9 9 g 1
#10 10 h 0
Using data.table
library(data.table)
# .N is the per-family row count. Note the result contains only the
# columns family and V1 (the 0/1 draw), not the other columns of df.
setDT(df)[, if(.N > 1) 0 else sample(0:1, 1), family]

Check condition and return name of column for which the condition is fulfilled

I have a dataframe that looks like this:
df_start <- data.frame(
a = c(1, 1, 1, 1, 1),
b = c(0, 1, 0, 0, 0),
c = c(1, 0, 0, 0, 0),
n = c(0, 0, 0, 1, 0))
I want to test the condition if any of the columns from df_start[,2:n] (where n shows the last column of the dataframe) are equal to df$a then create two new columns out of which the first one returns 1 if the condition is TRUE and 0 if it is not, and the other gives the name of the column for which the condition was TRUE.
I managed to create the first column like this:
library(dplyr)
# check condition
df_start <- df_start %>% mutate(cond = ifelse(a == b | a == c | a == n, 1, 0))
Even though I think I need a different approach since I may have different number of columns every time. So I need to test the condition for column a and all columns from the 2 to the last one but I also would need to know for which column the condition was fulfilled.
Desired output:
# desired output
df_end <- data.frame(a = c(1, 1, 1, 1, 1),
b = c(0, 1, 0, 0, 0),
c = c(1, 0, 0, 0, 0),
n = c(0, 0, 0, 1, 0),
cond = c(1,1,0,1,0),
col_name = c("c", "b", NA, "n", NA))
Is there a way to do this with dplyr maybe or base R ? Although any other solutions are appreciated.
Another base R solution:
# Elementwise comparison of column "a" against every other column; using
# [, -1] generalizes to any number of columns instead of hard-coding 2:4.
m <- df_start[, 1] == df_start[, -1]
df_start$cond <- rowSums(m)
# max.col(m) gives the position of the first TRUE per row; multiplying by
# rowSums(m) maps all-FALSE rows to index 0, which is dropped on the right
# while !!rowSums(m) skips those rows on the left, so col_name stays NA.
df_start$col_name[!!rowSums(m)] <- names(df_start)[-1][max.col(m) * rowSums(m)]
which gives:
> df_start
a b c n cond col_name
1 1 0 1 0 1 c
2 1 1 0 0 1 b
3 1 0 0 0 0 <NA>
4 1 0 0 1 1 n
5 1 0 0 0 0 <NA>
You can try a tidyverse. I like to use gather and spread for such approaches.
library(tidyverse)
# Long-format approach: one row per (original row, column) pair, so the
# match can be found with plain vector logic within each original row.
# NOTE(review): gather()/spread() are superseded; pivot_longer() and
# pivot_wider() are the modern equivalents (output column order differs).
df_start %>%
# preserve each row's identity through the reshape
rownames_to_column() %>%
gather(k, v, -a, -rowname) %>%
group_by(rowname) %>%
# cond: does any gathered column equal a for this row?
mutate(cond=ifelse(any(a==v), 1, 0)) %>%
# col_name: the matching column's name (assumes at most one v == 1 per
# row -- TODO confirm for data with several matches)
mutate(col_name=ifelse(cond==1, k[v==1], NA)) %>%
ungroup() %>%
spread(k, v) %>%
select(-rowname)
# A tibble: 5 x 6
a cond col_name b c n
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 c 0 1 0
2 1 1 b 1 0 0
3 1 0 NA 0 0 0
4 1 1 n 0 0 1
5 1 0 NA 0 0 0
Or without transforming a piping/apply-solution like
df_start %>%
# For each row, collect the names of the non-"a" columns that equal 1.
# NOTE(review): when nothing matches, y[x==1] is character(0); the printed
# result shows 0 rather than NA for those rows (see rows 3 and 5 below) --
# confirm that is acceptable before reusing this.
mutate(col_name=apply(.[-1], 1, function(x, y) y[x==1], colnames(.)[-1])) %>%
# cond: 1 when any later column equals the first column ("a"); note that
# here "." is the output of the previous step, so -ncol(.) drops col_name
mutate(cond=as.numeric(apply(.[-ncol(.)], 1, function(x) any(x[1] == x[-1]))))
a b c n col_name cond
1 1 0 1 0 c 1
2 1 1 0 0 b 1
3 1 0 0 0 0
4 1 0 0 1 n 1
5 1 0 0 0 0
The following uses base R only.
Note that in order to create column cond there is no need for ifelse.
df_end <- df_start
# Elementwise equality of every other column against column 1 ("a");
# generalizes to any number of columns instead of hard-coding b, c, n.
eq <- df_start[-1] == df_start[[1]]
# cond: 1 when at least one column matches (no ifelse needed)
df_end$cond <- as.integer(rowSums(eq) > 0)
# which(x)[1] is NA for all-FALSE rows -- no warning and no Inf, unlike
# min(which(x)) -- and indexing names() with NA propagates NA to col_name;
# + 1 shifts past the dropped first column.
df_end$col_name <- names(df_start)[apply(eq, 1, function(x) which(x)[1] + 1)]
df_end
df_end
# a b c n cond col_name
#1 1 0 1 0 1 c
#2 1 1 0 0 1 b
#3 1 0 0 0 0 <NA>
#4 1 0 0 1 1 n
#5 1 0 0 0 0 <NA>
Great solution, @Jimbou, with the tidyverse. For completeness, you can rearrange the output columns by replacing the last line in Jimbou's code with:
select(-c(rowname, cond, col_name), c(cond, col_name))

Resources