Consolidate flag columns into single column in R - r

Suppose I have a dataframe like this
testtbl <- data.frame(ID = c('1','2','3','4'),
A = c(1,0,1,1),
B = c(1,1,1,1),
C = c(0,0,0,1),
D = c(0,1,1,1))
> testtbl
ID A B C D
1 1 1 1 0 0
2 2 0 1 0 1
3 3 1 1 0 1
4 4 1 1 1 1
Where columns A-D are flags that can either be 1 or 0. I would like to consolidate these columns into 1 column, where the new dataframe would look something like:
> testtbl
ID flag
1 1 A,B
2 2 B,D
3 3 A,B,D
4 4 A,B,C,D
At little confused on how I would approach this and would appreciate any hints or help.

A solution from dplyr and tidyr.
library(dplyr)
library(tidyr)
testtbl2 <- testtbl %>%
gather(Col, Val, -ID) %>%
filter(Val == 1) %>%
select(-Val) %>%
group_by(ID) %>%
summarise(flag = toString(Col))
testtbl2
# # A tibble: 4 x 2
# ID flag
# <fctr> <chr>
# 1 1 A, B
# 2 2 B, D
# 3 3 A, B, D
# 4 4 A, B, C, D

You can also do it without any libraries with an apply:
testtbl <- data.frame(ID = c('1','2','3','4'),
A = c(1,0,1,1),
B = c(1,1,1,1),
C = c(0,0,0,1),
D = c(0,1,1,1))
test<-data.frame(ID=testtbl$ID, flag=(apply(testtbl[,-1], 1, function(x) paste0(names(x)[which(x==1)], collapse=','))))

Base R
do.call(rbind, lapply(split(testtbl, testtbl$ID), function(x)
data.frame(ID = x[1],
flag = paste(sort(names(x)[-1][x[-1] > 0]),
collapse = ","))))
# ID flag
#1 1 A,B
#2 2 B,D
#3 3 A,B,D
#4 4 A,B,C,D

Related

Find 2 out of 3 conditions per ID

I have the following dataframe:
df <-read.table(header=TRUE, text="id code
1 A
1 B
1 C
2 A
2 A
2 A
3 A
3 B
3 A")
Per id, I would love to find those individuals that have at least 2 conditions, namely:
conditionA = "A"
conditionB = "B"
conditionC = "C"
and create a new colum with "index", 1 if there are two or more conditions met and 0 otherwise:
df_output <-read.table(header=TRUE, text="id code index
1 A 1
1 B 1
1 C 1
2 A 0
2 A 0
2 A 0
3 A 1
3 B 1
3 A 1")
So far I have tried the following:
df_output = df %>%
group_by(id) %>%
mutate(index = ifelse(grepl(conditionA|conditionB|conditionC, code), 1, 0))
and as you can see I am struggling to get the threshold count into the code.
You can create a vector of conditions, and then use %in% and sum to count the number of occurrences in each group. Use + (or ifelse) to convert logical into 1 and 0:
conditions = c("A", "B", "C")
df %>%
group_by(id) %>%
mutate(index = +(sum(unique(code) %in% conditions) >= 2))
id code index
1 1 A 1
2 1 B 1
3 1 C 1
4 2 A 0
5 2 A 0
6 2 A 0
7 3 A 1
8 3 B 1
9 3 A 1
You could use n_distinct(), which is a faster and more concise equivalent of length(unique(x)).
df %>%
group_by(id) %>%
mutate(index = +(n_distinct(code) >= 2)) %>%
ungroup()
# # A tibble: 9 × 3
# id code index
# <int> <chr> <int>
# 1 1 A 1
# 2 1 B 1
# 3 1 C 1
# 4 2 A 0
# 5 2 A 0
# 6 2 A 0
# 7 3 A 1
# 8 3 B 1
# 9 3 A 1
You can check conditions using intersect() function and check whether resulting list is of minimal (eg- 2) length.
conditions = c('A', 'B', 'C')
df_output2 =
df %>%
group_by(id) %>%
mutate(index = as.integer(length(intersect(code, conditions)) >= 2))

Multiplying column value by another value matching column name R

I have a data frame which looks like this:
Value1 = c("1","2","1","3")
Letter = c("A","B","B","A")
A = c("2","2","0","1")
B = c("1","1","1","0")
data <- data.frame(Value1,Letter,A,B)
data
Value1 Letter A B
1 1 A 2 1
2 2 B 2 1
3 1 B 0 1
4 3 A 1 0
I'm trying to add a new column which is the multiplication of column Value1, by column A or B depending on what is in the Letter column. The expected result would be:
Value1 Letter A B Results
1 1 A 2 1 2
2 2 B 2 1 2
3 1 B 0 1 1
4 3 A 1 0 3
I'm trying to use the match() function, but without success.
Thanks!
With base R:
data <- type.convert(data, as.is = TRUE)
data$Results <- ifelse(data$Letter == 'A', data$A * data$Value1, data$B * data$Value1)
Output
Value1 Letter A B Results
1 1 A 2 1 2
2 2 B 2 1 2
3 1 B 0 1 1
4 3 A 1 0 3
Another option would be to pivot to long form, do the calculation, then pivot back to wide format.
library(tidyverse)
data %>%
type.convert(as.is = TRUE) %>%
pivot_longer(c(A, B)) %>%
mutate(Results = ifelse(Letter == name, value * Value1, NA_integer_)) %>%
pivot_wider(names_from = "name", values_from = "value") %>%
group_by(Value1, Letter) %>%
summarise_all(discard, is.na)
Output
Value1 Letter Results A B
<int> <chr> <int> <int> <int>
1 1 A 2 2 1
2 1 B 1 0 1
3 2 B 2 2 1
4 3 A 3 1 0
Use case_when or ifelse
library(dplyr)
data <- data %>%
type.convert(as.is = TRUE) %>%
mutate(Results = case_when(Letter == 'A' ~ A * Value1,
TRUE ~ B * Value1))
-output
data
Value1 Letter A B Results
1 1 A 2 1 2
2 2 B 2 1 2
3 1 B 0 1 1
4 3 A 1 0 3
Or use get with rowwise
data <- data %>%
type.convert(as.is = TRUE) %>%
rowwise %>%
mutate(Result = get(Letter) * Value1) %>%
# or may also use
# mutate(Result = cur_data()[[Letter]] * Value1) %>%
ungroup
-output
data
# A tibble: 4 × 5
Value1 Letter A B Result
<int> <chr> <int> <int> <int>
1 1 A 2 1 2
2 2 B 2 1 2
3 1 B 0 1 1
4 3 A 1 0 3
In base R, we may use row/column indexing as vectorized option
data <- type.convert(data, as.is = TRUE)
nm1 <- unique(data$Letter)
data$Results <-data[nm1][cbind(seq_len(nrow(data)),
match(data$Letter, nm1))] * data$Value1

Realization of a data frame with the smallest values grouped by a column

How can I create a new data frame with the smallest values group by a column.
For example this df:
df <- read.table(header = TRUE, text = 'Gene Value
A 12
A 10
B 3
B 0
B 6
C 1
D 0
D 4')
Now with:
test <- setDT(df)[, .SD[which.min(Value)], by=Gene]
I get this:
> test
Gene Value
1: A 10
2: B 0
3: C 1
4: D 0
But how can I use a second condition for Value > 0 here? I want to have this output:
> test
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
Could do:
setDT(df)[, .(Value = min(Value[Value > 0])), by=Gene]
Output:
Gene Value
1: A 10
2: B 3
3: C 1
4: D 4
Using tidyverse you can group, filter and then summarize the min value:
library(tidyverse)
df2 <- df %>%
group_by(Gene) %>%
filter(Value != 0) %>%
summarise(Value = min(Value))
# A tibble: 4 x 2
Gene Value
<fct> <dbl>
1 A 10
2 B 3
3 C 1
4 D 4
Using aggregate from base R
aggregate(Value ~ Gene, subset(df, Value > 0), min)
# Gene Value
#1 A 10
#2 B 3
#3 C 1
#4 D 4

Replace all NA values for variable with one row equal to 0

Slightly difficult to phrase, as far as I saw none of the similar questions answered my problem.
I have a data.frame such as:
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1
id val
1 a NA
2 a NA
3 a NA
4 a NA
5 b 1
6 b 2
7 b 2
8 b 3
9 c NA
10 c 2
11 c NA
12 c 3
and I want to get rid of all the NA values (easy enough using e.g. filter() ) but make sure that if this removes all of one id value (in this case it removes every instance of "a") that one extra row is inserted of (e.g.) a = 0
so that:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c 2
7 c 3
obviously easy enough to do this in a roundabout way but I was wondering if there's a tidy/elegant way to do this. I thought tidyr::complete() might help but not entirely sure how to apply it to a case like this
I don't care about the order of the rows
Cheers!
edit: updated with clearer desired output. might make desired answers submitted before that a bit less clear
Another idea using dplyr,
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(val = ifelse(row_number() == 1 & all(is.na(val)), 0, val)) %>%
na.omit()
which gives,
# A tibble: 5 x 2
# Groups: id [2]
id val
<fct> <dbl>
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
We may do
df1 %>% group_by(id) %>% do(if(all(is.na(.$val))) replace(.[1, ], 2, 0) else na.omit(.))
# A tibble: 5 x 2
# Groups: id [2]
# id val
# <fct> <dbl>
# 1 a 0
# 2 b 1
# 3 b 2
# 4 b 2
# 5 b 3
After grouping by id, if everything in val is NA, then we leave only the first row with the second element replaced by 0, otherwise the same data is returned after applying na.omit.
In a more readable format that would be
df1 %>% group_by(id) %>%
do(if(all(is.na(.$val))) data.frame(id = .$id[1], val = 0) else na.omit(.))
(Here I presume that you indeed want to get rid of all NA values; otherwise there is no need for na.omit.)
df1[is.na(df1)] <- 0
df1[!(duplicated(df1$id) & df1$val == 0), ]
id val
1 a 0
5 b 1
6 b 2
7 b 2
8 b 3
Base R option is to find groups with all NAs and transform them by changing their val to 0 and select only unique rows so that there is only one row per group. We rbind this dataframe with the groups which are !all_NA.
all_NA <- with(df1, ave(is.na(val), id, FUN = all))
rbind(unique(transform(df1[all_NA, ], val = 0)), df1[!all_NA, ])
# id val
#1 a 0
#5 b 1
#6 b 2
#7 b 2
#8 b 3
dplyr option looks ugly but one way is to make two groups of dataframes one with groups of all NA values and other with groups of all non-NA values. For groups with all NA values we add row with it's id and val as 0 and bind this to the other group.
library(dplyr)
bind_rows(df1 %>%
group_by(id) %>%
filter(all(!is.na(val))),
df1 %>%
group_by(id) %>%
filter(all(is.na(val))) %>%
ungroup() %>%
summarise(id = unique(id),
val = 0)) %>%
arrange(id)
# id val
# <fct> <dbl>
#1 a 0
#2 b 1
#3 b 2
#4 b 2
#5 b 3
Changed the df to make example more exhaustive -
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(case=sum(is.na(val))==n(), row_num=row_number() ) %>%
mutate(val=ifelse(is.na(val)&case,0,val)) %>%
filter( !(case&row_num!=1) ) %>%
select(id, val)
Output
id val
<fct> <dbl>
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c NA
7 c 2
8 c NA
9 c 3
Another base approach, one that doesn't maintain the order of the rows and takes advantage of factors remembering lost values:
df1 <- na.omit(df1)
df1 <- rbind(
df1,
data.frame(
id = levels(df1$id)[!levels(df1$id) %in% df1$id],
val = 0)
)
I do personally prefer the dplyr approach given by Sotos, as I don't like rbind-ing data.frames back together so it's a matter of taste, but this isn't unbearably complicated by my eye. It's easy enough to adapt to a character id column with a unique(df1$id) variable.
Here is an option too:
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate_all(funs(replace(.,is.na(.),0))) %>%
slice(4:nrow(.))
This gives:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
Alternative:
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate_all(funs(replace(.,is.na(.),0))) %>%
unique()
UPDATE based on other requirements:
Some users suggested to test on this dataframe. Of course this answer assumes you'll look at everything by hand. Might be less useful if you have to look at everything by "hand" but here goes:
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4), val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate(val=ifelse(id=="a",0,val)) %>%
slice(4:nrow(.))
This yields:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c NA
7 c 2
8 c NA
9 c 3
Here is a base R solution.
res <- lapply(split(df1, df1$id), function(DF){
if(anyNA(DF$val)) {
i <- is.na(DF$val)
DF$val[i] <- 0
DF <- rbind(DF[i & !duplicated(DF[i, ]), ], DF[!i, ])
}
DF
})
res <- do.call(rbind, res)
row.names(res) <- NULL
res
# id val
#1 a 0
#2 b 1
#3 b 2
#4 b 2
#5 b 3
Edit.
A dplyr solution could be the following.
It was tested with the original dataset posted by the OP, with the dataset in Vivek Kalyanarangan's answer and with the dataset in markus' comment, renamed df2 and df3, respectively.
library(dplyr)
na2zero <- function(DF){
DF %>%
group_by(id) %>%
mutate(val = ifelse(is.na(val), 0, val),
crit = val == 0 & duplicated(val)) %>%
filter(!crit) %>%
select(-crit)
}
na2zero(df1)
na2zero(df2)
na2zero(df3)
One may try this :
df1 = data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1
# id val
#1 a NA
#2 a NA
#3 a NA
#4 a NA
#5 b 1
#6 b 2
#7 b 2
#8 b 3
#9 c NA
#10 c 2
#11 c NA
#12 c 3
Task is to remove all rows corresponding to any id IFF val for the corresponding id is all NAs and add new row with this id and val = 0.
In this example, id = a.
Note : val for c also has NAs but all the val corresponding to c are not NA therefore we need to remove the corresponding row for c where val = NA.
So lets create another column say, val2 which indicates 0 means its all NAs and 1 otherwise.
library(dplyr)
df1 = df1 %>%
group_by(id) %>%
mutate(val2 = if_else(condition = all(is.na(val)),true = 0, false = 1))
df1
# A tibble: 12 x 3
# Groups: id [3]
# id val val2
# <fct> <dbl> <dbl>
#1 a NA 0
#2 a NA 0
#3 a NA 0
#4 a NA 0
#5 b 1 1
#6 b 2 1
#7 b 2 1
#8 b 3 1
#9 c NA 1
#10 c 2 1
#11 c NA 1
#12 c 3 1
Get the list of ids with corresponding val = NA for all.
all_na = unique(df1$id[df1$val2 == 0])
Then remove theids from the dataframe df1 with val = NA.
df1 = na.omit(df1)
df1
# A tibble: 6 x 3
# Groups: id [2]
# id val val2
# <fct> <dbl> <dbl>
# 1 b 1 1
# 2 b 2 1
# 3 b 2 1
# 4 b 3 1
# 5 c 2 1
# 6 c 3 1
And create a new dataframe with ids in all_na and val = 0
all_na_df = data.frame(id = all_na, val = 0)
all_na_df
# id val
# 1 a 0
then combine these two dataframes.
df1 = bind_rows(all_na_df, df1[,c('id', 'val')])
df1
# id val
# 1 a 0
# 2 b 1
# 3 b 2
# 4 b 2
# 5 b 3
# 6 c 2
# 7 c 3
Hope this helps and Edits are most welcomed :-)

Using 'window' functions in dplyr

I need to process rows of a data-frame in order, but need to look-back for certain rows. Here is an approximate example:
library(dplyr)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
d <- d %>%
mutate(cond = rep('', n()), num = as.integer(rep(0,n())))
for (i in 1:nrow(d)){
if(d$trial[i] == "A"){
d$num[i] <- 0
d$cond[i] <- "A"
}
else if(d$trial[i] == "B"){
d$num[i] <- 0
d$cond[i] <- "B"
}
else{
d$num[i] <- d$num[i-1] +1
d$cond[i] <- d$cond[i-1]
}
}
The resulting data-frame looks like
> d
Source: local data frame [12 x 3]
trial cond num
1 A A 0
2 a A 1
3 b A 2
4 B B 0
5 x B 1
6 y B 2
7 A A 0
8 a A 1
9 b A 2
10 B B 0
11 x B 1
12 y B 2
What is the proper way of doing this using dplyr?
dlpyr-only solution:
d %>%
group_by(i=cumsum(trial %in% c('A','B'))) %>%
mutate(cond=trial[1],num=seq(n())-1) %>%
ungroup() %>%
select(-i)
# trial cond num
# 1 A A 0
# 2 a A 1
# 3 b A 2
# 4 B B 0
# 5 x B 1
# 6 y B 2
# 7 A A 0
# 8 a A 1
# 9 b A 2
# 10 B B 0
# 11 x B 1
# 12 y B 2
Try
d %>%
mutate(cond = zoo::na.locf(ifelse(trial=="A"|trial=="B", trial, NA))) %>%
group_by(id=rep(1:length(rle(cond)$values), rle(cond)$lengths)) %>%
mutate(num = 0:(n()-1)) %>% ungroup %>%
select(-id)
Here is one way. The first thing was to add A or B in cond using ifelse. Then, I employed na.locf() from the zoo package in order to fill NA with A or B. I wanted to assign a temporary group ID before I took care of num. I borrowed rleid() in the data.table package. Grouping the data with the temporary group ID (i.e., foo), I used row_number() which is one of the window functions in the dplyr package. Note that I tried to remove foo doing select(-foo). But, the column wanted to stay. I think this is probably something to do with compatibility of the function.
library(zoo)
library(dplyr)
library(data.table)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
mutate(d, cond = ifelse(trial == "A" | trial == "B", trial, NA),
cond = na.locf(cond),
foo = rleid(cond)) %>%
group_by(foo) %>%
mutate(num = row_number() - 1)
# trial cond foo num
#1 A A 1 0
#2 a A 1 1
#3 b A 1 2
#4 B B 2 0
#5 x B 2 1
#6 y B 2 2
#7 A A 3 0
#8 a A 3 1
#9 b A 3 2
#10 B B 4 0
#11 x B 4 1
#12 y B 4 2

Resources