Mutate multiple columns if grouping variable has all observations missing - r

I'm trying to mutate the columns "a" and "b" only if the grouping variable "group" has all observations missing. The attempted solution changes the group "blue", in which not all observations are missing. Thanks in advance for your valuable time!
Code below:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 NA 1 a yellow
# 2 NA 1 b yellow
# 3 1 NA c black
# 4 1 NA d black
# 5 NA 1 e blue
# 6 1 NA f blue
# failed attempt: observations from group "blue" change
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(all(is.na(.x)), 99999,.x))))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 NA 1 f blue
# desired output - observations from blue remain the same
a2 <- c(99999,99999,1,1,NA,1)
b2 <- c(1,1,99999,99999,1,NA)
c2 <- letters[1:6]
group2 <- c("yellow","yellow","black","black", "blue", "blue")
(data_desired <- as_tibble(data.frame(a2,b2,c2,group2)))
# a2 b2 c2 group2
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 1 NA f blue

You could try this:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(is.na(.x), 99999,.x))))
# A tibble: 6 x 4
# Groups: group [3]
a b c group
<dbl> <dbl> <fct> <fct>
1 99999 1 a yellow
2 99999 1 b yellow
3 1 99999 c black
4 1 99999 d black
5 99999 1 e blue
6 1 99999 f blue

Not the best solution but you can deal with it...
data <- data %>%
group_by(group) %>%
mutate(new = paste0(a, "_", b),
new1 = if_else(new == lag(new), str_replace(new, "NA", "99999"), new),
new2 = if_else(new == lead(new), str_replace(new, "NA", "99999"), new)
) %>%
separate(col = new1, into = c("a_new1", "b_new1"), sep = "_", extra = "drop") %>%
separate(col = new2, into = c("a_new2", "b_new2"), sep = "_", extra = "drop") %>%
mutate(a2 = if_else(is.na(a_new1), replace_na(a_new2), a_new1),
b2 = if_else(is.na(b_new1), replace_na(b_new2), b_new1)
) %>%
select(a, b, c, group, a2, b2) %>%
type_convert()
data
# A tibble: 6 x 6
# Groups: group [3]
a b c group a2 b2
<dbl> <dbl> <fct> <fct> <dbl> <dbl>
1 NA 1 a yellow 99999 1
2 NA 1 b yellow 99999 1
3 1 NA c black 1 99999
4 1 NA d black 1 99999
5 NA 1 e blue NA 1
6 1 NA f blue 1 NA

Thanks all for the input!
Finally, this is how I resolved this with lists and purrr.
library(tidyverse)
library(purrr)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# list with groups in which all cases are NA
list1 <- data %>%
split(.,.$group) %>%
map(~select(.x,as.vector(which(colSums(is.na(.)) == nrow(.))))) %>%
map(~mutate_all(.x, replace_na, 99999))
# list with groups in which there is at least one valid observation
list2 <- data %>%
split(.,.$group) %>%
map(~select(.x, as.vector(which(colSums(is.na(.)) != nrow(.)))))
# putting the groups together into a dataframe
list3 <- mapply(cbind, list1, list2, SIMPLIFY=FALSE)
(desired_output <- do.call(rbind.data.frame, list3))

Related

Create new column based on previous column by group; if missing, use NA

I am trying out to select a value by group from one column, and pass it as value in another column, extending for the whole group. This is similar to question asked here . BUt, some groups do not have this number: in that case, I need to fill the column with NAs. How to do this?
Dummy example:
dd1 <- data.frame(type = c(1,1,1),
grp = c('a', 'b', 'd'),
val = c(1,2,3))
dd2 <- data.frame(type = c(2,2),
grp = c('a', 'b'),
val = c(8,2))
dd3 <- data.frame(type = c(3,3),
grp = c('b', 'd'),
val = c(7,4))
dd <- rbind(dd1, dd2, dd3)
Create new column:
dd %>%
group_by(type) %>%
mutate(#val_a = ifelse(grp == 'a', val , NA),
val_a2 = val[grp == 'a'])
Expected outcome:
type grp val val_a # pass in `val_a` value of teh group 'a'
1 1 a 1 1
2 1 b 2 1
3 1 d 3 1
4 2 a 8 8
5 2 b 2 8
6 3 b 7 NA
7 3 d 4 NA # value for 'a' is missing from group 3
You were close with your first approach; use any to apply the condition to all observations in the group:
dd %>%
group_by(type) %>%
mutate(val_a = ifelse(any(grp == "a"), val[grp == "a"] , NA))
type grp val val_a
<dbl> <chr> <dbl> <dbl>
1 1 a 1 1
2 1 b 2 1
3 1 d 3 1
4 2 a 8 8
5 2 b 2 8
6 3 b 7 NA
7 3 d 4 NA
Try this:
dd %>%
group_by(type) %>%
mutate(val_a2 = val[which(c(grp == 'a'))[1]])
# # A tibble: 7 x 4
# # Groups: type [3]
# type grp val val_a2
# <dbl> <chr> <dbl> <dbl>
# 1 1 a 1 1
# 2 1 b 2 1
# 3 1 d 3 1
# 4 2 a 8 8
# 5 2 b 2 8
# 6 3 b 7 NA
# 7 3 d 4 NA
This also controls against the possibility that there could be more than one match, which may cause bad results (with or without a warning).

decide 1 or 2 or both grouped by 2 variables

Question updated 9/10 !
DF<-data.frame(id=c(1,1,1,2,2,2),rank=c("1","2","3","1","2","3"),code=c("A","B","B","B","B","A"))
DF
id rank code
1 A1 1 A
2 A1 2 B
3 A1 3 B
4 B2 1 B
5 B2 2 B
6 B2 3 A
Desired output:
id rank code type1 type2 type3
1 A1 1 A aa MIX MIX
2 A1 2 B NA MIX MIX
3 A1 3 B NA NA MIX
4 B2 1 B bb bb MIX
5 B2 2 B NA bb MIX
6 B2 3 A NA NA MIX
All is grouped by id
type1 gets code where rank = 1.
type2 gets code where rank = 1-2. If code is different in rank 1 and 2, then MIX
type3 gets code where rank = 1-3. etc. etc.
Anyone? :)
Here's a dplyr solution using ifelse and a temporary column to reduce boilerplate:
library(dplyr)
DF %>%
group_by(id) %>%
mutate(a = code[rank == 1],
type1 = ifelse(rank > 1, NA,
ifelse(all(code[!(rank > 1)] == a[1]), a[1], "MIX")),
type2 = ifelse(rank > 2, NA,
ifelse(all(code[!(rank > 2)] == a[1]), a[1], "MIX")),
type3 = ifelse(rank > 3, NA,
ifelse(all(code[!(rank > 3)] == a[1]), a[1], "MIX"))) %>%
select(-a)
#> # A tibble: 6 x 6
#> # Groups: id [2]
#> id rank code type1 type2 type3
#> <dbl> <chr> <chr> <chr> <chr> <chr>
#> 1 1 1 A A MIX MIX
#> 2 1 2 B NA MIX MIX
#> 3 1 3 B NA NA MIX
#> 4 2 1 B B B MIX
#> 5 2 2 B NA B MIX
#> 6 2 3 A NA NA MIX
Using dplyr with case_when statements:
DF %>%
group_by(id) %>%
mutate(type2_grp = if_else(rank <= 2, 1, 0),
type3_grp = if_else(rank <= 3, 1, 0)) %>%
mutate(type1 = case_when(rank == 1 ~ code)) %>%
group_by(id, type2_grp) %>%
mutate(type2 = case_when(type2_grp == 1 & length(unique(code)) > 1 ~ "MIX",
type2_grp == 1 & code == "A" ~ "A",
type2_grp == 1 & code == "B" ~ "B")) %>%
group_by(id, type3_grp) %>%
mutate(type3 = case_when(type3_grp == 1 & length(unique(code)) > 1 ~ "MIX",
type3_grp == 1 & code == "A" ~ "A",
type3_grp == 1 & code == "B" ~ "B")) %>%
ungroup() %>%
select(-type2_grp, -type3_grp)
Which creates:
# A tibble: 6 x 6
id rank code type1 type2 type3
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 1 A A MIX MIX
2 1 2 B NA MIX MIX
3 1 3 B NA NA MIX
4 2 1 B B B MIX
5 2 2 B NA B MIX
6 2 3 A NA NA MIX
A base R solution for an arbitrary number of "type" columns
maxtype=3
do.call(rbind,
by(DF,list(DF$id),function(x){
y=list()
for (i in 1:maxtype) {
tmp=rep(NA,nrow(x))
idx=as.numeric(x$rank)<=i
if (length(unique(x$code[idx]))==1) {
tmp[idx]=x$code[1]
} else {
tmp[idx]="MIX"
}
y[[paste0("type",i)]]=tmp
}
cbind(x,y)
})
)
id rank code type1 type2 type3
1.1 1 1 A A MIX MIX
1.2 1 2 B <NA> MIX MIX
1.3 1 3 B <NA> <NA> MIX
2.4 2 1 B B B MIX
2.5 2 2 B <NA> B MIX
2.6 2 3 A <NA> <NA> MIX
Assuming DF is sorted by id then rank, your type columns for each id will be an upper triangular matrix of "MIX" subset with an upper triangular matrix of the first code value for as many rows as it appears.
A data.table solution:
library(data.table)
DF <- data.frame(id=c(1,1,1,2,2,2),rank=c("1","2","3","1","2","3"),code=c("A","B","B","B","B","A"))
setDT(DF)[, `:=`(rank = factor(rank), code = factor(code))]
maxRank <- nlevels(DF$rank)
naLvl <- nlevels(DF$code) + 2L
mTri <- matrix(nlevels(DF$code) + 1L, nrow = maxRank, ncol = maxRank)
mTri[lower.tri(mTri)] <- naLvl
typeMat <- function(rank, code) {
firstrep <- rle(code)[[1]][1]
mSubTri <- matrix(naLvl, nrow = firstrep, ncol = firstrep)
mSubTri[upper.tri(mSubTri, diag = TRUE)] <- code[1]
mOut <- mTri
mOut[1:firstrep, 1:firstrep] <- mSubTri
return(mOut[rank,, drop = FALSE])
}
DF <- cbind(DF, as.data.table(do.call(rbind, DF[, (type = list(list(typeMat(as.integer(rank), as.integer(code))))), by = id]$V1)))
typeCols <- 4:(3 + maxRank)
DF[, (typeCols) := lapply(.SD, function(x) {factor(x, levels = 1:naLvl, labels = c(levels(code), "MIX", NA), exclude = NULL)}), .SDcols = typeCols]
setnames(DF, 4:(3 + maxRank), paste0("type", 1:maxRank))
> DF
id rank code type1 type2 type3
1: 1 1 A A MIX MIX
2: 1 2 B <NA> MIX MIX
3: 1 3 B <NA> <NA> MIX
4: 2 1 B B B MIX
5: 2 2 B <NA> B MIX
6: 2 3 A <NA> <NA> MIX

Can I extract the value from several columns by column's name?

library(dplyr)
mydf <- data.frame(a_x = c(1,2,3,4,5),
b_x = c(8,9,10,11,12),
a_y = c("k",'b','a','d','z'),
b_y = c('aa','bb','cc','dd','ee'),
prefix=c("a","b","c","a","a"))
mydf
Assuming that the data I have is mydf, I would like to produce the same result as mydf2.
I made a column with the name of the column containing the value to be extracted.
I want to extract the value through this column.
mydf2 <- data.frame(a_x=c(1,2,3,4,5),
b_x=c(8,9,10,11,12),
prefix=c("a","b","c","a","a"),
desired_x_value = c(1,9,NA,4,5),
desired_y_value = c('k','bb',NA,'d','z'))
mydf2
I've used 'get' and 'paste0' but it doesn't work. Can I solve this problem through 'dplyr' chain?
mydf %>% mutate(desired_x_value = get(paste0(prefix,"_x")),
desired_y_value = get(paste0(prefix,"_y")))
So basically you want to create new columns (desired_x_value and desired_y_value) of which its value depends on a condition. Using dplyr I prefer case_when as it is the best readable way to do it, but you could also use (nested) if(else) statements. What it is doing is "if X meets condition A do Y, if X meets condition B do Z, if X meets condition .... do ..."
mydf %>%
dplyr::mutate(
desired_x_value = case_when(
prefix == "a" ~ a_x,
prefix == "b" ~ b_x,
desired_y_values = case_when(
prefix == "a" ~a_y,
prefix == "b" ~b_y,
TRUE ~ NA_character_ ))
You can remove the columns you don't need anymore in a second step if you want. the code above results in the table:
a_x b_x a_y b_y prefix desired_x_value desired_y_values
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc c NA <NA>
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
You can write a helper function for this :
get_value <- function(data, prefix, group) {
data[cbind(1:nrow(data), match(paste(prefix, group, sep = '_'), names(data)))]
}
mydf %>%
mutate(desired_x_value = get_value(select(., ends_with('_x')), prefix, 'x'),
desired_y_value = get_value(select(., ends_with('_y')), prefix, 'y'))
# a_x b_x a_y b_y prefix desired_x_value desired_y_value
#1 1 8 k aa a 1 k
#2 2 9 b bb b 9 bb
#3 3 10 a cc c NA <NA>
#4 4 11 d dd a 4 d
#5 5 12 z ee a 5 z
A simple rowwise also works.
mydf %>% rowwise() %>%
mutate(desired_x = ifelse(any(str_detect(names(mydf)[-5], prefix)),
get(paste(prefix, 'x', sep = '_')), NA),
desired_y = ifelse(any(str_detect(names(mydf)[-5], prefix)),
get(paste(prefix, 'y', sep = '_')), NA))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x desired_y
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc c NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
If the prefixes don't contain any invalid column prefixes, this will do without ifelse statement.
mydf <- data.frame(a_x = c(1,2,3,4,5),
b_x = c(8,9,10,11,12),
a_y = c("k",'b','a','d','z'),
b_y = c('aa','bb','cc','dd','ee'),
prefix=c("a","b","a","a","a"))
mydf %>% rowwise() %>%
mutate(desired_x = get(paste(prefix, 'x', sep = '_')),
desired_y = get(paste(prefix, 'y', sep = '_')))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x desired_y
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc a 3 a
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
First I would like to say that I am not presenting this as a good solution as other proposed solutions are much better and simpler. However, since you have brought up get function, I wanted to show you how to make use of it to get your desired output. As a matter of fact some of the values in your prefix column such as c does not have a match among your column names and get function throws an error on terminating the execution, and unlike mget function it does not have a ifnotfound argument. So you need a way to go around that error message by means of an ifelse:
library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
library(glue)
mydf1 %>%
mutate(desired_x_value = map(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(glue("{.x}_x")), NA)),
desired_y_value = map(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(glue("{.x}_y")), NA))) %>%
unnest(cols = c(desired_x_value, desired_y_value))
# A tibble: 5 x 7
a_x b_x a_y b_y prefix desired_x_value desired_y_value
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc NA NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
You can also use paste function instead of glue and in case we already know the output types of the desired columns, we can spare the last line:
mydf1 %>%
mutate(desired_x_value = map_dbl(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(paste(.x, "x", sep = "_")), NA)),
desired_y_value = map_chr(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(paste(.x, "y", sep = "_")), NA)))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x_value desired_y_value
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc NA NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z

How to drop a group if frequency of NAs in data is above certain threshold in dplyr?

How do I get from
# A tibble: 6 x 2
group_var psbl_NAs
<chr> <dbl>
1 a 1
2 a NA
3 a NA
4 b 1
5 b 1
6 b NA
to
# A tibble: 6 x 2
group_var psbl_NAs
<chr> <dbl>
1 b 1
2 b 1
3 b NA
using the fact that psbl_NAs in group "a" are present in more than 50% of the data?
tibble(
group_var = c(rep("a",3), rep("b",3)),
psbl_NAs = c(1, NA, NA, 1, 1, NA)
) %>%
group_by(group_var) %>%
??????
We can group_by, mutate, and then filter:
d %>%
group_by(group_var) %>%
# calculate % of NA values by group
mutate(pct_na = mean(is.na(psbl_NAs))) %>%
# only keep where % of NA values < 0.5
filter(pct_na < 0.5) %>%
select(-pct_na) # remove % NA column
# group_var psbl_NAs
# <chr> <dbl>
# 1 b 1
# 2 b 1
# 3 b NA
It might be instructive to see the result of our calculation of % NAs:
d %>%
group_by(group_var) %>%
# calculate % of NA values by group
mutate(pct_na = mean(is.na(psbl_NAs)))
# group_var psbl_NAs pct_na
# <chr> <dbl> <dbl>
# 1 a 1 0.667
# 2 a NA 0.667
# 3 a NA 0.667
# 4 b 1 0.333
# 5 b 1 0.333
# 6 b NA 0.333
Here's a one line base R solution using ave:
d[with(d, ave(psbl_NAs, group_var, FUN = function(x) mean(is.na(x)))) < 0.5,]

summarise and group_by using two different columns consecutively

I have a dataframe df with three columns a,b,c.
df <- data.frame(a = c('a','b','c','d','e','f','g','e','f','g'),
b = c('X','Y','Z','X','Y','Z','X','X','Y','Z'),
c = c('cat','dog','cat','dog','cat','cat','dog','cat','cat','dog'))
df
# output
a b c
1 a X cat
2 b Y dog
3 c Z cat
4 d X dog
5 e Y cat
6 f Z cat
7 g X dog
8 e X cat
9 f Y cat
10 g Z dog
I have to group_by using the column b followed by summarise using the column c with counts of available values in it.
df %>% group_by(b) %>%
summarise(nCat = sum(c == 'cat'),
nDog = sum(c == 'dog'))
#output
# A tibble: 3 × 3
b nCat nDog
<fctr> <int> <int>
1 X 2 2
2 Y 2 1
3 Z 2 1
However, before doing the above task, I should remove the rows belonging to a value in a which has more than one value in b.
df %>% group_by(a) %>% summarise(count = n())
#output
# A tibble: 7 × 2
a count
<fctr> <int>
1 a 1
2 b 1
3 c 1
4 d 1
5 e 2
6 f 2
7 g 2
For example, in this dataframe, all the rows having value e(values: Y,X), f(values: Z,Y), g(values: X,Z) in column a.
# Expected output
# A tibble: 3 × 3
b nCat nDog
<fctr> <int> <int>
1 X 1 1
2 Y 0 1
3 Z 1 0
We can use filter with n_distinct to filter the values in 'b' that have only one unique element for each 'a' group, then grouped by 'b', we do the summarise
df %>%
group_by(a) %>%
filter(n_distinct(b)==1) %>%
group_by(b) %>%
summarise(nCat =sum(c=='cat'), nDog = sum(c=='dog'), Total = n())
# A tibble: 3 × 4
# b nCat nDog Total
# <fctr> <int> <int> <int>
#1 X 1 1 2
#2 Y 0 1 1
#3 Z 1 0 1

Resources