how to extract column names based on a condition? - r

Consider this simple example
# Example data: every value column (x, other_var, y, another_var) has a
# matching label_<name> column holding the name to report for it.
# tibble() is used because data_frame() is deprecated in the tibble package.
mytest <- tibble(
  group = c('a', 'a', 'a', 'b', 'b', 'b'),
  x = c(NA, NA, NA, 5, 6, 7),
  other_var = c(NA, NA, NA, 1, 2, 3),
  y = c(3, 5, 6, NA, NA, NA),
  another_var = c(1, 2, 3, NA, NA, NA),
  label_x = c('hello', 'hello', 'hello', 'world', 'world', 'world'),
  label_y = c('bada', 'bada', 'bada', 'boom', 'boom', 'boom'),
  label_other_var = c('ak', 'ak', 'ak', 'run', 'run', 'run'),
  label_another_var = c('noo', 'noo', 'noo', 'bie', 'bie', 'bie')
)
# A tibble: 6 x 9
group x other_var y another_var label_x label_y label_other_var label_another_var
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 a NA NA 3 1 hello bada ak noo
2 a NA NA 5 2 hello bada ak noo
3 a NA NA 6 3 hello bada ak noo
4 b 5 1 NA NA world boom run bie
5 b 6 2 NA NA world boom run bie
6 b 7 3 NA NA world boom run bie
Here, I need to nest() this dataframe by group, and be able to extract the column names of the variables (in each nested dataframe) that are not NAs. The trick is that the actual name of the variable is shown in the label_ column
For instance, this is the output desired:
# A tibble: 4 x 2
group var
<chr> <chr>
1 a bada
2 a noo
3 b world
4 b run
Indeed, take group a. The only non-missing variables are y and another_var. However, the name of y is bada (as shown in the label_y variable) and the name of another_var is noo. The same reasoning applies to b.
I don't know how to do that with a map call after running
# Collapse each group's rows into one nested tibble in a `data` list-column.
mytest %>%
  group_by(group) %>%
  nest()
# A tibble: 2 x 2
group data
<chr> <list>
1 a <tibble [3 x 8]>
2 b <tibble [3 x 8]>
Any ideas?
Thanks!
EDIT: the original, smaller, tibble proposed was the following
# Smaller example with only two value columns (x, y) and their labels.
# The console continuation prompts ("+") were removed so the snippet can be
# pasted and run; tibble() replaces the deprecated data_frame().
mytest <- tibble(
  group = c('a', 'a', 'a', 'b', 'b', 'b'),
  x = c(NA, NA, NA, 5, 6, 7),
  y = c(3, 5, 6, NA, NA, NA),
  label_x = c('hello', 'hello', 'hello', 'world', 'world', 'world'),
  label_y = c('bada', 'bada', 'bada', 'boom', 'boom', 'boom')
)

After grouping and nesting, loop through the 'data' column with map, summarise the 'label' columns by extracting the first element whose value column is non-NA, gather the result into a single column while removing the NAs (na.rm = TRUE), select the 'var' column, and then unnest (after keeping only the columns of interest)
# For every group, report the label of each value column that holds data.
mytest %>%
  group_by(group) %>%
  nest() %>%
  mutate(var = map(data, ~ .x %>%
    # First label among the rows whose value is non-missing; this is NA when
    # the whole value column is missing within the group.
    summarise(
      label_x = label_x[!is.na(x)][1],
      label_y = label_y[!is.na(y)][1]
    ) %>%
    # Stack the labels into one `var` column and drop the missing ones.
    # This is the pivot_longer() form of gather(key, var, na.rm = TRUE),
    # which is superseded.
    pivot_longer(
      everything(),
      names_to = "key",
      values_to = "var",
      values_drop_na = TRUE
    ) %>%
    select(var))) %>%
  select(-data) %>%
  # Name the list-column explicitly; unnest() without `cols` is deprecated.
  unnest(cols = var) %>%
  ungroup()
# A tibble: 2 x 2
# group var
# <chr> <chr>
#1 a bada
#2 b world
Update
If there are more columns, create unique column names and then loop through the corresponding column names with map2
# Value-column names: every non-group column with a leading "label_" removed.
# The pattern is anchored so "label_" is only stripped at the start of a name.
nm1 <- unique(sub("^label_", "", setdiff(names(mytest), "group")))
# The label column that belongs to each value column, in the same order.
nm2 <- paste0("label_", nm1)

mytest %>%
  group_by(group) %>%
  nest() %>%
  mutate(var = map(data, function(d) {
    # Walk value columns and label columns in parallel and take the first
    # label whose value is non-missing (NA when the value column is all NA).
    # all_of() marks nm1/nm2 as external character vectors, not column names.
    labels <- map2_chr(
      select(d, all_of(nm1)),
      select(d, all_of(nm2)),
      ~ .y[!is.na(.x)][1]
    )
    # Drop the value columns that are entirely missing in this group.
    tibble(var = unname(labels[!is.na(labels)]))
  })) %>%
  select(-data) %>%
  # Name the list-column explicitly; unnest() without `cols` is deprecated.
  unnest(cols = var) %>%
  ungroup()
# A tibble: 4 x 2
# group var
# <chr> <chr>
#1 a bada
#2 a noo
#3 b world
#4 b run

This will output the result you want:
# Two-variable example data; tibble() replaces the deprecated data_frame().
mytest <- tibble(
  group = c('a', 'a', 'a', 'b', 'b', 'b'),
  x = c(NA, NA, NA, 5, 6, 7),
  y = c(3, 5, 6, NA, NA, NA),
  label_x = c('hello', 'hello', 'hello', 'world', 'world', 'world'),
  label_y = c('bada', 'bada', 'bada', 'boom', 'boom', 'boom')
)
#' Pick the label of the populated value column for one group.
#'
#' @param df Data frame with columns `group`, `x`, `y`, `label_x`, `label_y`.
#' @param subgroup A single value of `df$group` to look up.
#' @return A one-row tibble with columns `group` and `var`. When no label can
#'   be chosen (the group is absent, or neither `x` nor `y` contains a missing
#'   value) a zero-row tibble with the same columns is returned.
extract_good_colnames <- function(df, subgroup) {
  rows <- filter(df, group == subgroup)

  # A missing x means y carries the data for this group, and vice versa.
  colname <- if (anyNA(rows$x)) {
    "label_y"
  } else if (anyNA(rows$y)) {
    "label_x"
  } else {
    NULL
  }

  # Without this guard `colname` was never assigned in the fall-through case
  # and the function failed with "object 'colname' not found".
  if (is.null(colname)) {
    return(tibble(group = subgroup[0], var = character()))
  }

  # Extract the column as a vector first so that as.character() converts the
  # label itself rather than a one-cell data frame.
  tibble(group = subgroup, var = as.character(rows[[colname]][1]))
}
# Run the extractor once per distinct group and row-bind the results.
groups <- unique(mytest$group)
map_df(groups, ~ extract_good_colnames(mytest, .x))

Related

Find novel categories between groups

I am trying to identify which trees are different between two groups a & b across different forest types (type).
My dummy example:
# Forest type 1: three species in group a, two in group b.
dd1 <- data.frame(
  type = rep(1, 5),
  grp  = c('a', 'a', 'a', 'b', 'b'),
  sp   = c('oak', 'beech', 'spruce', 'oak', 'yew')
)

# Forest type 2: one species in group a, two in group b.
dd2 <- data.frame(
  type = rep(2, 3),
  grp  = c('a', 'b', 'b'),
  sp   = c('oak', 'beech', 'spruce')
)

# Stack both forest types into a single data frame.
dd <- rbind(dd1, dd2)
I can find unique species by each group (in reality, two groups: type & grp) by distinct:
# List each species once within every (type, grp) combination.
dd %>%
  group_by(type, grp) %>%
  distinct(sp)
But instead I want to know which trees in group b are different from group a?
Expected output:
type grp sp
<dbl> <chr> <chr>
1 1 b yew # here, only `yew` is a new one; `oak` was previously listed in group `a`
2 2 b beech # both beech and spruce are new compared to group `a`
3 2 b spruce
How can I do this? Thank you!
The condition to filter is
library(dplyr)

# Within each forest type, keep the group-b rows whose species does not
# occur among that type's group-a species.
dd %>%
  group_by(type) %>%
  filter(grp == 'b', !(sp %in% sp[grp == 'a'])) %>%
  ungroup()
# # A tibble: 3 × 3
# type grp sp
# <dbl> <chr> <chr>
# 1 1 b yew
# 2 2 b beech
# 3 2 b spruce
You could try an anti_join:
library(dplyr)
library(tidyr)

# Drop every row whose (sp, type) pair also occurs in group "a". The group-a
# rows match themselves and are removed too, so only novel rows remain.
dd |>
  anti_join(
    dd |> filter(grp == "a"),
    by = c("sp", "type")
  )
Output:
type grp sp
1 1 b yew
2 2 b beech
3 2 b spruce

R: dplyr How to group by then filter rows based on the condition of each group's first row

I have a simple data frame such as
# Example data: `x` is the grouping key, `y` the value that is tested.
df <- data.frame(
  x = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3),
  y = c('a', 'b', 'a', 'c', 'e', 'd', 'e', 'a', 'f', 'c')
)
I want to group by x, then, if the first row of an x-group has y == 'a', keep only that group's rows that have y == 'a' | y == 'c'
So I expect the outcome would have row 1, 3, 4, 8, 10
Thank you very much.
After grouping by 'x', create an & condition - 1) check whether the first value of 'y' is 'a', 2) condition that checks for values 'a', 'c' in the column
library(dplyr)

# Keep a group's 'a'/'c' rows only when that group's first y is 'a'.
df %>%
  group_by(x) %>%
  filter(first(y) == 'a', y %in% c('a', 'c')) %>%
  ungroup()
-output
# A tibble: 5 × 2
x y
<dbl> <chr>
1 1 a
2 1 a
3 1 c
4 3 a
5 3 c
If we have additional rules, create a named list where the names will be expected first values of 'y' and the vector of values to be filtered, then extract the list element based on the first value of the 'y' and use that vector in the logical expression with %in%
# The list maps a group's first y value to the y values kept for that group;
# a first value with no entry yields NULL, so none of that group's rows match.
df %>%
  group_by(x) %>%
  filter(
    y %in% list(a = c('a', 'c'), e = 'e')[[first(y)]]
  ) %>%
  ungroup()
-output
# A tibble: 7 × 2
x y
<dbl> <chr>
1 1 a
2 1 a
3 1 c
4 2 e
5 2 e
6 3 a
7 3 c
Here is another dplyr option
# ave() broadcasts each x-group's first `y == "a"` flag to all of its rows,
# so no group_by() is needed. The console prompts ("> " and "+ ") were
# removed so the snippet can be pasted and run.
df %>%
  filter(y %in% c("a", "c") & ave(y == "a", x, FUN = first))
x y
1 1 a
2 1 a
3 1 c
4 3 a
5 3 c

Link filter(across()) statements with | (OR) instead of & (AND)

How do I filter a dataframe df for all rows where one or more of columns_to_check meet a condition? As an example: where is at least one cell not NA?
# Example data: row 1 is missing b and c, row 2 is missing c, row 3 is full.
df <- tibble(
  a = c('x', 'x', 'x'),
  b = c(NA, 'x', 'x'),
  c = c(NA, NA, 'x')
)

# Names of the columns the filter should inspect.
columns_to_check <- c('b', 'c')
Checking where all columns are non-NA is straightforward:
library(tidyverse)

# Keep only the rows in which every checked column is non-missing: filter()
# combines the per-column tests produced by across() with AND.
df %>%
  filter(
    across(all_of(columns_to_check), ~ !is.na(.x))
  )
#> # A tibble: 1 x 3
#> a b c
#> <chr> <chr> <chr>
#> 1 x x x
But (how) can I combine the filter() statements created with across() using OR?
Here's an approach with reduce from purrr:
# OR the per-column logical vectors together into a single row mask.
df %>%
  filter(
    reduce(
      .x = across(all_of(columns_to_check), ~ !is.na(.x)),
      .f = `|`
    )
  )
This works because across returns a list of logical vectors that are length nrow(df).
You can see that behavior when you execute it in mutate:
# Show the per-column logical vectors that across() produces.
# The stray console continuation prompt ("+") was removed so this parses.
df %>%
  mutate(across(all_of(columns_to_check), ~ !is.na(.x)))
# A tibble: 3 x 3
a b c
<chr> <lgl> <lgl>
1 x FALSE FALSE
2 x TRUE FALSE
3 x TRUE TRUE
Therefore, you can reduce them together with | to get one logical vector. You don't need .x or .f, they are only there for illustrative purposes.
My mistake, this is documented in vignette("rowwise"):
# Count the non-missing checked cells per row and keep rows with at least one.
df %>%
  filter(
    rowSums(across(all_of(columns_to_check), ~ !is.na(.x))) > 0
  )
Another solution could be:
# Keep rows where at least one checked column is non-missing.
# if_any() ORs the per-column tests into the single logical vector that
# filter() expects; comparing the across() result with `== TRUE` yields a
# logical matrix instead.
df %>%
  filter(if_any(all_of(columns_to_check), ~ !is.na(.x)))
a b c
<chr> <chr> <chr>
1 x x <NA>
2 x x x

Create NA's based on another dataframe without long data

I have a tibble with the explicit "id" and column names of the cells I need to convert to NA. Is there any way I can create the NAs without making my df a long dataset? I considered using the new rows_update function, but I'm not sure if this is correct because I only want certain columns to be NA.
library(dplyr)

# Each row names one cell to blank: the `x` id of the row and the column.
to_na <- tribble(
  ~x, ~col,
  1,  "z",
  3,  "y"
)

# Data whose listed cells should become NA.
df <- tibble(
  x = c(1, 2, 3),
  y = c(1, 1, 1),
  z = c(2, 2, 2)
)
# desired output:
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 1 NA
#> 2 2 1 2
#> 3 3 NA 2
Created on 2020-07-03 by the reprex package (v0.3.0)
This definitely isn't the most elegant solution, but it gets the output you want.
library(dplyr)
library(purrr)

# Each row names one cell to blank: the `x` id of the row and the column.
to_na <- tribble(
  ~x, ~col,
  1,  "z",
  3,  "y"
)

df <- tibble(
  x = c(1, 2, 3),
  y = c(1, 1, 1),
  z = c(2, 2, 2)
)

# For every (x value, column name) pair, build a two-column tibble holding
# `x` and that column with the targeted cell set to NA, then join the pieces
# back together on `x`.
map2(
  to_na$x, to_na$col, # walked in parallel, matched by position
  function(xval_to_missing, col) {
    df %>%
      # across(all_of()) replaces the superseded mutate_at(); all_of() marks
      # `col` as an external character value rather than a column name.
      mutate(across(
        all_of(col),
        ~ if_else(x == xval_to_missing, NA_real_, .x)
      )) %>%
      select(x, all_of(col)) # keep x and the modified column
  }
) %>%
  reduce(left_join, by = "x") %>% # merge the per-pair tibbles by x
  relocate(x, y, z) # restore the original column order
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 1 2
3 3 NA 2
We can use row/column indexing to assign the values to NA in base R
df <- as.data.frame(df)
# Locate each target row by its `x` id instead of using the id as a row
# number, which is only correct while x happens to be 1, 2, 3, ...
# match() returns the first matching row, so ids are assumed unique.
row_idx <- match(to_na$x, df$x)
col_idx <- match(to_na$col, names(df))
# Ignore pairs whose id or column name does not exist in `df`.
found <- !is.na(row_idx) & !is.na(col_idx)
if (any(found)) {
  # A two-column (row, column) matrix addresses individual cells.
  df[cbind(row_idx[found], col_idx[found])] <- NA
}
df
# x y z
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
If we want to use rows_update
library(dplyr)
library(tidyr)
library(purrr)

# Turn every row of `to_na` into a one-row patch whose only non-key column is
# the target column, filled with NA.
lst1 <- to_na %>%
  mutate(new = NA_real_) %>%
  split(seq_len(nrow(.))) %>%
  map(~ pivot_wider(.x, names_from = col, values_from = new))

# Apply the patches to `df` one after another.
for (i in seq_along(lst1)) {
  df <- rows_update(df, lst1[[i]])
}
df
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 2 1 2
#3 3 NA 2

Using the value in one column to specify from which row to retrieve a value for a new column

I'm looking for an automated way of converting this:
# `c` holds the `a` key of the row whose `b` should be fetched (NA = none).
dat = tribble(
  ~a,  ~b, ~c,
  'x', 1,  'y',
  'y', 2,  NA,
  'q', 4,  NA,
  'z', 3,  'q'
)
to:
# Desired result: `d` is the `b` value of the row that `c` referred to.
tribble(
  ~a,  ~b, ~d,
  'x', 1,  2,
  'z', 3,  4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)

# Self-join: match each row's `c` against the `a` key of a lookup copy in
# which `b` has been renamed to `d`. Rows with an NA `c` find no partner.
dat %>%
  inner_join(
    select(dat, a, d = b),
    by = c('c' = 'a')
  )
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
# Base R self-join: `c` on the left is matched against `a` of a copy of
# `dat` without its third column.
merge(
  dat, dat[-3],
  by.x = 'c', by.y = 'a'
)
We create the 'd' with lead of 'b' and filter out the NA rows of 'c' and remove the c column with select
library(dplyr)

# Fetch `b` from the row whose `a` equals this row's `c`. A positional
# lead(b) is only right when the referenced row is the very next one; in
# `dat` the row 'q' comes before 'z', so lead() would leave z with d = NA.
# incomparables = NA keeps an NA `c` from matching an NA key.
dat %>%
  mutate(d = b[match(c, a, incomparables = NA)]) %>%
  filter(!is.na(c)) %>%
  select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
# Look `b` up by key (the row whose `a` equals `c`) rather than taking the
# next row with lead(b), which breaks when the referenced row is not directly
# below. Rows with an NA `c` get d = NA and are dropped by na.omit().
dat %>%
  mutate(d = b[match(c, a, incomparables = NA)], c = NULL) %>%
  na.omit()
Or with fill
library(tidyr)
# Copies `c` to `c1`, fills `c1` downwards, then takes the next row's `b`
# within each filled `c1` group and keeps the rows that had a non-NA `c`.
# NOTE(review): lead(b) is positional - it reads the following row of the
# group, not the row whose `a` equals `c`. In the example `dat` the row 'q'
# comes before 'z', so 'z' would get d = NA instead of 4 - TODO confirm
# against the intended row order.
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
# Look `b` up by key (the row whose `a` equals `c`) instead of shifting by
# one position, which fails when the referenced row is not the next one.
# setDT() converts `dat` by reference; the trailing [] prints the result.
setDT(dat)[, d := b[match(c, a, incomparables = NA)]][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both solutions are simple and don't require any joins. Besides, they give the expected output in the OP's post
Or using match from base R
# match() finds, for each `c`, the row whose `a` equals it; nomatch = 0 makes
# the unmatched rows vanish from the index. Column 3 (`c`) is dropped last.
cbind(
  na.omit(dat),
  d = with(dat, b[match(c, a, nomatch = 0)])
)[, -3]
# a b d
#1 x 1 2
#2 z 3 4

Resources