Find novel categories between groups - r

I am trying to identify which trees are different between two groups a & b across different forest types (type).
My dummy example:
# Forest type 1: group "a" has three species, group "b" has two.
dd1 <- data.frame(
  type = rep(1, times = 5),
  grp = rep(c("a", "b"), times = c(3, 2)),
  sp = c("oak", "beech", "spruce", "oak", "yew")
)
# Forest type 2: group "a" has one species, group "b" has two.
dd2 <- data.frame(
  type = rep(2, times = 3),
  grp = rep(c("a", "b"), times = c(1, 2)),
  sp = c("oak", "beech", "spruce")
)
# Stack both forest types into a single table.
dd <- rbind(dd1, dd2)
I can find unique species by each group (in reality, two groups: type & grp) by distinct:
# Unique species within every type/grp combination.
distinct(group_by(dd, type, grp), sp)
But instead I want to know which trees in group b are different from group a?
Expected output:
type grp sp
<dbl> <chr> <chr>
1 1 b yew # here, only `yew` is a new one; `oak` was previously listed in group `a`
2 2 b beech # both beech and spruce are new compared to group `a`
3 2 b spruce
How can I do this? Thank you!

The condition to filter is
library(dplyr)
# Within each forest type, keep group "b" rows whose species does not occur
# among that type's group "a" rows.
dd %>%
  group_by(type) %>%
  filter(grp == "b", !(sp %in% sp[grp == "a"])) %>%
  ungroup()
# # A tibble: 3 × 3
# type grp sp
# <dbl> <chr> <chr>
# 1 1 b yew
# 2 2 b beech
# 3 2 b spruce

You could try an anti_join:
library(dplyr)
library(tidyr)
# Drop every row whose (sp, type) pair also appears in group "a".
anti_join(
  dd,
  filter(dd, grp == "a"),
  by = c("sp", "type")
)
Output:
type grp sp
1 1 b yew
2 2 b beech
3 2 b spruce

Related

Replace a value in a data frame from other dataframe in r

Hi, I have two data frames. Where the id matches, I want to replace the values in table a with the values from table b.
sample dataset is here :
# Table whose `type` values should be updated.
a <- tibble(
  id = c(1, 2, 3),
  type = c("a", "x", "y")
)
# Replacement `type` values for ids 1 and 3.
b <- tibble(
  id = c(1, 3),
  type = c("d", "n")
)
I'm expecting an output like the following:
# Desired result: `type` replaced for ids 1 and 3, unchanged for id 2.
c <- tibble(
  id = c(1, 2, 3),
  type = c("d", "x", "n")
)
In dplyr v1.0.0, the rows_update() function was introduced for this purpose:
# Overwrite the values in `a` with those from the matching rows of `b`.
rows_update(x = a, y = b)
# Matching, by = "id"
# # A tibble: 3 x 2
# id type
# <dbl> <chr>
# 1 1 d
# 2 2 x
# 3 3 n
Here is an option using dplyr::left_join and dplyr::coalesce
library(dplyr)
a %>%
  # Keep the original values under a temporary name so the join does not clash.
  rename(old = type) %>%
  left_join(y = b, by = "id") %>%
  # Prefer the value from `b`; fall back to the original one, then drop the
  # temporary column.
  mutate(type = coalesce(type, old), old = NULL)
## A tibble: 3 × 2
# id type
#. <dbl> <chr>
#1 1 d
#2 2 x
#3 3 n
The idea is to join a with b on column id; then replace missing values in type from b with values from a (column old is the old type column from a, avoiding duplicate column names).

R: Group_by depending on condition

let's assume I have this simple dataframe:
# Two identical rows: a = 1, b = 2.
df <- tibble(a = rep(1, times = 2), b = rep(2, times = 2))
I now want to know how to use a group_by inside a pipeline that depends on a variable. Something like:
# Sketch of the intent only: pick the grouping columns based on `flag`.
flag <- T
resulting <- df %>%
filter(a > 0 & b >0) %>%
# NOTE: as written, ifelse() receives only `flag`, while `yes` and `no` are
# passed to group_by() as extra arguments, so this is pseudo-code.
group_by(ifelse(flag), yes = c(a), no = c(a, b))
That is, if flag == T, then I want to group only on column a. If flag is false, I want to group on both columns.
I think this worked for me
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned and would then silently change the branch taken below.
flag <- TRUE
resulting <- df %>%
  filter(a > 0 & b > 0) %>%
  # The braces stop the pipe from inserting the data as first argument, so
  # `.` can be passed explicitly to whichever group_by() call is chosen.
  {
    if (flag) group_by(., a) else group_by(., a, b)
  }
resulting
# A tibble: 2 × 2
# Groups: a [1] # <======== here grouped by a
a b
<dbl> <dbl>
1 1 2
2 1 2
by changing the flag
# Use the reserved constant FALSE; `F` is an ordinary variable that can be
# reassigned and would then silently change the branch taken below.
flag <- FALSE
resulting <- df %>%
  filter(a > 0 & b > 0) %>%
  # The braces stop the pipe from inserting the data as first argument, so
  # `.` can be passed explicitly to whichever group_by() call is chosen.
  {
    if (flag) group_by(., a) else group_by(., a, b)
  }
resulting
# A tibble: 2 × 2
# Groups: a, b [1] # <======== here grouped by a ,b
a b
<dbl> <dbl>
1 1 2
2 1 2

R: dplyr How to group by then filter rows based on the condition of each group's first row

I have a simple data frame such as
# Three x-groups (sizes 4, 3, 3) with a label column y.
df <- data.frame(
  x = rep(c(1, 2, 3), times = c(4, 3, 3)),
  y = c("a", "b", "a", "c", "e", "d", "e", "a", "f", "c")
)
I want to group by x, then if the first row of each x-groups has y == 'a', then get only rows that have y == 'a' | y == 'c'
So I expect the outcome would have row 1, 3, 4, 8, 10
Thank you very much.
After grouping by 'x', create an & condition - 1) check whether the first value of 'y' is 'a', 2) condition that checks for values 'a', 'c' in the column
library(dplyr)
df %>%
  group_by(x) %>%
  # Keep a group only when its first `y` is "a", and within such groups keep
  # only the rows whose `y` is "a" or "c".
  filter(first(y) == "a" & y %in% c("a", "c")) %>%
  ungroup()
-output
# A tibble: 5 × 2
x y
<dbl> <chr>
1 1 a
2 1 a
3 1 c
4 3 a
5 3 c
If we have additional rules, create a named list where the names will be expected first values of 'y' and the vector of values to be filtered, then extract the list element based on the first value of the 'y' and use that vector in the logical expression with %in%
df %>%
  group_by(x) %>%
  # The list maps an expected first value of `y` to the values to keep for
  # that group. as.character() forces a lookup by name: if `y` were a factor,
  # `[[` would index by its integer code and pick the wrong element or fail.
  # A first value with no entry yields NULL, so the whole group is dropped.
  filter(
    y %in% list(a = c("a", "c"), e = "e")[[as.character(first(y))]]
  ) %>%
  ungroup()
-output
# A tibble: 7 × 2
x y
<dbl> <chr>
1 1 a
2 1 a
3 1 c
4 2 e
5 2 e
6 3 a
7 3 c
Here is another dplyr option
# Console prompts removed so the snippet can be pasted and run directly.
df %>%
  # ave() broadcasts "is the first y of this x-group equal to 'a'?" to every
  # row of the group; combine that with the per-row value check.
  filter(y %in% c("a", "c") & ave(y == "a", x, FUN = first))
x y
1 1 a
2 1 a
3 1 c
4 3 a
5 3 c

Using the value in one column to specify from which row to retrieve a value for a new column

I'm looking for an automated way of converting this:
# `c` names the row (by its `a` value) to read `b` from; NA means no reference.
dat <- tribble(
  ~a, ~b, ~c,
  "x", 1, "y",
  "y", 2, NA,
  "q", 4, NA,
  "z", 3, "q"
)
to:
# Desired result: `d` holds the `b` value of the row referenced by `c`.
tribble(
  ~a, ~b, ~d,
  "x", 1, 2,
  "z", 3, 4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)
dat %>%
  inner_join(
    # Lookup table: the same data with `b` exposed as `d` and without `c`.
    dat %>% rename(d = b) %>% select(-c),
    by = c("c" = "a")
  )
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
# Self-merge: match `c` of the full table against `a` of the table without
# its third column.
merge(x = dat, y = dat[-3], by.x = "c", by.y = "a")
We create the 'd' with lead of 'b' and filter out the NA rows of 'c' and remove the c column with select
library(dplyr)
dat %>%
  # Look up `b` in the row whose `a` equals this row's `c`. A plain lead(b)
  # only works when the referenced row is the very next row; in `dat` the row
  # 'q' comes before 'z', so lead() would give NA instead of 4 for 'z'.
  mutate(d = b[match(c, a)]) %>%
  # Rows without a reference are not part of the result.
  filter(!is.na(c)) %>%
  select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
dat %>%
  # match() returns NA when `c` is NA, so `d` is NA exactly for the rows
  # without a reference and na.omit() removes them. A key lookup is used
  # instead of lead(b) because the referenced row need not be the next row
  # ('q' precedes 'z' in `dat`).
  mutate(d = b[match(c, a)], c = NULL) %>%
  na.omit()
Or with fill
library(tidyr)
# Copy `c`, fill the copy downwards, and take the next `b` within each filled
# group before dropping the rows that have no reference.
# NOTE(review): lead(b) reads the following row inside each `c1` group, so `d`
# is only the referenced row's `b` when that row comes directly after the
# referencing row. In the `dat` shown in the question 'q' precedes 'z', so
# `d` would be NA for 'z' - confirm the row order before relying on this.
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
# Steps: add `d` by reference via a key lookup, keep the rows that have a
# reference, drop `c`, and print. match(c, a) replaces a lead-shift of `b`,
# which only works when the referenced row is the next row ('q' precedes
# 'z' in `dat`).
setDT(dat)[, d := b[match(c, a)]][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both the solutions are simple and doesn't require any joins. Besides, it gives the expected output in the OP's post
Or using match from base R
# nomatch = 0 makes rows without a reference contribute nothing to the index,
# so the looked-up values line up with the rows kept by na.omit(); the third
# column (`c`) is dropped at the end.
cbind(
  na.omit(dat),
  d = dat$b[match(dat$c, dat$a, nomatch = 0)]
)[, -3]
# a b d
#1 x 1 2
#2 z 3 4

how to extract column names based on a condition?

Consider this simple example
# tibble() replaces the deprecated data_frame(); the columns are unchanged.
# Every value column (x, other_var, y, another_var) has a matching
# `label_<name>` column holding its display name.
mytest <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(NA, NA, NA, 5, 6, 7),
  other_var = c(NA, NA, NA, 1, 2, 3),
  y = c(3, 5, 6, NA, NA, NA),
  another_var = c(1, 2, 3, NA, NA, NA),
  label_x = c("hello", "hello", "hello", "world", "world", "world"),
  label_y = c("bada", "bada", "bada", "boom", "boom", "boom"),
  label_other_var = c("ak", "ak", "ak", "run", "run", "run"),
  label_another_var = c("noo", "noo", "noo", "bie", "bie", "bie")
)
# A tibble: 6 x 9
group x other_var y another_var label_x label_y label_other_var label_another_var
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 a NA NA 3 1 hello bada ak noo
2 a NA NA 5 2 hello bada ak noo
3 a NA NA 6 3 hello bada ak noo
4 b 5 1 NA NA world boom run bie
5 b 6 2 NA NA world boom run bie
6 b 7 3 NA NA world boom run bie
Here, I need to nest() this dataframe by group, and be able to extract the column names of the variables (in each nested dataframe) that are not NAs. The trick is that the actual name of the variable is shown in the label_ column
For instance, this is the output desired:
# A tibble: 4 x 2
group var
<chr> <chr>
1 a bada
2 a noo
3 b world
4 b run
Indeed, take group a. The only non-missing variables are y and another_var. However, the name of y is bada (as shown in the label_y variable) and the name of another_var is noo. The same reasoning applies to b.
I dont know how to do that with a map call after running
# One row per group, with the remaining columns packed into `data`.
nest(group_by(mytest, group))
# A tibble: 2 x 2
group data
<chr> <list>
1 a <tibble [3 x 8]>
2 b <tibble [3 x 8]>
Any ideas?
Thanks!
EDIT: the original, smaller, tibble proposed was the following
# Console continuation prompts removed so the snippet runs as pasted, and
# tibble() replaces the deprecated data_frame().
mytest <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(NA, NA, NA, 5, 6, 7),
  y = c(3, 5, 6, NA, NA, NA),
  label_x = c("hello", "hello", "hello", "world", "world", "world"),
  label_y = c("bada", "bada", "bada", "boom", "boom", "boom")
)
After grouping and nesting, loop through the 'data' column with map, summarise the 'label' columns by extracting the first element whose value column is not NA, gather the result into a single column while removing the NAs (na.rm = TRUE), select the 'var' column, and then unnest (after keeping only the columns of interest).
mytest %>%
  group_by(group) %>%
  nest() %>%
  mutate(var = map(data, ~
    .x %>%
      # For each label column, take the label from the first row whose value
      # column is present; this is NA when the value column is entirely NA.
      summarise(
        label_x = label_x[!is.na(x)][1],
        label_y = label_y[!is.na(y)][1]
      ) %>%
      # pivot_longer() replaces the superseded gather(); values_drop_na plays
      # the role of na.rm = TRUE.
      pivot_longer(
        everything(),
        names_to = "key",
        values_to = "var",
        values_drop_na = TRUE
      ) %>%
      select(var))) %>%
  select(-data) %>%
  # Name the list-column explicitly; unnest() without `cols` is deprecated.
  unnest(var)
# A tibble: 2 x 2#
# group var
# <chr> <chr>
#1 a bada
#2 b world
Update
If there are more columns, create unique column names and then loop through the corresponding column names with map2
# Value column names, derived by stripping the "label_" prefix. The pattern is
# anchored so that only a leading prefix is removed.
nm1 <- unique(sub("^label_", "", setdiff(names(mytest), "group")))
# Label column names, in the same order as `nm1`.
nm2 <- paste0("label_", nm1)
mytest %>%
  group_by(group) %>%
  nest() %>%
  mutate(var = map(data, function(d) {
    # Walk the value/label column pairs in parallel and take the label from
    # the first row whose value is present (NA when the value is all NA).
    # all_of() is required because `nm1`/`nm2` are external character vectors.
    labels <- map2_chr(
      select(d, all_of(nm1)),
      select(d, all_of(nm2)),
      function(value, label) label[!is.na(value)][1]
    )
    # Keep only the labels of variables that have data, as a plain vector.
    tibble(var = unname(labels[!is.na(labels)]))
  })) %>%
  select(-data) %>%
  # Name the list-column explicitly; unnest() without `cols` is deprecated.
  unnest(var)
# A tibble: 4 x 2
# group var
# <chr> <chr>
#1 a bada
#2 a noo
#3 b world
#4 b run
This will output the result you want:
# tibble() replaces the deprecated data_frame(); the columns are unchanged.
mytest <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(NA, NA, NA, 5, 6, 7),
  y = c(3, 5, 6, NA, NA, NA),
  label_x = c("hello", "hello", "hello", "world", "world", "world"),
  label_y = c("bada", "bada", "bada", "boom", "boom", "boom")
)
#' Pick the label of the populated variable for one group
#'
#' Looks at the rows of `df` belonging to `subgroup`. If `x` contains missing
#' values the label is read from `label_y`; otherwise, if `y` contains missing
#' values, it is read from `label_x`. The label is taken from the first row of
#' the subgroup.
#'
#' @param df A data frame with columns `group`, `x`, `y`, `label_x` and
#'   `label_y`.
#' @param subgroup A single value of `group` selecting the rows to inspect.
#' @return A one-row tibble with columns `group` and `var`.
extract_good_colnames <- function(df, subgroup) {
  # Explicit namespaces so stats::filter() is never picked up by accident.
  rows <- dplyr::filter(df, group == subgroup)
  if (anyNA(rows$x)) {
    colname <- "label_y"
  } else if (anyNA(rows$y)) {
    colname <- "label_x"
  } else {
    # Previously this fell through with `colname` undefined and failed with
    # "object 'colname' not found"; fail with an actionable message instead.
    stop(
      "Cannot choose a label for group '", subgroup,
      "': neither `x` nor `y` has missing values (or the group has no rows).",
      call. = FALSE
    )
  }
  # Extract the column as a vector first, so a factor label is converted to
  # its text rather than to its integer code.
  tibble::tibble(group = subgroup, var = as.character(rows[[colname]][1]))
}
# Distinct group values, in order of first appearance.
groups <- unique(mytest[["group"]])
# Run the extractor once per group and row-bind the results; each group value
# is passed as `subgroup` because `df` is supplied by name.
map_df(groups, extract_good_colnames, df = mytest)

Resources