Change value in grouping based on condition - r

I'm starting with the following data:
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"), Day=c(1,2,2,1,1,2), Fruit=c("Apple","X","Apple","X","X","Orange"))
Person Day Fruit
1 Ada 1 Apple
2 Ada 2 X
3 Bob 2 Apple
4 Bob 1 X
5 Carl 1 X
6 Carl 2 Orange
And I want to loop through every person and replace the unknown fruit X with either Apple or Orange while making sure that if it's Orange one day, it should be Apple the next day, and vice versa.
For Ada: Day 1 = Apple, meaning Day 2 = X <- Orange
I don't know where to start other than:
library(dplyr)
df %>%
group_by(Person)
any suggestions for direction?

Another solution using case_when from dplyr:
library(dplyr)
# Changing datatypes to character instead of factor
df[] <- lapply(df, as.character)
# Optional, but this line will convert all columns to appropriate datatype, eg. Day will be integer
df <- readr::type_convert(df)
df %>%
group_by(Person) %>%
mutate(
Contains_Apple = any(Fruit == "Apple"),
Contains_Orange = any(Fruit == "Orange"),
Fruit = case_when(
Fruit == "X" & Contains_Apple == F ~ "Apple",
Fruit == "X" & Contains_Orange == F ~ "Orange",
TRUE ~ Fruit
)
)
# A tibble: 6 x 5
# Groups: Person [3]
Person Day Fruit Contains_Apple Contains_Orange
<chr> <int> <chr> <lgl> <lgl>
1 Ada 1 Apple T F
2 Ada 2 Orange T F
3 Bob 2 Apple T F
4 Bob 1 Orange T F
5 Carl 1 Apple F T
6 Carl 2 Orange F T
Remove the Contains_Apple and Contains_Orange by:
df %>%
group_by(Person) %>%
mutate(Contains_Apple = any(Fruit == "Apple"),
Contains_Orange = any(Fruit == "Orange"),
Fruit = case_when(Fruit == "X" & Contains_Apple == F ~ "Apple",
Fruit == "X" & Contains_Orange == F ~ "Orange",
TRUE ~ Fruit)) %>%
select(Person, Day, Fruit) %>%
ungroup()
# A tibble: 6 x 3
Person Day Fruit
<chr> <int> <chr>
1 Ada 1 Apple
2 Ada 2 Orange
3 Bob 2 Apple
4 Bob 1 Orange
5 Carl 1 Apple
6 Carl 2 Orange

Here is one idea using case_when to check if each group already has "Apple" or "Orange", and then assign the opposite value if Fruit is "X".
Notice that I added stringsAsFactors = FALSE when creating the example data frame, which aims to avoid the creation of factor columns.
library(dplyr)
library(tidyr)
df %>%
group_by(Person) %>%
mutate(Fruit = case_when(
Fruit %in% "X" & any(Fruit %in% "Apple") ~ "Orange",
Fruit %in% "X" & any(Fruit %in% "Orange") ~ "Apple",
TRUE ~ Fruit
)) %>%
ungroup()
# # A tibble: 6 x 3
# Person Day Fruit
# <chr> <dbl> <chr>
# 1 Ada 1.00 Apple
# 2 Ada 2.00 Orange
# 3 Bob 2.00 Apple
# 4 Bob 1.00 Orange
# 5 Carl 1.00 Apple
# 6 Carl 2.00 Orange
DATA
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"),
Day=c(1,2,2,1,1,2),
Fruit=c("Apple","X","Apple","X","X","Orange"),
stringsAsFactors = FALSE)

Simple with looping:
fruity_loop <- function(frame) {
ops <- c('Apple', 'Orange')
for(x in 1:nrow(frame)) {
if(frame[x,]['Fruit'] == 'X') {
if(frame[x-1,]['Fruit'] == ops[1]) { frame[x,]['Fruit'] <- ops[2] } else { frame[x,]['Fruit'] <- ops[1] } }
}
return(frame)
}
Example:
fruity_loop(df)

Related

How do I replace values in certain columns conditional on a certain value in corresponding columns?

I have the following data frame:
`1_X94` <- c("apple", "lemon", "orange")
`2_X94` <- c("apple", "strawberry", "lemon")
`1_X09` <- c(1, 2, 3)
`2_X09` <- c(4, 5, 6)
`1_X38` <- c("red", "yellow", "orange")
`2_X38` <- c("red", "red", "yellow")
df <- data.frame(`1_X94`, `2_X94`, `1_X09`, `2_X09`, `1_X38`, `2_X38`)
And I have a second data frame:
fruit <- c("apple", "watermelon")
fruit_list <- data.frame(fruit)
What I would like to accomplish is, whenever there is a column name with the regex pattern of ^\d+_X94? with a value that matches the fruit_list data frame, it replaces the column name with the regex pattern of ^\d+_X38 with the word "green."
I currently have the following code, but I want to add some of the automated aspects so I don't have to list all the fruits in the str_detect() and create multiple mutate commands for X1, X2, etc.
library(tidyverse)
library(stringr)
df <- df %>%
mutate(
X1_X38 = case_when(
str_detect(X1_X94, "apple|watermelon") ~ "green",
TRUE ~ .$X1_X38
)
) %>%
mutate(
X2_X38 = case_when(
str_detect(X2_X94, "apple|watermelon") ~ "green",
TRUE ~ .$X2_X38
)
)
Any guidance would be appreciated.
We can use across
library(dplyr)
library(stringr)
df %>%
mutate(across(ends_with('_X38'),
~ if(all(is.na(.x))) NA_character_ else
case_when(get(str_replace(cur_column(), "_X38$", "_X94")) %in%
fruit ~ "green", TRUE ~ .x)))
-output
X1_X94 X2_X94 X1_X09 X2_X09 X1_X38 X2_X38
1 apple apple 1 4 green green
2 lemon strawberry 2 5 yellow red
3 orange lemon 3 6 orange yellow
This does not feel like the most efficient way, but here is an option:
library(tidyverse)
df|>
mutate(row = row_number()) |>
pivot_longer(names_pattern = "(X\\d)_(X\\d+)",
names_to = c("X1", "X2"),
values_transform = as.character,
cols = -row)|>
pivot_wider(names_from = X2, values_from = value) |>
mutate(X38 = ifelse(X94 %in% fruit_list$fruit, "green", X38)) |>
pivot_longer(c(X38,X09, X94)) |>
pivot_wider(names_from = c(X1, name),
names_glue = "{X1}_{name}",
values_from = value)
#> # A tibble: 3 x 7
#> row X1_X38 X1_X09 X1_X94 X2_X38 X2_X09 X2_X94
#> <int> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 green 1 apple green 4 apple
#> 2 2 yellow 2 lemon red 5 strawberry
#> 3 3 orange 3 orange yellow 6 lemon
EDIT
This feels a little cleaner:
library(tidyverse)
#helper
col_split <- function(dat){
list(
dat[,grepl("X1_", colnames(dat))],
dat[,grepl("X2_", colnames(dat))]
)
}
df |>
col_split() |>
map_dfc(\(x) mutate(x, across(ends_with("X38"),
\(y) ifelse(x[,grepl("X94", colnames(x))] %in% fruit_list$fruit,
"green", y))))
#> X1_X94 X1_X09 X1_X38 X2_X94 X2_X09 X2_X38
#> 1 apple 1 green apple 4 green
#> 2 lemon 2 yellow strawberry 5 red
#> 3 orange 3 orange lemon 6 yellow

Compare overlap of groups pairwise using tidyverse

I have a tidy data.frame in this format:
library(tidyverse)
df = data.frame(name = c("Clarence","Clarence","Clarence","Shelby","Shelby", "Patricia","Patricia"), fruit = c("Apple", "Banana", "Grapes", "Apple", "Apricot", "Banana", "Grapes"))
df
# name fruit
#1 Clarence Apple
#2 Clarence Banana
#3 Clarence Grapes
#4 Shelby Apple
#5 Shelby Apricot
#6 Patricia Banana
#7 Patricia Grapes
I want to compare the overlaps between groups in a pairwise manner (i.e. if both people have an apple that counts as an overlap of 1) so that I end up with a dataframe that looks like this:
df2 = data.frame(names = c("Clarence-Shelby", "Clarence-Patricia", "Shelby-Patricia"), n_overlap = c(1, 2, 0))
df2
# names n_overlap
#1 Clarence-Shelby 1
#2 Clarence-Patricia 2
#3 Shelby-Patricia 0
Is there an elegant way to do this in the tidyverse framework? My real dataset is much larger than this and will be grouped on multiple columns.
If the 0 overlap is not important, a solution is:
> df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
If you really need non-overlapping pairs:
> a = df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
> b = as.data.frame(t(combn(sort(unique(df$name,2)),2)))
> colnames(b)=colnames(a)[1:2]
> a %>% full_join(b) %>% replace_na(list(n=0))
Joining, by = c("name.x", "name.y")
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
3 Patricia Shelby 0
Try this,
combinations <- apply(combn(unique(df$name), 2), 2, function(z) paste(sort(z), collapse = "-"))
combinations
# [1] "Clarence-Shelby" "Clarence-Patricia" "Patricia-Shelby"
library(dplyr)
df %>%
group_by(fruit) %>%
summarize(names = paste(sort(unique(name)), collapse = "-")) %>%
right_join(tibble(names = combinations), by = "names") %>%
group_by(names) %>%
summarize(n_overlap = sum(!is.na(fruit)))
# # A tibble: 3 x 2
# names n_overlap
# <chr> <int>
# 1 Clarence-Patricia 2
# 2 Clarence-Shelby 1
# 3 Patricia-Shelby 0

Separate multi-value obs with pairs of values and count

I have a data frame combining single and multi-values obs.
dataset <- c("Apple;Banana;Kiwi", "orange", "Apple;Banana", "orange" )
dataset <- as.data.frame(dataset)
My output :
dataset
1 Apple;Banana;Kiwi
2 orange
3 Apple;Banana
4 orange
What I want : separate by pairs all the combinaisons of values into 2 columns and count to make a graph
from |to |weight
Apple |Banana|2
Apple | Kiwi | 1
Banana| Kiwi | 1
orange|NA |2
What I tried :
dataset2 <- dataset %>%
separate_rows(dataset, sep = ";")
We may use combn on each row and get the frequency
stack(table(unlist(lapply(strsplit(dataset$dataset, ";"),
function(x) if(length(x) > 1) combn(x, 2, FUN = toString) else x))))[2:1]
-output
ind values
1 Apple, Banana 2
2 Apple, Kiwi 1
3 Banana, Kiwi 1
4 orange 2
You could do:
library(dplyr)
result <-
do.call(rbind, lapply(strsplit(dataset$dataset, ';'), function(x) {
if(length(x) == 1) return(c(x, NA_character_))
do.call(rbind, lapply(1:(length(x) - 1), function(i) c(x[i], x[i+1])))
}))
as.data.frame(table(paste(result[,1], result[,2]))) %>%
tidyr::separate(Var1, into = c('from', 'to'), sep = ' ') %>%
mutate(to = ifelse(to == 'NA', NA, to),
weight = Freq) %>%
select(-Freq)
#> from to weight
#> 1 Apple Banana 2
#> 2 Banana Kiwi 1
#> 3 orange <NA> 2
Another possible solution:
library(tidyverse)
pmap(dataset, ~ if (str_detect(.x, ";"))
{combn(.x %>% str_split(";") %>% unlist, 2, str_c, collapse=";")} else {.x}) %>%
map_dfr(data.frame) %>%
separate(1, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> from to weight
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2
Or without purrr:
library(tidyverse)
dataset %>%
rowwise %>%
mutate(from = ifelse(str_detect(dataset, ";"), combn(dataset %>%
str_split(";") %>% unlist, 2, str_c, collapse=";") %>% list,
list(dataset))) %>%
unnest_longer(from) %>%
separate(from, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> # A tibble: 4 × 3
#> from to weight
#> <chr> <chr> <int>
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2

How to subset list by ID for specfic output

If I say I have a list of people who used cafeteria.
Fruits ID Date
apple 1 100510
apple 2 100710
banana 2 110710
banana 1 120910
kiwi 2 120710
apple 3 100210
kiwi 3 110810
I want to select people who have took both apple and banana and my new dataset to contain people who qualify for this inclusion criteria and give:
ID
1
2
(because only ID 1 and 2 had both apple and banana in the dataset)
what code should I use in R?
In base R you could do something like
data.frame(ID = names(which(sapply(split(df$Fruits, df$ID), function(x) {
"apple" %in% x & "banana" %in% x
}))))
#> ID
#> 1 1
#> 2 2
This will give you the name of the IDs that contain both "apple" and "banana"
If you want the subset of the data frame containing these rows you can do:
df[df$ID %in% names(which(sapply(split(df$Fruits, df$ID), function(x) {
"apple" %in% x & "banana" %in% x
}))),]
#> Fruits ID Date
#> 1 apple 1 100510
#> 2 apple 2 100710
#> 3 banana 2 110710
#> 4 banana 1 120910
#> 5 kiwi 2 120710
You can use dplyr package to check if any of the entries is an apple AND any of the entries is a banana per ID:
library(dplyr)
df <- data.frame(Fruits = c("apple", "apple", "banana", "banana", "kiwi", "apple", "kiwi"),
ID = c(1,2,2,1,2,3,3),
Date = c(100510,100710,100710,120910, 120710,100210,110810))
df %>%
group_by(ID) %>%
filter(any(Fruits == "apple") & any(Fruits == "banana")) %>%
ungroup() %>%
select(ID) %>%
distinct()
In this case, the result is
# A tibble: 2 x 1
ID
<dbl>
1 1
2 2

Why can't case_when return different-length vectors?

This fails:
library(tidyverse)
myFn <- function(nmbr){
case_when(
nmbr > 3 ~ letters[1:3],
TRUE ~ letters[1:2]
)
}
myFn(4)
# Error: `TRUE ~ letters[1:2]` must be length 3 or one, not 2
# Run `rlang::last_error()` to see where the error occurred.
Why does it fail? Why is case_when built in such a way that its branches can't return different-length vectors? I'd like myFn to work so that I can do things like:
tibble(fruit = c("apple", "grape"),
count = 3:4) %>%
mutate(bowl = myFn(count)) %>%
unnest(col = "bowl")
and get
# A tibble: 5 x 3
fruit count bowl
<chr> <int> <int>
1 apple 3 a
2 apple 3 b
3 grape 4 a
4 grape 4 b
5 grape 4 c
I can get it to work - by writing a non-vectorized myFn using if/else, then wrapping it in map, but why should I have to?
Per my comments, your function needs to return one element for each row of input. However, each of those elements can be a list of length 0 or more (and arbitrary complexity). Try this:
myFn <- function(nmbr){
case_when(
nmbr > 3 ~ list(letters[1:3]),
TRUE ~ list(letters[1:2])
)
}
tibble(fruit = c("apple", "grape"),
count = 3:4) %>%
mutate(bowl = myFn(count))
# # A tibble: 2 x 3
# fruit count bowl
# <chr> <int> <list>
# 1 apple 3 <chr [2]>
# 2 grape 4 <chr [3]>
tibble(fruit = c("apple", "grape"),
count = 3:4) %>%
mutate(bowl = myFn(count)) %>%
unnest(col = "bowl")
# # A tibble: 5 x 3
# fruit count bowl
# <chr> <int> <chr>
# 1 apple 3 a
# 2 apple 3 b
# 3 grape 4 a
# 4 grape 4 b
# 5 grape 4 c
Extending r2Evans answer, you can unlist prior to returning. Here is my code that I am using as part of my nesting algorithm.
test_fn <- function(StockCode) {
L <- dplyr::case_when(stringr::str_starts(StockCode, "A") ~ list(7500),
stringr::str_starts(StockCode, "FL") ~ list(6000),
stringr::str_starts(StockCode, "PIPE") ~ list(6500),
stringr::str_starts(StockCode, "RHS") ~
list(c(8000, 12000)),
stringr::str_starts(StockCode, "RND") ~ list(6000),
stringr::str_starts(StockCode, "SQ") ~ list(6000),
TRUE ~
list(c(9000, 10500, 12000, 13500, 15000, 16500, 18000))) %>%
unlist()
return(L)
}
test_fn("RHS")
test_fn("CH")
The output will be the vector assigned to the list.

Resources