Use result of subquery in conditional mutate - r

I want to compute a new row based on the result of a subquery in the same dataframe. Minimal (non) working example:
library(plyr)
library(dplyr)
df <- data.frame(
VAR1 = c("A", "A", "B", "C"),
VAR2 = c("F", "G", "E", "D"),
VAR3 = c("G", "F", "X", "D")
) %>% as_tibble
subquery <- function(v1, v2) {
dplyr::filter(df, as.character(v1) == VAR1, as.character(v2) == VAR2)
}
TEST <-
df %>%
mutate(X = case_when(
plyr::empty(subquery(VAR1, VAR3)) ~ "EMPTY",
TRUE ~ "NON EMPTY"
))
The result dataframe TEST should be
VAR1 VAR2 VAR3 X
<fctr> <fctr> <fctr> <chr>
A F G NON EMPTY
A G F NON EMPTY
B E X EMPTY
C D D NON EMPTY
but is
VAR1 VAR2 VAR3 X
<fctr> <fctr> <fctr> <chr>
A F G NON EMPTY
A G F NON EMPTY
B E X NON EMPTY
C D D NON EMPTY
Many thanks in advance!
remark: If I don't coerce the v1 and v2 to character I get the following error:
Error in mutate_impl(.data, dots) :
Evaluation error: Evaluation error: level sets of factors are different..

I would put the empty function within the subquery function in order to return TRUE or FALSE values. Then it would be possible to vectorise it in order to apply it to every row of your dataframe:
library(plyr)
library(dplyr)
df <- data.frame(
VAR1 = c("A", "A", "B", "C"),
VAR2 = c("F", "G", "E", "D"),
VAR3 = c("G", "F", "X", "D")
) %>% as_tibble
subquery <- function(v1, v2) {
empty(filter(df, as.character(v1) == VAR1, as.character(v2) == VAR2))
}
subquery = Vectorize(subquery)
df %>%
mutate(X = case_when(
subquery(VAR1, VAR3) == FALSE ~ "NON EMPTY",
TRUE ~ "EMPTY"
))
# # A tibble: 4 x 4
# VAR1 VAR2 VAR3 X
# <fct> <fct> <fct> <chr>
# 1 A F G NON EMPTY
# 2 A G F NON EMPTY
# 3 B E X EMPTY
# 4 C D D NON EMPTY
Or you can put empty and case_when within the subquery function like this:
subquery <- function(v1, v2) {
res = empty(filter(df, as.character(v1) == VAR1, as.character(v2) == VAR2))
case_when(res == FALSE ~ "NON EMPTY",
TRUE ~ "EMPTY")
}
subquery = Vectorize(subquery)
df %>% mutate(X = subquery(VAR1, VAR3))

Related

I´m looking for a way to inverse the values of 2 columns of a row when this inverse exists in the dataframe?

Here is my dataframe:
DF <- data.frame(
VAR1 = c("A", "A", "B", "B", "B", "C", "C"),
VAR2 = c("B", "C", "A", "D", "C", "B", "D"),
VAR3 = c(1, 1, 1, 2, 4, 6, 4)
)
I would like to have this:
VAR1 VAR2 VAR3
A B 2
A C 1
B D 2
B C 10
C D 4
If There is two rows like (VAR1=A, VAR2=B, VAR3=X) and (VAR2=B, VAR1=A, VAR3=Y), I want to have one row like this one (VAR1=A, VAR2=B, VAR3=X+Y). So if the two first variables are "inverse", I would like to have one row with the sum of them.
I tried to have a column which says "Yes" if two rows have inverse values but I can´t find a way to do it.
My code:
DF <- DF %>%
mutate(VAR4 = case_when(VAR2 %in% DF$VAR1 &
VAR1 %in%
(DF %>%
filter(VAR1 == VAR2) %>%
pull(VAR2)
) ~ "Yes",
TRUE ~ 'No' ))
`
This is the result:
VAR1 VAR2 VAR3 VAR4
A B 1 No
A C 1 No
B A 1 No
B D 2 No
B C 4 No
C B 6 No
C D 4 No
My code doesn´t work because my filter doesn´t take the result of VAR2 %in% DF$VAR1 in account.
Does someone have an idea?
You can sort first with apply, and then summarise:
DF[1:2] <- t(apply(DF[1:2], 1, sort))
DF %>%
group_by(VAR1, VAR2) %>%
summarise(VAR3 = sum(VAR3))
# A tibble: 5 × 3
# Groups: VAR1 [3]
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4
Or, in single pipe:
DF %>%
mutate(VAR = pmap(., ~ sort(c(..1, ..2)) %>%
set_names(., c("VAR1", "VAR2")))) %>%
group_by(VAR) %>%
summarise(VAR3 = sum(VAR3)) %>%
unnest_wider(VAR)
You could try:
library(dplyr)
DF %>%
mutate(across(VAR1:VAR2, as.character)) %>%
group_by(idx1 = pmin(VAR1, VAR2), idx2 = pmax(VAR1, VAR2)) %>%
summarise(VAR3 = sum(VAR3)) %>%
rename_with(~ sub('idx', 'VAR', .)) %>%
ungroup
Output:
# A tibble: 5 x 3
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4

stringr: replace string by supplying pattern through a character vector

Here is my data:
df <- tibble::tribble(
~A, ~B,
"C", "G",
"D", "H",
"E", "I",
"F", "J")
value1 <- "D"
value2 <- "C"
And, in variable A, I want to replace D and C with "m" and "n", something like this, but it's not working!
df %>% mutate(X = A %>% str_replace_all(c(value1 = "m", value2 = "n")))
My desired output is:
df %>% mutate(X = A %>% str_replace_all(c("D" = "m", "C" = "n")))
But instead of supplying "D" and "C" manually, I want to programmatically supply these, something in line with...using value1 and value2.
How should I do that?
You could try using setNames to set the names of m and n like:
library(dplyr)
library(stringr)
df %>% mutate(X = A %>% str_replace_all(setNames(c("m","n"), c(value1, value2))))
# A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
#1 C G n
#2 D H m
#3 E I E
#4 F J F
And then checking that it's equal to your desired result:
identical(
df %>% mutate(X = A %>% str_replace_all(c("D" = "m", "C" = "n"))),
df %>% mutate(X = A %>% str_replace_all(setNames(c("m","n"), c(value1, value2)))))
#[1] TRUE
I also included the other packages you use: dplyr and stringr
You can think of creating a named vector and use it as replacement vector.
replacementVector <- c("m","n")
names(replacementVector) <- c("D","C")
Now, use the replacementVector in dplyr chain along with ifelse as:
df %>% mutate(X = ifelse(is.na(replacementVector[A]), A, replacementVector[A]))
# # A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
# 1 C G n
# 2 D H m
# 3 E I E
# 4 F J F
Data:
library(tidyverse)
df <- tibble::tribble(
~A, ~B,
"C", "G",
"D", "H",
"E", "I",
"F", "J")
As is vectorized over string and replacement if you put all the values in the same vector you can just run
df %>% mutate(X = A %>% str_replace_all(c("C","D"), c("m","n")))
We could use chartr
df %>%
mutate(X = chartr('DC', 'mn', A))
# A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
#1 C G n
#2 D H m
#3 E I E
#4 F J F

Loop to Replace Matching Values

I'm looking for an easy and elegant way to accomplish this.
So if I have dataset x and relationship is A -> B -> Z -> Y and D -> H -> G, I would like to create dataset y. Unfortunately, they are not necessarily in order:
> x <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("B", "E", "Z", "H", "G", "Y")))
>
> y <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("Y", "E", "Y", "G", "G", "Y")))
>
> x
from to
1 A B
2 E E
3 B Z
4 D H
5 H G
6 Z Y
> y
from to
1 A Y
2 E E
3 B Y
4 D G
5 H G
6 Z Y
I have a fairly large dataset (currently 500k rows; will grow in the future) and actually care about the performance; I'm not sure if there are any other ways to do this without a for-loop or even to vectorize/parallelize the process.
I'm thinking about splitting and removing all rows where from == to or creating an indicator to skip certain rows so the loop does not have to go through the entire dataset each time.
I'd also like to know what the breakpoint should be if I do create a loop; I'm not sure how to define when the loop should stop.
Any suggestions would be appreciated. Thanks!
We can use dplyr to create a grouping variable by comparing the adjacent elements of 'to' and 'from' and change the values in 'to' the last element of 'to'
library(dplyr)
x %>%
group_by(grp = cumsum(lag(lead(from, default = last(from)) !=
as.character(to), default = TRUE))) %>%
mutate(to = last(to)) %>%
ungroup %>%
select(-grp)
# A tibble: 4 x 2
# from to
# <fctr> <fctr>
#1 A D
#2 B D
#3 C D
#4 E E
Another solution can be achieved using lag from dplyr and fill from tidyr as:
library(tidyverse)
x %>% arrange(from) %>%
mutate(samegroup = ifelse(from == lag(to), 1, 0)) %>%
mutate(group = ifelse(samegroup == 0 | is.na(samegroup), row_number(), NA)) %>%
fill(group) %>%
group_by(group) %>%
mutate(to = last(to)) %>%
ungroup() %>%
select(-samegroup, - group)
# A tibble: 6 x 2
# from to
# <chr> <chr>
#1 A D
#2 B D
#3 C D
#4 E E
#5 F H
#6 G H
Data used
x <- data.frame(from = as.character(c("A", "B", "F", "C", "G", "E")),
to = as.character(c("B", "C", "G", "D", "H", "E")),
stringsAsFactors = FALSE)

How to convert tidy hierarchical data frame to hierarchical list grid in R?

This is a more complex version of a previous question where I had abstracted the actual problem too much to apply the answers.
R convert tidy hierarchical data frame to hierarchical list
I've converted a hierarchical data frame with two grouping levels into a hierarchical list-grid using a for loop.
Is there a more efficient base R, tidyverse or other approach to achieve this?
In the real dataset:
The grouping variables and description are multi word strings.
The description preface - d# - is in the MWE for ease of checking.
There are 14 associated variables variously of type: character, integer and double
Rules
Group 1 and Group 2 headings to be in description column
Group 1 headings to appear once only
Group 2 heading are children of group 1 heading, and only change when there is a new group 2 heading
Descriptions are children of group 2 headings
From this
g1 g2 desc var1 var2 var3
A a d1 KS3 0.0500 2 PLs
A a d2 CTI 0.0500 9 7O0
A b d3 b8x 0.580 5 he2
A b d4 XOf 0.180 12 XJE
A b d5 ygn 0.900 11 v48
A c d6 dGY 0.770 6 UcH
A d d7 jpG 0.600 4 P5M
B d d8 Z95 0.600 10 j6O
To this
desc var1 var2 var3
A
a
d1 KS3 0.0500 2 PLs
d2 CTI 0.0500 9 7O0
b
d3 b8x 0.580 5 he2
d4 XOf 0.180 12 XJE
d5 ygn 0.900 11 v48
c
d6 dGY 0.770 6 UcH
d
d7 jpG 0.600 4 P5M
B
d
Code
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
tib
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>%
rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>%
mutate(id = NA_character_,
desc = NA_character_,
var1 = NA_real_,
var2 = NA_integer_,
var3 = NA_character_)
# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$id[[output_row]] <- "g1"
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$id[[output_row]] <- "g2"
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data grid
output$desc[[output_row]] <- tib$desc[[i]]
output$var1[[output_row]] <- tib$var1[[i]]
output$var2[[output_row]] <- tib$var2[[i]]
output$var3[[output_row]] <- tib$var3[[i]]
output_row <- output_row + 1
}
output
Adapting the answer from tyluRp R convert tidy hierarchical data frame to hierarchical list I've hit on a solution.
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
# add unique identifier for description and variable rows
tib <-
tib %>%
rowid_to_column() %>%
mutate(rowid = paste0("z_", rowid))
# separate tibble for variables associated with descriptions
tib_var <-
tib %>%
select(rowid, var1, var2, var3)
# code adapted from tyluRp to reorder the data and add description variables
tib <-
tib %>%
select(g1, g2, desc, rowid) %>%
mutate(g2 = paste(g1, g2, sep = "_")) %>%
transpose() %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^z_"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values))%>%
drop_na(values) %>%
select(values, ind) %>%
mutate(values = str_remove(values, "\\D_")) %>%
left_join(tib_var, by = c("ind" = "rowid")) %>%
select(-ind) %>%
replace_na(list(var1 = "", var2 = "", var3 = ""))

How to assign a value to a data.frame filtered by dplyr?

I am trying to modify a data.frame filtered by dplyr but I don't quite seem to grasp what I need to do. In the following example, I am trying to filter the data frame z and then assign a new value to the third column -- I give two examples, one with "9" and one with "NA".
require(dplyr)
z <- data.frame(w = c("a", "a", "a", "b", "c"), x = 1:5, y = c("a", "b", "c", "d", "e"))
z %>% filter(w == "a" & x == 2) %>% select(y)
z %>% filter(w == "a" & x == 2) %>% select(y) <- 9 # Should be similar to z[z$w == "a" & z$ x == 2, 3] <- 9
z %>% filter(w == "a" & x == 3) %>% select(y) <- NA # Should be similar to z[z$w == "a" & z$ x == 3, 3] <- NA
Yet, it doesn't work: I get the following error message:
"Error in z %>% filter(w == "a" & x == 3) %>% select(y) <- NA : impossible de trouver la fonction "%>%<-"
I know that I can use the old data.frame notation, but what would be the solution for dplyr?
Thanks!
Filtering will subset the data frame. If you want to keep the whole data frame, but modify part of it, you can, for example use mutate with ifelse. I've added stringsAsFactors=FALSE to your sample data so that y will be a character column.
z <- data.frame(w = c("a", "a", "a", "b", "c"), x = 1:5, y = c("a", "b", "c", "d", "e"),
stringsAsFactors=FALSE)
z %>% mutate(y = ifelse(w=="a" & x==2, 9, y))
w x y
1 a 1 a
2 a 2 9
3 a 3 c
4 b 4 d
5 c 5 e
Or with replace:
z %>% mutate(y = replace(y, w=="a" & x==2, 9),
y = replace(y, w=="a" & x==3, NA))
w x y
1 a 1 a
2 a 2 9
3 a 3 <NA>
4 b 4 d
5 c 5 e
It is my impression that the dplyr package is philosophically opposed to modifying your underlying data. You might find the data.table package friendlier for this operation:
library(data.table)
z <- data.table(w = c("a", "a", "a", "b", "c"), x = 1:5, y = c("a", "b", "c", "d", "e"))
m <- data.table(w = c("a","a"), x = c(2,3), new_y = c("9", NA))
z[m, y := new_y, on=c("w","x")]
w x y
1: a 1 a
2: a 2 9
3: a 3 NA
4: b 4 d
5: c 5 e
I'm sure there's a way in base R as well, but I don't know it. In particular, I can't get merge or match to do the job.

Resources