Edited in response to #akrun's insight:
This works:
require("magrittr")
requireNamespace("dplyr")
df <- data.frame(a = 1:5)
b_column <- c_column <- "a"
df %>% dplyr::mutate(
b = !!dplyr::sym(b_column),
c = !!dplyr::sym(c_column))
But when any one of the *_columns is NULL it doesn't:
c_column <- NULL
df %>% dplyr::mutate(
b = !!dplyr::sym(b_column),
c = !!dplyr::sym(c_column))
The resulting error is:
Error: Only strings can be converted to symbols
Run `rlang::last_error()` to see where the error occurred.
How would I make the call to ANY of the ensymboled *_column variables resilient to it being NULL?
If we need to check for NULL case, use an if condition
df1 <- if(!is.null(c)) {
df %>%
dplyr::mutate(b = !!dplyr::sym(c))
} else df
With multiple columns, an option is map
library(purrr)
b_column <- c_column <- "a"
map2_dfc(list(b_column, c_column), c("b", "c"), ~
if(!is.null(.x)) df %>%
transmute(!! .y := !! sym(.x))) %>%
bind_cols(df, .)
-output
# a b c
#1 1 1 1
#2 2 2 2
#3 3 3 3
#4 4 4 4
#5 5 5 5
If one of them is NULL
c_column <- NULL
map2_dfc(list(b_column, c_column), c("b", "c"), ~
if(!is.null(.x)) df %>%
transmute(!! .y := !! sym(.x))) %>%
bind_cols(df, .)
# a b
#1 1 1
#2 2 2
#3 3 3
#4 4 4
#5 5 5
Another option is mutate with across, but make sure that we need to rename only the columns that are not NULL
nm1 <- c("b", "c")
i1 <- !map_lgl(list(b_column, c_column), is.null)
nm2 <- nm1[i1]
df %>%
mutate(across(all_of(c(b_column, c_column)), ~ .)) %>%
rename_at(vars(everything()), ~ nm2) %>%
bind_cols(df, .)
Related
I wrote a code that applies various filters ('a','b', and 'c') that may result into an empty dataframe. If empty rows is true, then filters must be dropped one-by-one (from 'c' to 'a') until finding a non-empty dataframe. Can someone write that code below more elegantly? The code is:
library(dplyr)
df <- data.frame(a = 1:10, b = letters[1:10]) %>% mutate(c= str_c(a,b))
a.selected <- 1:5
b.selected <- letters[2:5]
c.selected <- c('10j')
filtered <- df %>%
filter(a %in% a.selected &
b %in% b.selected &
c %in% c.selected)
if(nrow(filtered)==0) {
filtered1 <- df %>%
filter(a %in% a.selected &
b %in% b.selected)
filtered <- filtered1
} else {
if(nrow(filtered1)==0) {
filtered2 <- df %>%
filter(a %in% a.selected)
filtered <- filtered2
} else {
if(nrow(filtered2)==0) {
filtered3 <- df
filtered <- filtered3
}
}
}
filtered
a b c
1 2 b 2b
2 3 c 3c
3 4 d 4d
4 5 e 5e
I'm not sure it's elegant, but this is considerably shorter:
selections <- list(a.selected, b.selected, c.selected)
combos <- Reduce(`&`, Map(`%in%`, df, selections), accumulate = TRUE)
df %>%
filter(combos[[max(which(sapply(combos, any)))]])
#> a b c
#> 1 2 b 2b
#> 2 3 c 3c
#> 3 4 d 4d
#> 4 5 e 5e
An option with tidyverse would be
library(dplyr)
library(purrr)
library(stringr)
df %>%
mutate(ind = across(everything(),
~ .x %in% get(str_c(cur_column(), ".selected"))) %>%
accumulate(`&`) %>%
keep(any) %>%
tail(1) %>%
names) %>%
filter(cur_data()[[first(ind)]] %in% get(str_c(first(ind), ".selected"))) %>%
select(-ind)
a b c
1 2 b 2b
2 3 c 3c
3 4 d 4d
4 5 e 5e
I have a dataset looking like that:
set.seed(123)
test_data <- data.frame(
id = c("a", "b", "c", "d", "e"),
x = sample(c(0,1), 5, replace = T),
y = sample(c(0,1), 5, replace = T)
)
> test_data
id x y
1 a 0 1
2 b 0 1
3 c 0 1
4 d 1 0
5 e 0 0
For the columns x and y, if the value is equal to 1, the value is replaced by the name of the column. In my example, I would like to have:
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>
The thing that I don't know how many columns should be treated this way. Basically, I know that the first column ("id") is not concerned, but after this column, I could have any number of columns (even 0) that need to be treated this way.
I tried something like that but it doesn't work:
library(dplyr)
test_data %>%
mutate(
across(
.cols = 1:last_col(),
.funs = function(x) {
ifelse(x == 1, as.character(x), NA)
}
)
)
How can I do that? A dplyr answer is preferred.
You can do this way also:
library(tidyverse)
## define a function for your job
fn <- function(x, name){
return(ifelse(x ==1, name, NA))
}
test_data %>%
select(-id) %>%
map2_dfr(., names(.), ~fn(.x, .y)) %>%
bind_cols('id'= test_data$id, .)
Another version could be:
fn <- function(x, name){
if(name != 'id'){
return(ifelse(x ==1, name, NA))
} else {
return(x)
}
}
test_data %>%
map2_dfr(., names(.), ~fn(.x, .y))
Here is a try using purrr
library(dplyr)
library(purrr)
col_to_fix <- names(test_data)[2:length(test_data)]
walk(.x = col_to_fix, .f = function(x) {
# Note that I used <<- assigment here to change the test_data in global
test_data[[x]] <<- case_when(
test_data[[x]] == 1 ~ x,
TRUE ~ NA_character_
)
})
Output
> test_data
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>
There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6
I'll illustrate my question with an example.
Sample data:
df <- data.frame(ID = c(1, 1, 2, 2, 3, 5), A = c("foo", "bar", "foo", "foo", "bar", "bar"), B = c(1, 5, 7, 23, 54, 202))
df
ID A B
1 1 foo 1
2 1 bar 5
3 2 foo 7
4 2 foo 23
5 3 bar 54
6 5 bar 202
What I want to do is to summarize, by ID, the sum of B and the sum of B when A is "foo". I can do this in a couple steps like:
require(magrittr)
require(dplyr)
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B))
df2 <- df %>%
filter(A == "foo") %>%
group_by(ID) %>%
summarize(sumBfoo = sum(B))
left_join(df1, df2)
ID sumB sumBfoo
1 1 6 1
2 2 30 30
3 3 54 NA
4 5 202 NA
However, I'm looking for a more elegant/faster way, as I'm dealing with 10gb+ of out-of-memory data in sqlite.
require(sqldf)
my_db <- src_sqlite("my_db.sqlite3", create = T)
df_sqlite <- copy_to(my_db, df)
I thought of using mutate to define a new Bfoo column:
df_sqlite %>%
mutate(Bfoo = ifelse(A=="foo", B, 0))
Unfortunately, this doesn't work on the database end of things.
Error in sqliteExecStatement(conn, statement, ...) :
RS-DBI driver: (error in statement: no such function: IFELSE)
You can do both sums in a single dplyr statement:
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B),
sumBfoo = sum(B[A=="foo"]))
And here is a data.table version:
library(data.table)
dt = setDT(df)
dt1 = dt[ , .(sumB = sum(B),
sumBfoo = sum(B[A=="foo"])),
by = ID]
dt1
ID sumB sumBfoo
1: 1 6 1
2: 2 30 30
3: 3 54 0
4: 5 202 0
Writing up #hadley's comment as an answer
df_sqlite %>%
group_by(ID) %>%
mutate(Bfoo = if(A=="foo") B else 0) %>%
summarize(sumB = sum(B),
sumBfoo = sum(Bfoo)) %>%
collect
If you want to do counting instead of summarizing, then the answer is somewhat different. The change in code is small, especially in the conditional counting part.
df1 <- df %>%
group_by(ID) %>%
summarize(countB = n(),
countBfoo = sum(A=="foo"))
df1
Source: local data frame [4 x 3]
ID countB countBfoo
1 1 2 1
2 2 2 2
3 3 1 0
4 5 1 0
If you wanted to count the rows, instead of summing them, can you pass a variable to the function:
df1 <- df %>%
group_by(ID) %>%
summarize(RowCountB = n(),
RowCountBfoo = n(A=="foo"))
I get an error both with n() and nrow().