I have a data frame that looks like this:
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, gorup2_c_xx = 14, stringsAsFactors = FALSE)
group1_a_mu group1_b_sd group1_c_xx group2_a_mu group2_b_sd gorup2_c_xx
1 10 4 5 1 2 14
and I would like to transform it to this:
mu sd xx
group1 10 4 5
group2 1 2 14
how can one do that?
You could try the following (based on the data from the original post):
library(dplyr)
library(tidyr)
data.frame(group1_a = 10, group1_b = 4, group1_c = 5, group2_a=1, group2_b=2, group2_c = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val)
## group_name a b c
## 1 group1 10 4 5
## 2 group2 1 2 14
For the situation where there are 2 _ characters (updated post), the following approach temporarily modifies the _ character. The alternative is to use the look ahead or look behind operators in the separate regular expression (sep).
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
mutate(key = sub('_', '|', key)) %>% ## Temporary change of '_' to '|'
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val) %>%
mutate(group_name = sub('[|]', '_', group_name)) ## Change back to '_'
## group_name mu sd xx
## 1 group1_a 10 NA NA
## 2 group1_b NA 4 NA
## 3 group1_c NA NA 5
## 4 group2_a 1 NA NA
## 5 group2_b NA 2 NA
## 6 group2_c NA NA 14
Using the positive look behind operator will give the same results.
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '(?<=[a-z])_') %>%
spread(subgroup_name, val)
Related
Is there a way to pass a formula expression into mutate to create a new variable.
expression1 <- formula(ifelse(Var1 == 9, 0, Var1))
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE)) %>%
mutate(new_var = expression1)
If it is a string, then we can parse it
library(dplyr)
expression1 <- 'ifelse(Var1 == 9, 0, Var1)'
df %>%
mutate(new_var = eval(rlang::parse_expr(expression1)))
-output
# Var1 Var2 new_var
#1 7 7 7
#2 5 7 5
#3 5 2 5
#4 4 6 4
#5 7 9 7
#6 9 3 0
#7 9 5 0
#8 2 9 2
#9 9 2 0
#10 10 9 10
data
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE))
rlang prefers to use expressions or quosures rather than formulas. It would be better to use
expression1 <- rlang::expr(ifelse(Var1 == 9, 0, Var1))
df <- data.frame(Var1 = sample(1:10, 10, replace = TRUE),
Var2 = sample(1:10, 10, replace = TRUE)) %>%
mutate(new_var = !!expression1)
First you create the expression with the expr() function. Then you "inject" that into the dplyr expression you want to call with !!
I have a numeric vector with names following a pattern. The name for each element consists of two parts. There are a fixed number of variations on the first part and a fixed number of variations on the second part per the below.
x <- c(2, 4, 3, 7, 6, 9)
names(x) <- c("a.0", "b.0", "c.0", "a.1", "b.1", "c.1")
From this I want to create and print a table where the first part of the names is the rows and the second part the columns per the below.
a b c
0 2 4 3
1 7 6 9
Here are some possibilities. The first 3 only use base R.
1) tapply Use tapply with the row and column parts specified in the second argument.
nms <- names(x)
tapply(x, list(row = sub(".*\\.", "", nms), col = sub("\\..*", "", nms)), c)
giving the following matrix with the indicated row and column names.
col
row a b c
0 2 4 3
1 7 6 9
2) xtabs Another possibility is to use xtabs:
dnms <- read.table(text = names(x), sep = ".", as.is = TRUE,
col.names = c("col", "row"))[2:1]
xtabs(x ~ ., dnms)
giving this xtabs/table object:
col
row a b c
0 2 4 3
1 7 6 9
3) reshape
long <- cbind(x, read.table(text = names(x), sep = ".", as.is = TRUE,
col.names = c("col", "row")))
r <- reshape(long, dir = "wide", idvar = "row", timevar = "col")[-1]
dimnames(r) <- lapply(long[3:2], unique)
r
giving this data.frame:
a b c
0 2 4 3
1 7 6 9
4) dplyr/tidyr/tibble Using the indicated packages we can form the following pipeline:
library(dplyr)
library(tidyr)
library(tibble)
x %>%
stack %>%
separate(ind, c("col", "rowname")) %>%
pivot_wider(names_from = col, values_from = ".") %>%
column_to_rownames
giving this data.frame:
a b c
0 2 4 3
1 7 6 9
If you are using an older version of tidyr replace the pivot_wider line with
spread(col, values) %>%
As per #d.b. comment this would also work:
x %>%
data.frame %>%
rownames_to_column %>%
separate(rowname, c("col", "rowname")) %>%
pivot_wider(names_from = col, values_from = ".") %>%
column_to_rownames
do.call(rbind, split(x, gsub(".*\\.(.*)", "\\1", names(x))))
# a.0 b.0 c.0
#0 2 4 3
#1 7 6 9
i have a key in tableA and in tableB i have key and numeric. How can i achieve formula excel sumifs(numeric,tableB.key,tableA.key,tableA.key,1)
with dplyr without join the two table
i already tried summarise_if within mutate
mutate(newColumn = summarise_if(tableB, .predicate = tableB$Key == .$Key, .funs = sum(tableB$numeric)))
but i get this error
In tableB$Key == .$Key:
longer object length is not a multiple of shorter object length
tableA tableB
key key numeric
1 1 10
2 1 30
3
4
Expected
key newColumn
1 40
2
3
4
you could try
library(tidyverse)
tableA <- tibble(key = c(1, 2, 3, 4))
tableB <- tibble(key = c(1, 1, 2, 2),
numeric = c(10, 30, 10, 15))
(function(){
tmpDF <- tableB %>%
filter(key %in% tableA$key) %>%
group_by(key) %>%
summarise(newColumn = sum(numeric))
tableA %>%
mutate(new = ifelse(key == tmpDF$key, tmpDF$newColumn, 0)
)
})()
which gives
# A tibble: 4 x 2
# key new
# <dbl> <dbl>
# 1 40
# 2 25
# 3 0
# 4 0
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6
I'll illustrate my question with an example.
Sample data:
df <- data.frame(ID = c(1, 1, 2, 2, 3, 5), A = c("foo", "bar", "foo", "foo", "bar", "bar"), B = c(1, 5, 7, 23, 54, 202))
df
ID A B
1 1 foo 1
2 1 bar 5
3 2 foo 7
4 2 foo 23
5 3 bar 54
6 5 bar 202
What I want to do is to summarize, by ID, the sum of B and the sum of B when A is "foo". I can do this in a couple steps like:
require(magrittr)
require(dplyr)
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B))
df2 <- df %>%
filter(A == "foo") %>%
group_by(ID) %>%
summarize(sumBfoo = sum(B))
left_join(df1, df2)
ID sumB sumBfoo
1 1 6 1
2 2 30 30
3 3 54 NA
4 5 202 NA
However, I'm looking for a more elegant/faster way, as I'm dealing with 10gb+ of out-of-memory data in sqlite.
require(sqldf)
my_db <- src_sqlite("my_db.sqlite3", create = T)
df_sqlite <- copy_to(my_db, df)
I thought of using mutate to define a new Bfoo column:
df_sqlite %>%
mutate(Bfoo = ifelse(A=="foo", B, 0))
Unfortunately, this doesn't work on the database end of things.
Error in sqliteExecStatement(conn, statement, ...) :
RS-DBI driver: (error in statement: no such function: IFELSE)
You can do both sums in a single dplyr statement:
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B),
sumBfoo = sum(B[A=="foo"]))
And here is a data.table version:
library(data.table)
dt = setDT(df)
dt1 = dt[ , .(sumB = sum(B),
sumBfoo = sum(B[A=="foo"])),
by = ID]
dt1
ID sumB sumBfoo
1: 1 6 1
2: 2 30 30
3: 3 54 0
4: 5 202 0
Writing up #hadley's comment as an answer
df_sqlite %>%
group_by(ID) %>%
mutate(Bfoo = if(A=="foo") B else 0) %>%
summarize(sumB = sum(B),
sumBfoo = sum(Bfoo)) %>%
collect
If you want to do counting instead of summarizing, then the answer is somewhat different. The change in code is small, especially in the conditional counting part.
df1 <- df %>%
group_by(ID) %>%
summarize(countB = n(),
countBfoo = sum(A=="foo"))
df1
Source: local data frame [4 x 3]
ID countB countBfoo
1 1 2 1
2 2 2 2
3 3 1 0
4 5 1 0
If you wanted to count the rows, instead of summing them, can you pass a variable to the function:
df1 <- df %>%
group_by(ID) %>%
summarize(RowCountB = n(),
RowCountBfoo = n(A=="foo"))
I get an error both with n() and nrow().