aggregate in j to a column name stored in a string, data.table - r

I'm trying to create a summarized data.table using the j column, but assign to a name stored in a variable.
For example, I can do this:
library(data.table)
x = data.table(c(1,2,3,4,5,6), c(2,2,2,3,3,3))
x[,.("a" = mean(V1), "b" = max(V1)),by=V2]
which returns as wanted
V2 a b
1: 2 2 3
2: 3 5 6
Now instead of using the name "a", I would like to use a variable name:
varname = "a"
x[,.(varname = mean(V1), "b" = max(V1)), by=V2]
I'd like it to return the same output, but of course here column a is labeled as "varname". I've tried using eval, get, and others and haven't figured out the right syntax. Is this built in, or will I have to relabel the name outside of data.table?

We can use setnames after the aggregation
out <- x[,.( mean(V1), "b" = max(V1)), by=V2]
setnames(out, 'V1', varname)
out
# V2 a b
#1: 2 2 3
#2: 3 5 6
Or use setNames
x[, setNames(.(mean(V1), max(V1)), c(varname, "b")), by = V2]
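Another route, if you prefer to keep the names inside the call, is the usual quote-and-eval idiom: build the whole j expression as a language object with the variable pasted in, then eval() it. A sketch, reusing x and varname from above:
jexp <- parse(text = paste0("list(", varname, " = mean(V1), b = max(V1))"))[[1]]
x[, eval(jexp), by = V2]
#    V2 a b
# 1:  2 2 3
# 2:  3 5 6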
With the tidyverse, evaluation on the lhs of := is possible:
library(tidyverse)
x %>%
group_by(V2) %>%
summarise(!! varname := mean(V1))
# A tibble: 2 x 2
# V2 a
# <dbl> <dbl>
#1 2 2
#2 3 5
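The fixed-name column can be added in the same summarise call; a small sketch extending the above (still using varname and the sample x):
x %>%
  group_by(V2) %>%
  summarise(!!varname := mean(V1), b = max(V1))
# A tibble: 2 x 3
#      V2     a     b
#   <dbl> <dbl> <dbl>
# 1     2     2     3
# 2     3     5     6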

Alternatively, you can put the functions into a named list as follows:
x[, lapply(structure(list(mean, max), names = c(varname, "b")),
           function(f) f(V1)),
  by = V2]

Related

How to move dataframe variable names to first row and add new variable names to multiple dataframes in a list?

library(purrr)
library(tibble)
library(dplyr)
Starting list of dataframes
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
                             heading = letters[1:2]),
            df2 = data.frame(X.32 = as.character(3:4),
                             another.topic = paste("Line ", 1:2)))
lst
#> $df1
#> X.1 heading
#> 1 1 a
#> 2 2 b
#>
#> $df2
#> X.32 another.topic
#> 1 3 Line 1
#> 2 4 Line 2
Expected "combined" dataframe, with new consistent variable names, and old variable names in the first row of each constituent dataframe.
#> id h1 h2
#> 1 df1 X.1 heading
#> 2 df1 1 a
#> 3 df1 2 b
#> 4 df2 X.32 another.topic
#> 5 df2 3 Line 1
#> 6 df2 4 Line 2
add_row requires "Name-value pairs, passed on to tibble(). Values can be defined only for columns that already exist in .data and unset columns will get an NA value."
Which is what I think I have achieved with this:
df_nms <- map(lst, names) %>%
  map(set_names)
#> $df1
#> X.1 heading
#> "X.1" "heading"
#>
#> $df2
#> X.32 another.topic
#> "X.32" "another.topic"
But I cannot tie up the last bit: using a purrr function to add the names to the head of each dataframe. I've tried numerous variations with map2 and pmap; the closest I can get at present (if I treat add_row as a formula, prefixing it with ~, and removing the .y) is a new first row populated with NAs. I think I'm missing how to pass the name-value pairs to the add_row function.
map2(lst, df_nms, add_row(.x, .y, .before = 1)) %>%
map(set_names, c("h1", "h2")) %>%
map_dfr(bind_rows, .id = "id")
#> Error in add_row(.x, .y, .before = 1): object '.x' not found
A pointer to resolve this last step would be most appreciated.
Not quite sure how to do this via purrr map functions, but here is an alternative:
library(dplyr)
bind_rows(lapply(lst, function(i) {
  d1 <- as.data.frame(matrix(names(i), ncol = ncol(i)))
  rbind(d1, setNames(i, names(d1)))
}), .id = 'id')
# id V1 V2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Here's an approach using map, rbindlist from data.table and some base R functions:
library(purrr)
library(dplyr)
library(data.table)
map(lst, ~ as.data.frame(unname(rbind(colnames(.x), as.matrix(.x))))) %>%
  rbindlist(idcol = "id")
# id V1 V2
#1: df1 X.1 heading
#2: df1 1 a
#3: df1 2 b
#4: df2 X.32 another.topic
#5: df2 3 Line 1
#6: df2 4 Line 2
Alternatively we could use map_df if we use colnames<-:
map_df(lst, ~ as.data.frame(rbind(colnames(.x), as.matrix(.x))) %>%
         `colnames<-`(., paste0("h", seq(1, dim(.)[2]))), .id = "id")
# id h1 h2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Key things here are:
Use as.matrix to get rid of the factor / character incompatibility.
Remove names with unname or set them with colnames<-.
Use the idcol = or .id = argument to get the names of the list as a column.
I altered your sample data a bit, setting stringsAsFactors to FALSE when creating the data.frames in lst. Here is a solution using data.table::rbindlist().
#sample data
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
                             heading = letters[1:2],
                             stringsAsFactors = FALSE), # !! <--
            df2 = data.frame(X.32 = as.character(3:4),
                             another.topic = paste("Line ", 1:2),
                             stringsAsFactors = FALSE)  # !! <--
)
DT <- data.table::rbindlist(lapply(lst, function(x) rbind(names(x), x)),
                            use.names = FALSE, idcol = "id")
setnames(DT, names(lst[[1]]), c("h1", "h2"))
# id h1 h2
# 1: df1 X.1 heading
# 2: df1 1 a
# 3: df1 2 b
# 4: df2 X.32 another.topic
# 5: df2 3 Line 1
# 6: df2 4 Line 2
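For completeness, the original map2 attempt can also be rescued: the third argument needs to be a function (or ~ formula), and the name-value pairs can be spliced into add_row() with !!!. A hedged sketch, assuming the columns of lst are character rather than factor (e.g. built with stringsAsFactors = FALSE as above):
# lst and df_nms as defined in the question
# ~ turns the call into a function of .x/.y; !!! splices the named vector
# into add_row() as name-value pairs
map2(lst, df_nms, ~ add_row(.x, !!!as.list(.y), .before = 1)) %>%
  map(set_names, c("h1", "h2")) %>%
  map_dfr(bind_rows, .id = "id")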

Accessing column name within the SD construct

I have a data table in R that looks like this
DT = data.table(a = c(1,2,3,4,5), a_mean = c(1,1,2,2,2), b = c(6,7,8,9,10), b_mean = c(3,2,1,1,2))
I want to create two more columns a_final and b_final defined as a_final = (a - a_mean) and b_final = (b - b_mean). In my real life use case, there can be a large number of such column pairs and I want a scalable solution in the spirit of R's data tables.
I tried something along the lines of
DT[,paste0(c('a','b'),'_final') := lapply(.SD, function(x) ((x-get(paste0(colnames(.SD),'_mean'))))), .SDcols = c('a','b')]
but this doesn't quite work. Any idea of how I can access the column name of the column being processed within the lapply statement?
We can create a character vector of column names, subset those columns from the original data.table, get their corresponding "mean" columns, subtract, and add the results as new columns.
library(data.table)
cols <- unique(sub('_.*', '', names(DT))) #Thanks to #Sotos
#OR just
#cols <- c('a', 'b')
DT[, paste0(cols, '_final')] <- DT[, cols, with = FALSE] -
                                DT[, paste0(cols, "_mean"), with = FALSE]
DT
# a a_mean b b_mean a_final b_final
#1: 1 1 6 3 0 3
#2: 2 1 7 2 1 5
#3: 3 2 8 1 1 7
#4: 4 2 9 1 2 8
#5: 5 2 10 2 3 8
Another option is using mget with Map:
cols <- c('a', 'b')
DT[, paste0(cols,'_final') := Map(`-`, mget(cols), mget(paste0(cols,"_mean")))]
Relying on the .SD construct you could do something along the lines of:
cols <- c('a', 'b')
DT[, paste0(cols, "_final") :=
     DT[, .SD, .SDcols = cols] -
     DT[, .SD, .SDcols = paste0(cols, "_mean")]]
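If you specifically want the name of the column being processed (as the question asks), one option, sketched here rather than taken from the answers above, is to iterate over .SD together with names(.SD) via Map and look up the matching *_mean column with get(); this relies on data.table exposing the other columns to j when get() is used:
# reusing DT and library(data.table) from above
cols <- c('a', 'b')
DT[, paste0(cols, "_final") := Map(function(x, nm) x - get(paste0(nm, "_mean")),
                                   .SD, names(.SD)),
   .SDcols = cols]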

Is there a way to find the indices of common (exactly the same) elements in a dataframe?

Given a dataframe such as,
num <- c(5,10,15,20,25)
letter <- c("A", "B", "A", "C", "B")
thelist <- data.frame(num, letter)
I need to find the indices where the letters are the same.
Output:
A 1 3
B 2 5
C 4
Then, take these indices and find the mean of num at those indices.
Output:
A 10
B 17.5
C 20
I cannot use loops or if statements; I am looking at using something like apply, which, etc.
As the objective is to find the mean for each similar 'letter', it is better to group by 'letter' and get the mean of 'num':
library(dplyr)
thelist %>%
group_by(letter) %>%
summarise(num = mean(num))
# A tibble: 3 x 2
# letter num
# <fct> <dbl>
#1 A 10
#2 B 17.5
#3 C 20
or in base R
aggregate(num ~ letter, thelist, mean)
To find the index of the same 'letter', we can split the sequence of rows by 'letter':
split(seq_len(nrow(thelist)), thelist$letter)
#$A
#[1] 1 3
#$B
#[1] 2 5
#$C
#[1] 4
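The two steps can also be collapsed in base R: tapply() groups num by letter and applies mean in one call (equivalently, sapply() over the split() above):
tapply(thelist$num, thelist$letter, mean)
#    A    B    C
# 10.0 17.5 20.0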
Another option using data.table:
library(data.table)
setDT(thelist)[, .(ind = paste(.I, collapse = " "),
                   mean_num = mean(num)),
               by = letter]
Output:
   letter ind mean_num
1:      A 1 3     10.0
2:      B 2 5     17.5
3:      C   4     20.0
I'd use dplyr/tidyverse for this:
# setup
library(tidyverse)
# group by letters then get mean of num
thelist %>%
group_by(letter) %>%
summarise(mean_num = mean(num))
You could also use base R with a for loop:
lets <- unique(thelist$letter)
x <- rep(NA, length(lets))
for(i in seq_along(lets)){
  x[i] <- mean(thelist$num[thelist$letter %in% lets[i]])
}
x
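Since the question rules out explicit loops, the same loop body also fits into sapply(); a small sketch reusing lets from above (as.character() keeps the letters as element names):
sapply(as.character(lets), function(l) mean(thelist$num[thelist$letter == l]))
#    A    B    C
# 10.0 17.5 20.0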

dplyr::distinct - keep only selected columns, not all

Applying dplyr::distinct, in order to keep only selected columns instead of all (.keep_all = TRUE), I am currently selecting post hoc using select:
library(dplyr)
foo_df <- data.frame(id1=c(1,1,3),id2=c(1,1,4), val1 = letters[1:3], val2 = letters[3:5])
foo_df %>% distinct(id1,id2,.keep_all = TRUE) %>% select(id1,id2, val1)
# I want to keep "val1" and the identifiers for unique combinations
#> id1 id2 val1
#> 1 1 1 a
#> 2 3 4 c
packageVersion('dplyr')
#> [1] ‘0.7.7’
Created on 2018-12-19 by the reprex package (v0.2.1)
But is there a more succinct way? Happy to be pointed to another function too.
Shame on me if this is a dupe.
Maybe the data.table syntax is more to your liking. It is more succinct than dplyr.
library(data.table)
DT <- data.table(foo_df)
# ?data.table::unique
unique(DT[, .(id1, id2, val1)], by = c("id1", "id2"))
id1 id2 val1
1: 1 1 a
2: 3 4 c
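If you would rather stay in dplyr, about the most you can trim is to reorder the pipeline so only the wanted columns are carried along; a sketch equivalent to the call in the question:
foo_df %>%
  select(id1, id2, val1) %>%
  distinct(id1, id2, .keep_all = TRUE)
#   id1 id2 val1
# 1   1   1    a
# 2   3   4    c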

combining values in rows based on matching conditions in R

I have a simple question about aggregating values in R.
Suppose I have a dataframe:
DF <- data.frame(col1=c("Type 1", "Type 1B", "Type 2"), col2=c(1, 2, 3))
which looks like this:
col1 col2
1 Type 1 1
2 Type 1B 2
3 Type 2 3
I notice that I have Type 1 and Type 1B in the data, so I would like to combine Type 1B into Type 1.
So I decide to use dplyr:
filter(DF, col1=='Type 1' | col1=='Type 1B') %>%
  summarise(n = sum(col2))
But now I need to keep going with it:
DF2 <- data.frame('Type 1', filter(DF, col1=='Type 1' | col1=='Type 1B') %>%
                    summarise(n = sum(col2)))
I guess I want to rbind this new DF2 back to the original DF, but that means I have to set the column names to be consistent:
names(DF2) <- c('col1', 'col2')
OK, now I can rbind:
rbind(DF2, DF[3,])
The result? It worked....
col1 col2
1 Type 1 3
3 Type 2 3
...but ugh! That was awful! There has to be a better way to simply combine values.
Here's a possible dplyr approach:
library(dplyr)
DF %>%
group_by(col1 = sub("(.*\\d+).*$", "\\1", col1)) %>%
summarise(col2 = sum(col2))
#Source: local data frame [2 x 2]
#
# col1 col2
#1 Type 1 3
#2 Type 2 3
Using sub() with aggregate(), removing any trailing non-digit characters from the end of col1:
do.call("data.frame",
aggregate(col2 ~ cbind(col1 = sub("\\D+$", "", col1)), DF, sum)
)
# col1 col2
# 1 Type 1 3
# 2 Type 2 3
The do.call() wrapper is there so that the first column after aggregate() is properly changed from a matrix to a vector. This way there aren't any surprises later on down the road.
In my opinion, aggregate() is the perfect function for this purpose, but you shouldn't have to do any text processing (e.g. gsub()). I would do this in a two-step process:
Overwrite col1 with the new desired grouping.
Compute the aggregation using the new col1 to specify the grouping.
DF$col1 <- ifelse(DF$col1 %in% c('Type 1','Type 1B'), 'Type 1', as.character(DF$col1));
DF;
## col1 col2
## 1 Type 1 1
## 2 Type 1 2
## 3 Type 2 3
DF <- aggregate(col2~col1, DF, FUN=sum );
DF;
## col1 col2
## 1 Type 1 3
## 2 Type 2 3
You can try:
library(data.table)
setDT(transform(DF, col1=gsub("(.*)[A-Z]+$","\\1",DF$col1)))[,list(col2=sum(col2)),col1]
# col1 col2
# 1: Type 1 3
# 2: Type 2 3
Or even more directly:
setDT(DF)[, .(col2 = sum(col2)), by = .(col1 = sub("[[:alpha:]]+$", "", col1))]
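For the sample DF this returns the same summary:
#      col1 col2
# 1: Type 1    3
# 2: Type 2    3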
