Summarize a dataframe by groups - r

Consider the following dataframe with 4 columns:
df = data.frame(A = rnorm(10), B = rnorm(10), C = rnorm(10), D = rnorm(10))
The columns A, B, C, D belong to different groups, and the groups are defined in a separate dataframe:
groups = data.frame(Class = c("A","B","C","D"), Group = c("G1", "G2", "G2", "G1"))
#> groups
# Class Group
#1 A G1
#2 B G2
#3 C G2
#4 D G1
I would like to average elements of the columns that belong to the same group, and get something similar to:
#> res
# G1 G2
#1 -0.30023039 -0.71075139
#2 0.53053443 -0.12397126
#3 0.21968567 -0.46916160
#4 -1.13775100 -0.61266026
#5 1.30388130 -0.28021734
#6 0.29275876 -0.03994522
#7 -0.09649998 0.59396983
#8 0.71334020 -0.29818438
#9 -0.29830924 -0.47094084
#10 -0.36102888 -0.40181739
where each cell of G1 is the mean of the corresponding cells of A and D, and each cell of G2 is the mean of the corresponding cells of B and C, etc.
I was able to achieve this result, but in a rather brute force way:
# Average the columns of `df` that share a group, one result column per group.
# factor() yields the group labels whether Group is stored as a factor or as
# character (levels() alone returns NULL for a character column).
l <- levels(factor(groups$Group))
# Size the result with length(l), not length(levels): `levels` is a function,
# and the length of a function is always 1.
res <- data.frame(matrix(nrow = nrow(df), ncol = length(l)))
for (i in seq_along(l)) {
  # Pick the member columns by Class name rather than by row position in
  # `groups`; drop = FALSE keeps a single-column group as a data frame.
  members <- as.character(groups$Class[groups$Group == l[i]])
  res[, i] <- rowMeans(df[, members, drop = FALSE])
}
names(res) <- l
Is there a better way of doing this? In reality, I have more than 20 columns and more than 10 groups.
Thank you!

using data.table
library(data.table)
groups <- data.table(groups, key="Group")
DT <- data.table(df)
groups[, rowMeans(DT[, Class, with=FALSE]), by=Group][, setnames(as.data.table(matrix(V1, ncol=length(unique(Group)))), unique(Group))]
G1 G2
1: -0.13052091 -0.3667552
2: 1.17178729 -0.5496347
3: 0.23115841 0.8317714
4: 0.45209516 -1.2180895
5: -0.01861638 -0.4174929
6: -0.43156831 0.9008427
7: -0.64026238 0.1854066
8: 0.56225108 -0.3563087
9: -2.00405840 -0.4680040
10: 0.57608055 -0.6177605
# Also, make sure you have characters, not factors,
groups[, Class := as.character(Class)]
groups[, Group := as.character(Group)]
simple base:
# Row means per group; the result holds one numeric vector per Group.
# as.character() stops factor Class values from being used as integer codes,
# and drop = FALSE keeps a one-column group as a data frame so rowMeans() works.
tapply(as.character(groups$Class), groups$Group,
       function(X) rowMeans(df[, X, drop = FALSE]))
using sapply :
# One column of row means per group, named after the group.
# Class is extracted with `$`, which returns a plain vector for both a
# data.frame and a data.table `groups`; drop = FALSE keeps single-column
# groups as data frames so rowMeans() does not fail on a bare vector.
sapply(unique(as.character(groups$Group)), function(X) {
  members <- as.character(groups$Class[groups$Group == X])
  rowMeans(df[, members, drop = FALSE])
})

I would personally go with Ricardo's solution, but another option would be to merge your two datasets first, and then use your preferred method of aggregating.
library(reshape2)
## Retain the "rownames" so we can aggregate by row
temp <- merge(cbind(id = rownames(df), melt(df)), groups,
by.x = "variable", by.y = "Class")
head(temp)
# variable id value Group
# 1 A 1 -0.6264538 G1
# 2 A 2 0.1836433 G1
# 3 A 3 -0.8356286 G1
# 4 A 4 1.5952808 G1
# 5 A 5 0.3295078 G1
# 6 A 6 -0.8204684 G1
## This is the perfect form for `dcast` to do its work
dcast(temp, id ~ Group, value.var="value", mean)
# id G1 G2
# 1 1 0.36611287 1.21537927
# 2 10 0.22889368 0.50592144
# 3 2 0.04042780 0.58598977
# 4 3 -0.22397850 -0.27333780
# 5 4 0.77073788 -2.10202579
# 6 5 -0.52377589 0.87237833
# 7 6 -0.61773147 -0.05053117
# 8 7 0.04656955 -0.08599288
# 9 8 0.33950565 -0.26345809
# 10 9 0.83790336 0.17153557
(Above data using set.seed(1) on your sample "df".)

Related

creating multiple variables in loops in r

I am quite new to R, and I do not know how to create variables in a loop. I have a dataset where each observation is uniquely defined by an id and a type. My goal is to create several datasets from the starting one, keeping in each dataset the id, the type, and one specific variable, and renaming the variable type to type_variable. Please see below a reproducible example of my dataset:
dt_type <- data.frame(id = c(1,1,1,1,2,2,2,2),
type= c("b1", "b2","c1", "c2","b1", "b2","c1", "c2"),
a=rnorm(8), b=rnorm(8),c=rnorm(8),d=rnorm(8))
# id type a b c d
# 1 1 b1 -0.74733339 -1.1121249 -0.2005649 1.70320036
# 2 1 b2 -0.87290362 -0.1221949 -2.7723691 1.04158671
# 3 1 c1 -0.00878965 -0.7592988 -0.5108226 2.10755315
# 4 1 c2 0.87295622 -0.5885439 0.2606365 -0.87080649
# 5 2 b1 -0.74536372 0.1377794 -0.1382621 0.01743011
# 6 2 b2 -0.01570109 -0.3058672 -0.3146880 -0.43594081
# 7 2 c1 -0.28966205 -0.2045772 -1.1776759 -2.24223369
# 8 2 c2 -0.63680969 2.3815740 0.4462243 -0.05397941
This is how I have tried to do it, but unfortunately it does not work.
varlist <- list("a", "b", "c", "d")
for (i in 1:4) {
tmp <- dt_type %>% rename(paste("type", varlist[[i]], sep=="_") = type) %>%
arrange(id, varlist[[i]], desc(paste("type", varlist[[i]], sep=="_"))) %>%
distinct(id, varlist[[i]], .keep_all = T)
assign(paste("dt_type_", varlist[[i]]), tmp)
}
I am used to using loops in other programming languages, but if there are better ways to reach the result I want, please let me know.
Sorry for not posting the expected output, here it is:
dt_type_a
# id type value
# 1 1 b1 -1.5023199
# 2 1 b2 -0.3653626
# 3 1 c1 1.2842098
# 4 1 c2 0.2732327
# 5 2 b1 -0.7581897
# 6 2 b2 1.1627059
# 7 2 c1 -1.6644546
# 8 2 c2 1.2916819
dt_type_b
# id type value
# 1 1 b1 -0.19573684
# 2 1 b2 -1.35095843
# 3 1 c1 0.69342205
# 4 1 c2 0.47689611
# 5 2 b1 0.67058845
# 6 2 b2 0.21992074
# 7 2 c1 -0.02046201
# 8 2 c2 0.19686712
Thanks,
Vincenzo
Hum, I would just go from wide to long but since you're asking to create variables dynamically:
library(data.table)
dt_type <- data.frame(id = c(1,1,1,1,2,2,2,2),
type= c("b1", "b2","c1", "c2","b1", "b2","c1", "c2"),
a=rnorm(8), b=rnorm(8),c=rnorm(8),d=rnorm(8))
setDT(dt_type)
dt_long <- melt(dt_type, id.vars = c("id", "type"))
varnames <- unique(dt_long$variable)
for (var in varnames) {
assign(paste0("dt_type_", var), dt_long[variable == var, .(id, type, value)])
}
hope it helps...

Create n new columns by n distinct groups in data.table

I have the following data table, and would like to sum y twice grouping the first time by g1 and the second time by g2.
Usually I would just chain the calculations together, but I would like to be able to do the grouped sum n different times by n groups.
library(data.table)
DT <- data.table(
g1 = c("a", "b"),
g2 = c("a", "a"),
y = c(3,5)
)
new_cols <- paste0("sum_by_", c("g1", "g2"))
group_cols <- c("g1", "g2")
# Supplying cols to by like this groups by g1 AND g2, when in reality I want it to
# take g1 the first time and g2 the second time.
DT[, paste(new_cols) := lapply(rep(y, length(new_cols)), sum),
by = .(group_cols)][]
this gives me:
# g1 g2 y sum_by_g1 sum_by_g2
# 1: a a 3 3 3
# 2: b a 5 5 5
when I actually want:
# g1 g2 y sum_by_g1 sum_by_g2
# 1: a a 3 3 8
# 2: b a 5 5 8
Is there any slick data.table way to do this? Something like supplying .SD to by (this in itself doesn't seem to work)?
Edit: Changed y from c(1,1) to c(3,5)
Edit Rationale: Actual and desired outputs while y = c(1,1) gave the impression that I wanted to count the observations in each group, when I actually want to sum(y) for each group.
The grouping should be done separately for each column: when grouping by both columns at once, (a, a) and (a, b) are regarded as distinct groups, so there is only a single observation per group.
for(i in seq_along(group_cols)) DT[, (new_cols[i]) := sum(y), by = c(group_cols[i])]
-output
DT
g1 g2 y sum_by_g1 sum_by_g2
1: a a 3 3 8
2: b a 5 5 8
You can try Reduce like below
> # Use sum(y), not .N: .N only counts rows and equals sum(y) just when every y is 1
> Reduce(function(dt, g) dt[, paste0("sum_by_", g) := sum(y), by = g], list(DT, "g1", "g2"))[]
g1 g2 y sum_by_g1 sum_by_g2
1: a a 1 1 2
2: b a 1 1 2
or
> # Same fold with DT passed as the initial value; sum(y) gives the grouped totals asked for
> Reduce(function(dt, g) dt[, paste0("sum_by_", g) := sum(y), by = g], c("g1", "g2"), init = DT)[]
g1 g2 y sum_by_g1 sum_by_g2
1: a a 1 1 2
2: b a 1 1 2

Add (not merge!) two data frames with unequal rows and columns

I want to efficiently sum the entries of two data frames, though the data frames are not guaranteed to have the same dimensions or column names. Merge isn't really what I'm after here. Instead I want to create an output object with all of the row and column names that belong to either of the added data frames. In each position of that output, I want to use the following logic for the computed value:
If a row/column pairing belongs to both input data frames I want the output to include their sum
If a row/column pairing belongs to just one input data frame I want to include that value in the output
If a row/column pairing does not belong to any input matrix I want to have 0 in that position in the output.
As an example, consider the following input data frames:
df1 = data.frame(x = c(1,2,3), y = c(4,5,6))
rownames(df1) = c("a", "b", "c")
df2 = data.frame(x = c(7,8), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
> df1
x y
a 1 4
b 2 5
c 3 6
> df2
x z w
a 7 9 2
d 8 10 3
I want the final result to be
> df2
x y z w
a 8 4 9 2
b 2 5 0 0
c 3 6 0 0
d 8 0 10 3
What I've done so far -
bind_rows / bind_cols in dplyr can throw the following:
"Error: incompatible number of rows (3, expecting 2)"
I have duplicated column names, so 'merge' isn't working for my purposes either - returns an empty df for some reason.
Seems like you could merge on the rownames, then take care of the sums and conversion of NA to zero with some additional munging:
library(dplyr)
df.new = df1 %>% add_rownames %>%
full_join(df2 %>% add_rownames, by="rowname") %>%
mutate_each(funs(replace(., which(is.na(.)), 0))) %>%
mutate(x = x.x + x.y) %>%
select(rowname,x,y,z,w)
Or, with @DavidArenburg's much more elegant and extensible solution:
df.new = df1 %>% add_rownames %>%
full_join(df2 %>% add_rownames) %>%
group_by(rowname) %>%
summarise_each(funs(sum(., na.rm = TRUE)))
df.new
rowname x y z w
1 a 8 4 9 2
2 b 2 5 0 0
3 c 3 6 0 0
4 d 8 0 10 3
This seems like some type of a simple merge on common column names (+ row names) and then a simple aggregation, this is how I would tackle this
library(data.table)
merge(setDT(df1, keep.rownames = TRUE), # Convert to data.table + keep rows
setDT(df2, keep.rownames = TRUE), # Convert to data.table + keep rows
by = intersect(names(df1), names(df2)), # merge on common column names
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn] # Sum all columns by group
# rn x y z w
# 1: a 8 4 9 2
# 2: b 2 5 0 0
# 3: c 3 6 0 0
# 4: d 8 0 10 3
Or a pretty straightforward base R solution
df1$rn <- row.names(df1)
df2$rn <- row.names(df2)
res <- merge(df1, df2, all = TRUE)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
First, I would grab the names of all the rows and columns of the new entity:
(all.rows <- unique(c(row.names(df1), row.names(df2))))
# [1] "a" "b" "c" "d"
(all.cols <- unique(c(names(df1), names(df2))))
# [1] "x" "y" "z" "w"
Then I would construct an output matrix with those rows and column names (with matrix data initialized to all 0s), adding df1 and df2 to the relevant parts of that matrix.
out <- matrix(0, nrow=length(all.rows), ncol=length(all.cols))
rownames(out) <- all.rows
colnames(out) <- all.cols
out[row.names(df1),names(df1)] <- unlist(df1)
out[row.names(df2),names(df2)] <- out[row.names(df2),names(df2)] + unlist(df2)
out
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
Using xtabs on melted / stacked data frames:
out <- rbind(cbind(rn=rownames(df1),stack(df1)), cbind(rn=rownames(df2),stack(df2)))
as.data.frame.matrix(xtabs(values ~ rn + ind, data=out))
# x y w z
#a 8 4 2 9
#b 2 5 0 0
#c 3 6 0 0
#d 8 0 3 10
I’m not convinced the accepted (or alternative merge) method is the best. It will give incorrect results if you have common rows: they’ll get joined and not summed.
This can be shown trivially by changing df2 to:
df2 = data.frame(x = c(1,2), y = c(4,5), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
expected results:
rn x y z w
1: a 2 8 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
actual results
merge(setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE),
by = intersect(names(df1), names(df2)),
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn]
rn x y z w
1: a 1 4 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
You need to combine both the outer join with an inner join (or left/right joins, merge all=T/all=F). Or alternatively using plyr’s rbind.fill :
base R solution (plus plyr's rbind.fill; df1 and df2 must already contain the rn column holding their row names)
res <- rbind.fill(df1,df2)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
data table solution
as.data.table(rbind.fill(
setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE)
))[, lapply(.SD, sum, na.rm = TRUE), by = rn]
I prefer the rbind.fill method as you can "merge" > 2 data frames using the same syntax.

How to collapse/join selected factor levels across two columns in R

Let's say I have the following data frame:
# Example data: three dimensions crossed with five aspects, values 1 to 15.
x <- rep(c("s1", "s2", "s3"), each = 5)
y <- rep(c("a", "b", "c", "d", "e"), times = 3)
z <- 1:15
# Column labels are kept in variables so they can be reused later
x_name <- "dimensions"
y_name <- "aspects"
z_name <- "value"
# Build the frame and label its columns in one step
df <- setNames(data.frame(x, y, z), c(x_name, y_name, z_name))
How can I collapse/join factor levels 'a', 'c', 'd' in one new factor 'x' across 'dimensions' and 'value', so that the value is added up for the new x factor level. The output should look like this:
I thought to use gsub to replace the names of a,c, d, with x and then sum their values using aggregate. But is there a simpler way to do this? Besides I am not sure my solution would be still good if I have other columns containing a, c, d.
I reviewed several related answers on the forum, but none addressed this situation. Thanks.
First rename a, c, and d to x and then sum by dimensions and aspects
Reading the data:
df <- data.frame(dimensions = x, aspects = y, value = z, stringsAsFactors = FALSE)
Base R solution:
# Work on character values: assigning the new label "x" into a factor column
# would produce NA, because "x" is not an existing level. This is a no-op when
# the column is already character.
df$aspects <- as.character(df$aspects)
# Relabel the aspects that are to be merged
df$aspects[df$aspects %in% c("a", "c", "d")] <- "x"
# Total `value` within every combination of the remaining columns
aggregate(value ~ ., data = df, FUN = sum)
Result:
dimensions aspects value
1 s1 b 2
2 s2 b 7
3 s3 b 12
4 s1 e 5
5 s2 e 10
6 s3 e 15
7 s1 x 8
8 s2 x 23
9 s3 x 38
data.table solution
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later.
library(data.table)
# setDT() converts df to a data.table by reference, so DT and df are the same object
DT <- setDT(df)
# Relabel the aspects to be merged, then total value per dimension/aspect pair
DT[aspects %in% c("a", "c", "d"), aspects := "x"]
DT[, sum(value), by = .(dimensions, aspects)]
Results in
dimensions aspects V1
1: s1 x 8
2: s1 b 2
3: s1 e 5
4: s2 x 23
5: s2 b 7
6: s2 e 10
7: s3 x 38
8: s3 b 12
9: s3 e 15
Here's a solution using plyr::revalue (see also plyr::mapvalues) and dplyr:
# install.packages("plyr")
library(dplyr)
df %>%
mutate(aspects = plyr::revalue(aspects, c("a" = "x", "c" = "x", "d" = "x"))) %>%
group_by(dimensions, aspects) %>%
summarise(sum_value = sum(value))
# dimensions aspects sum_value
# (fctr) (fctr) (int)
# 1 s1 x 8
# 2 s1 b 2
# 3 s1 e 5
# 4 s2 x 23
# 5 s2 b 7
# 6 s2 e 10
# 7 s3 x 38
# 8 s3 b 12
# 9 s3 e 15

Cross-correlation with multiple groups in one data.table

I'd like to calculate the cross-correlations between groups of time series within one data.table. I have time series data in this format:
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
group Y
1: a 0.90855520
2: a -0.12463737
3: a -0.45754652
4: a 0.65789709
5: a 1.27632196
6: b 0.98483700
7: b -0.44282527
8: b -0.93169070
9: b -0.21878359
10: b -0.46713392
11: c -0.02199363
12: c -0.67125826
13: c 0.29263953
14: c -0.65064603
15: c -1.41143837
Each group has the same number of observations. What I am looking for is a way to obtain cross correlation between the groups:
group.1 group.2 correlation
a b 0.xxx
a c 0.xxx
b c 0.xxx
I am working on a script to subset each group and append the cross-correlations, but the data size is fairly large. Is there any efficient / zen way to do this?
Does this help?
# Number the observations within each group instead of hard-coding
# "5 observations x 3 groups", so any number of groups and lengths works.
data[, id := seq_len(.N), by = group]
# Reshape to one column per group, rows aligned on the observation index,
# then drop the index so only the series remain.
dtw <- dcast(data, id ~ group, value.var = "Y")[, id := NULL]
# Correlation matrix between all pairs of groups
cor(dtw)
See Correlation between groups in R data.table
Another way would be:
# data
set.seed(45L)
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
# method 2
setkey(data, "group")
data2 = data[J(c("b", "c", "a"))][, list(group2=group, Y2=Y)]
data[, c(names(data2)) := data2]
data[, cor(Y, Y2), by=list(group, group2)]
# group group2 V1
# 1: a b -0.2997090
# 2: b c 0.6427463
# 3: c a -0.6922734
And to generalize this "other" way to more than three groups...
# Four groups of five observations each
data <- data.table(group = rep(c("a", "b", "c", "d"), each = 5), Y = rnorm(20))
setkey(data, "group")
groups <- unique(data$group)
# Every unordered pair of groups, one pair per row. Base combn() does this
# without needing the gtools package.
pairs <- t(combn(groups, 2))
# Keyed joins: d1 stacks the series of each pair's first group, d2 those of
# the second group, in the same pair order.
d1 <- data[pairs[, 1], , allow.cartesian = TRUE]
d2 <- data[pairs[, 2], , allow.cartesian = TRUE]
# Put the second group's label and values next to the first group's
d1[, c("group2", "Y2") := d2]
d1[, cor(Y, Y2), by = list(group, group2)]
# group group2 V1
# 1: a b 0.10742799
# 2: a c 0.52823511
# 3: a d 0.04424170
# 4: b c 0.65407400
# 5: b d 0.32777779
# 6: c d -0.02425053

Resources