Aggregate - Use more than one variable in user defined function - r

I'm working with a large dataset and doing some calculation with the aggregate() function.
This time I need to group by two different columns and for my calculation I need a user defined function that also uses two columns of the data.frame. That's where I'm stuck.
Here's an example data set:
dat <- data.frame(Kat = c("a","b","c","a","c","b","a","c"),
Sex = c("M","F","F","F","M","M","F","M"),
Val1 = c(1,2,3,4,5,6,7,8)*10,
Val2 = c(2,6,3,3,1,4,7,4))
> dat
Kat Sex Val1 Val2
a M 10 2
b F 20 6
c F 30 3
a F 40 3
c M 50 1
b M 60 4
a F 70 7
c M 80 4
Example of user defined function:
sum(Val1 * Val2) # but grouped by Kat and Sex
I tried this:
aggregate((dat$Val1),
by = list(dat$Kat, dat$Sex),
function(x, y = dat$Val2){sum(x*y)})
Output:
Group.1 Group.2 x
a F 1710
b F 600
c F 900
a M 300
b M 1800
c M 2010
But my expected output would be:
Group.1 Group.2 x
a F 610
b F 120
c F 90
a M 20
b M 240
c M 370
Is there any way to do this with aggregate()?

As @jogo suggested:
aggregate(Val1 * Val2 ~ Kat + Sex, FUN = sum, data = dat)
Or in a tidyverse style
library(dplyr)
dat %>%
group_by(Kat, Sex) %>%
summarize(sum(Val1 * Val2))
Or with data.table
library(data.table)
setDT(dat)
dat[ , sum(Val1 * Val2), by = list(Kat, Sex)]

Related

How to add two columns from two different dataframes in R wherein one column just has subset of unique values of the other

I have two dataframes. I need to add a column from one to a column of the other and store the result in the original, bigger dataframe, but the bigger dataframe has many more 'branch' values than the smaller one. I tried using match, but for the non-matching branches the sum is NA.
Sample code:
> df1 <- data.frame(branch = letters[seq(1,5)],
+ rev = seq(10,50,10),
+ stringsAsFactors = 0)
> df1
branch rev
1 a 10
2 b 20
3 c 30
4 d 40
5 e 50
>
> df2 <- data.frame(branch = c('b','d'),
+ Amt = c(10,10),
+ stringsAsFactors = 0)
> df2
branch Amt
1 b 10
2 d 10
>
> df1$rev + df2[match(df1$branch,df2$branch),2,drop = 1]
[1] NA 30 NA 50 NA
>
Expected Output
> df1
branch rev
1 a 10
2 b 30
3 c 30
4 d 50
5 e 50
>
I tried using left join as below:
> left_join(df1, df2, by = 'branch')
branch rev Amt
1 a 10 NA
2 b 20 10
3 c 30 NA
4 d 40 10
5 e 50 NA
> df1 <- left_join(df1, df2, by = 'branch')
> df1[is.na(df1)] <- 0
> df1
branch rev Amt
1 a 10 0
2 b 20 10
3 c 30 0
4 d 40 10
5 e 50 0
> df1$rev <- df1$rev + df1$Amt
> df1
branch rev Amt
1 a 10 0
2 b 30 10
3 c 30 0
4 d 50 10
5 e 50 0
> df1$Amt <- NULL
> df1
branch rev
1 a 10
2 b 30
3 c 30
4 d 50
5 e 50
>
Could someone let me know if there's a simpler solution for this.
An option using data.table:
library(data.table)
setDT(df1)[, rev :=
setDT(df2)[.SD, on=.(branch), rev + nafill(Amt, fill=0)]
]
output:
branch rev
1: a 10
2: b 30
3: c 30
4: d 50
5: e 50
How about this, no libraries required:
df1 <- df1[order(df1$branch),] #sort based on branch
df2 <- df2[order(df2$branch),] #sort also so next step works
df1$branch[df1$branch %in% df2$branch] #just to check we are on correct path
#do the task
df1$rev[df1$branch %in% df2$branch] <- df1$rev[df1$branch %in% df2$branch] + df2$Amt[df2$branch %in% df1$branch]
Warning -- if there are repeated "branch" values in df2...e.g. two "b",
you would need to accumulate those before adding them to df1.
One way would be to store the output of match in a variable, replace NA with 0, and then add the values
vals <- df2$Amt[match(df1$branch,df2$branch)]
df1$rev + replace(vals, is.na(vals), 0)
#[1] 10 30 30 50 50
Something similar in dplyr, doing left_join instead of match
library(dplyr)
df1 %>%
left_join(df2, by = 'branch') %>%
mutate(Amt = replace(Amt, is.na(Amt), 0),
rev = rev + Amt) %>%
select(names(df1))
Using dplyr, you can aggregate both dataframes using bind_rows (and renaming Amt by rev in order to match colnames), group by "branch" and calculate the sum:
library(dplyr)
df1 %>% bind_rows(., rename(df2, rev = Amt)) %>%
group_by(branch) %>%
summarise(rev = sum(rev))
# A tibble: 5 x 2
branch rev
<chr> <dbl>
1 a 10
2 b 30
3 c 30
4 d 50
5 e 50
Use aggregate to get the sum of rev within each branch group.
library(magrittr)
colnames(df2)[2] <- "rev"
df1 <- rbind(df1, df2) %>% aggregate(rev ~ branch, ., FUN = sum)

For-loop to summarize and joining by dplyr

Here is my simplified df:
GP_A <- c(rep("a",3),rep("b",2),rep("c",2))
GP_B <- c(rep("d",2),rep("e",4),rep("f",1))
GENDER <- c(rep("M",4),rep("F",3))
LOC <- c(rep("HK",2),rep("UK",3),rep("JP",2))
SCORE <- c(50,70,80,20,30,80,90)
df <- as.data.frame(cbind(GP_A,GP_B,GENDER,LOC,SCORE))
> df
GP_A GP_B GENDER LOC SCORE
1 a d M HK 50
2 a d M HK 70
3 a e M UK 80
4 b e M UK 20
5 b e F UK 30
6 c e F JP 80
7 c f F JP 90
I want to summarize the score by GP_A, GP_B, or other grouping columns which are not shown in this example. As the number of grouping columns might be up to 50, I decided to use a for-loop to summarize the score.
The original method is summarizing the score with 1 group one by one:
GP_A_SCORE <- df %>% group_by(GP_A,GENDER,LOC) %>% summarize(SCORE=mean(SCORE))
GP_B_SCORE <- df %>% group_by(GP_B,GENDER,LOC) %>% summarize(SCORE=mean(SCORE))
...
What I want is using the for-loop like this (cannot run):
GP_list <- c("GP_A","GP_B",...)
LOC_list <- c("HK","UK","JP",...)
SCORE <- list()
for (i in GP_list){
for (j in LOC_list){
SCORE[[paste0(i,j)]] <- df %>% group_by(i,j,GENDER) %>% summarize(SCORE=mean(SCORE))
}}
As in "group_by()", the variables are classified as character and here is the error shown:
Error: Column I, J is unknown
Is there any method to force R to recognize the variable?
I am facing the same problem on the left_join of dplyr.
Error is shown when I was doing something like: left_join(x,y,by=c(i=i)) inside a loop.
You could get the data in long format and then calculate the mean
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = starts_with('GP')) %>%
group_by(GENDER ,LOC, name, value) %>%
summarise(SCORE = mean(SCORE))
# GENDER LOC name value SCORE
# <fct> <fct> <chr> <fct> <dbl>
# 1 F JP GP_A c 85
# 2 F JP GP_B e 80
# 3 F JP GP_B f 90
# 4 F UK GP_A b 30
# 5 F UK GP_B e 30
# 6 M HK GP_A a 60
# 7 M HK GP_B d 60
# 8 M UK GP_A a 80
# 9 M UK GP_A b 20
#10 M UK GP_B e 50
We can use melt from data.table
library(data.table)
melt(setDT(df), measure = patterns("^GP"))[, .(SCORE = mean(SCORE)),
.(GENDER, LOC, variable, value)]
data
df <- data.frame(GP_A,GP_B,GENDER,LOC,SCORE)

Adding values from one column to another with missing values in the second column using R

I want to add just some specific values from column z in dataframe df2 into dataframe df1, but just for the id = 1 and id = 3.
I have already tried solutions with ifelse, but because some ids are missing from df1, that kind of solution only works for the first value, until it reaches the first gap.
df1$z <- ifelse((df1$id == df2$id), df2$z, 0)
Examples of the data:
# Example inputs. `header = TRUE` is spelled out in full: the abbreviated
# `h = T` relies on partial argument matching and on `T`, which is an
# ordinary variable that can be reassigned.
df1 <- read.table(text = "
id v w
1 20 B
3 30 T
", header = TRUE)
df2 <- read.table(text = "
id z b c d e f g h i j
1 100 z w e r y w u y q
2 800 t q j n m q i x z
3 700 f e q b a i e p w
4 300 a b c d a g s y q",
header = TRUE)
Expected result:
# Expected result. `header = TRUE` is written out instead of `h = T`, which
# depends on partial argument matching and the reassignable `T` alias.
df1_add <- read.table(text = "
id v w z
1 20 B 100
3 30 T 700
", header = TRUE)
Let's use left_join() and select() from the dplyr package:
library(dplyr)
df1_add <- df1 %>%
left_join(df2 %>% select(id, z))
df1_add
id v w z
1 1 20 B 100
2 3 30 T 700
you can try this
# Look up z for each row of df1 by its id instead of hard-coding the ids.
# match() returns, for every df1$id, the position of that id in df2$id, so
# the values stay aligned with df1 whatever the row order of df2 is; ids
# absent from df2 yield NA.
df_add <- df1
df_add$z <- df2$z[match(df1$id, df2$id)]
We can use merge from base R
merge(df1, df2[c("id", "z")])

How to convert a >2 column dataframe to 2 columns

I am having problems transforming my data.
I have a dataframe, which tells which transitions were made, and how many times this sequence of transitions has occured. The different columns are corresponding with the situation in period 10, 11 and 12 (and there are more in my data). I want to summarize this data, and want to know how many times people went from A to C, A to D, but also C to G, etc. So basically I want to aggregate this data based on each column with the second column. My ultimate goal is to turn this data into a sankey diagram.
To illustrate:
df<-data.frame(s10=unlist(strsplit("AAAABBBBBC","")),
s11=unlist(strsplit("CCDDEEFFFF","")),
s12=unlist(strsplit("GHIGJKMNNN","")),
freq=c(10,20,30,40,50,60,70, 40, 20, 20))
s10 s11 s12 freq
1 A C G 10
2 A C H 20
3 A D I 30
4 A D G 40
5 B E J 50
6 B E K 60
7 B F M 70
8 B F N 40
9 B F N 20
10 C F N 20
And I aim to get this result:
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 C H 20
8 D G 40
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
I got this result by first aggregating the sum of frequencies for s10 and s11, and for s11 and s12, and then changing the column names and binding them together. It works, but I intend to do this with more columns, so I'm sure there is a more efficient way to do this. See the code I used below:
bl1 <- df %>%
group_by(s10, s11) %>%
summarise(freq = sum(freq)) %>%
as.data.frame()
bl2 <- df %>%
group_by(s11, s12) %>%
summarise(freq = sum(freq)) %>%
as.data.frame()
colnames(bl1) <- c('colA', 'colB','freq' )
colnames(bl2) <- c('colA', 'colB','freq' )
rbind(bl1, bl2)
Any help is much appreciated!
You can rbind the selected columns of the data.frames together and then use aggregate. The only trick is to rename the columns so that they match. For this, I use setNames.
aggregate(freq ~ colA + colB,
data=rbind(setNames(df[c("s10", "s11", "freq")], c("colA", "colB", "freq")),
setNames(df[c("s11", "s12", "freq")], c("colA", "colB", "freq"))),
FUN=sum)
this returns the desired result.
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 D G 40
8 C H 20
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
OK. I gave it a go and had some fun with benchmarking. An alternate approach (that I used) was to use aggregate() itself. See fun1 for implementation. I have made it to fit this particular example, and of course it will need tweaking to work with data frames of other widths.
Edit: I have removed dataframe creation from functions and added Benchmarking output1
require(dplyr); require(microbenchmark)
df<-data.frame(s10=unlist(strsplit("AAAABBBBBC","")),
s11=unlist(strsplit("CCDDEEFFFF","")),
s12=unlist(strsplit("GHIGJKMNNN","")),
freq=c(10,20,30,40,50,60,70, 40, 20, 20))
# Baseline (dplyr) implementation used in the benchmark.
# Reads the global data frame `df`, sums `freq` for every (s10, s11) pair and
# for every (s11, s12) pair, gives both summaries the same column names and
# returns them stacked as one plain data.frame.
fun0 <- function() {
  out_names <- c('colA', 'colB', 'freq')

  # transitions from the first period to the second
  first_step <- as.data.frame(
    summarise(group_by(df, s10, s11), freq = sum(freq))
  )
  # transitions from the second period to the third
  second_step <- as.data.frame(
    summarise(group_by(df, s11, s12), freq = sum(freq))
  )

  # identical names are required so rbind() can stack the two summaries
  names(first_step) <- out_names
  names(second_step) <- out_names
  rbind(first_step, second_step)
}
# Base-R implementation used in the benchmark.
# Reads the global data frame `df`, builds a two-letter key for each
# transition (column 1 -> column 2, then column 2 -> column 3) and sums the
# frequencies per key with aggregate(). Returns a data frame with columns
# `x` (the key, e.g. "AC") and `f` (the summed frequency).
#
# The keys are built with vectorised paste0() on the columns. The earlier
# version called apply() row by row, which first coerces the data frame to a
# matrix and then runs paste() once per row. The timings quoted for fun1 were
# measured with that earlier version.
fun1 <- function() {
  first_step <- paste0(df[[1]], df[[2]])
  second_step <- paste0(df[[2]], df[[3]])
  # each row contributes its freq to both of its transitions
  transitions <- data.frame(
    x = c(first_step, second_step),
    f = rep(df$freq, 2)
  )
  aggregate(f ~ x, data = transitions, FUN = sum)
}
fun0()
fun1()
#benchmarking
MB_res <- microbenchmark( fun0=fun0(), fun1=fun1() , times=1000)
MB_res
Results were:
Unit: milliseconds
expr min lq mean median uq max neval
fun0 2.218889 2.587820 2.773454 2.676921 2.785586 6.020277 1000
fun1 1.472971 1.737751 1.908966 1.842152 1.910118 8.915407 1000

Ranking data frame

Here is my data:
x<- data.frame(P=c("M","C","M","C","C","M","C","M"),
Q=c(13,12,12,14,19,15,12,11),
R=c(15,13,21,32,32,21,13,32),
T=c(15,12,12,14,12,11,19,15))
I want to calculate means for variables within each category.
For example, the means for Q are: M = (13+12+15+11)/4 = 12.75 and C = (12+14+19+12)/4 = 14.25, and so on.
Next, I want to rank the means for each variable and get the following table:
P Q R T
M 2 2 2
C 1 1 1
I want to get equal rank for my real data.
For example. if I have three 12, all will get the same rank
You can try
aggregate(.~P, x, mean)
Or
library(dplyr)
x1 <- x %>%
group_by(P) %>%
summarise_each(funs(mean))
x1
# P Q R T
#1 C 14.25 22.50 14.25
#2 M 12.75 22.25 13.25
For ranking
x1 %>%
mutate_each(funs(rank(-.)), Q:T)
# P Q R T
#1 C 1 1 1
#2 M 2 2 2
Update
Suppose if there are ties,
x1$Q[2] <- x1$Q[1]
rank will get the ties averaged by default. You can specify the ties.method to min or use min_rank
x1 %>%
mutate_each(funs(min_rank(-.)), Q:T)
# P Q R T
#1 C 1 1 1
#2 M 1 2 2
For completeness, here's a possible data.table solution using frank from the devel version on GitHub
For means
library(data.table) ## v >= 1.9.5
(Res <- setDT(x)[, lapply(.SD, mean), by = P])
# P Q R T
# 1: M 12.75 22.25 13.25
# 2: C 14.25 22.50 14.25
For ranks
Res[, 2:4 := lapply(-.SD, frank, ties.method = "dense"), .SDcols = 2:4]
Res
# P Q R T
# 1: M 2 2 2
# 2: C 1 1 1
to get the mean you can do this
do.call(rbind,lapply(split(x, f = x$P), function(x) data.frame(P = unique(x$P), Q = mean(x$Q), R = mean(x$R), T = mean(x$T))))

Resources