Here is my simplified df:
# Example columns: two grouping variables, gender, location and a numeric score.
GP_A <- c(rep("a", 3), rep("b", 2), rep("c", 2))
GP_B <- c(rep("d", 2), rep("e", 4), rep("f", 1))
GENDER <- c(rep("M", 4), rep("F", 3))
LOC <- c(rep("HK", 2), rep("UK", 3), rep("JP", 2))
SCORE <- c(50, 70, 80, 20, 30, 80, 90)
# data.frame() keeps each column's own type. Building the frame with
# as.data.frame(cbind(...)) goes through a character matrix first, which turns
# SCORE into text so that mean(SCORE) can no longer be computed.
df <- data.frame(GP_A, GP_B, GENDER, LOC, SCORE)
> df
GP_A GP_B GENDER LOC SCORE
1 a d M HK 50
2 a d M HK 70
3 a e M UK 80
4 b e M UK 20
5 b e F UK 30
6 c e F JP 80
7 c f F JP 90
I want to summarize the score by GP_A, GP_B, or other grouping columns that are not shown in this example. As there might be up to 50 grouping columns, I decided to use a for-loop to summarize the score.
The original method summarizes the score for one grouping column at a time:
# Mean SCORE per (grouping column, GENDER, LOC), written out once for each
# grouping column.
GP_A_SCORE <- df %>%
  group_by(GP_A, GENDER, LOC) %>%
  summarise(SCORE = mean(SCORE))
GP_B_SCORE <- df %>%
  group_by(GP_B, GENDER, LOC) %>%
  summarise(SCORE = mean(SCORE))
...
What I want is to use a for-loop like this (it does not run):
# Intended loop from the question (does not run as written): one summary per
# grouping column / location pair. The trailing `...` stands for further
# entries that are not shown.
GP_list <- c("GP_A","GP_B",...)
LOC_list <- c("HK","UK","JP",...)
# NOTE(review): this reuses the name SCORE, which held the numeric score vector
# when df was built.
SCORE <- list()
for (i in GP_list){
for (j in LOC_list){
# group_by() is handed the bare symbols i and j rather than the strings they
# hold, which is what the "Column ... is unknown" error refers to.
# NOTE(review): j iterates over LOC values such as "HK", not over column
# names; presumably a filter on LOC was intended here. Confirm.
SCORE[[paste0(i,j)]] <- df %>% group_by(i,j,GENDER) %>% summarize(SCORE=mean(SCORE))
}}
Inside group_by(), the loop variables hold character strings rather than column names, and this is the error shown:
Error: Column I, J is unknown
Is there any method to force R to recognize the variable?
I am facing the same problem with dplyr's left_join.
An error is shown when I do something like left_join(x,y,by=c(i=i)) inside a loop.
You could get the data in long format and then calculate the mean
library(dplyr)
library(tidyr)

# Stack every GP_* column into name/value pairs so that a single grouped
# summary covers all grouping columns at once, then average SCORE per
# GENDER / LOC / grouping column / grouping value combination.
df %>%
  pivot_longer(cols = starts_with("GP")) %>%
  group_by(GENDER, LOC, name, value) %>%
  summarise(SCORE = mean(SCORE)) %>%
  # summarise() only removes the last grouping level; drop the remaining ones
  # so the result does not carry hidden groups into later steps.
  ungroup()
# GENDER LOC name value SCORE
# <fct> <fct> <chr> <fct> <dbl>
# 1 F JP GP_A c 85
# 2 F JP GP_B e 80
# 3 F JP GP_B f 90
# 4 F UK GP_A b 30
# 5 F UK GP_B e 30
# 6 M HK GP_A a 60
# 7 M HK GP_B d 60
# 8 M UK GP_A a 80
# 9 M UK GP_A b 20
#10 M UK GP_B e 50
We can use melt from data.table
library(data.table)
# setDT() converts df to a data.table by reference. melt() then stacks every
# column whose name starts with "GP" into variable/value pairs, and the mean
# SCORE is taken per GENDER / LOC / variable / value combination.
# The argument is spelled out as measure.vars instead of relying on partial
# matching of `measure`.
melt(setDT(df), measure.vars = patterns("^GP"))[
  , .(SCORE = mean(SCORE)),
  by = .(GENDER, LOC, variable, value)
]
data
df <- data.frame(GP_A,GP_B,GENDER,LOC,SCORE)
Related
I'm still learning R and was wondering if there is an elegant way of manipulating the df below to achieve df2.
I'm not sure if a loop is the right tool for this, but basically I want to extract the first non-NA "X_No" value if the "X_No" value is NA in the first row. This is perhaps best described through an example, going from df to the desired df2.
# Input: five ID/No column pairs (A to E); the No columns contain NAs.
A_ID <- c("A", "B", "I", "N")
A_No <- c(11, NA, 15, NA)
B_ID <- c("B", "C", "D", "J")
B_No <- c(NA, NA, 12, NA)
C_ID <- c("E", "F", "G", "P")
C_No <- c(NA, 13, 14, 20)
D_ID <- c("J", "K", "L", "M")
D_No <- c(NA, NA, NA, 40)
E_ID <- c("W", "X", "Y", "Z")
E_No <- c(50, 32, 48, 40)
df <- data.frame(
  A_ID, A_No, B_ID, B_No, C_ID, C_No, D_ID, D_No, E_ID, E_No
)

# Desired result: for each pair, the first row whose No is not NA.
ID <- c("A", "D", "F", "M", "W")
No <- c(11, 12, 13, 40, 50)
df2 <- data.frame(ID, No)
I'm hoping for an elegant solution to this as there are over a 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thankyou
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
# Reshape to one row per (column prefix, original row): the part of each
# column name before "_" goes into `Col`, the part after it ("ID" / "No")
# becomes the output column name. Then keep the first NA-free row per prefix.
output <- df %>%
  pivot_longer(
    cols = everything(),
    names_to = c("Col", ".value"),
    names_sep = "_"
  ) %>%
  drop_na() %>%
  group_by(Col) %>%
  slice(1) %>%
  ungroup() %>%
  select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
# dplyr::all_equal() is deprecated as of dplyr 1.1.0; compare with base
# all.equal() instead. `output` is a tibble, so it is converted to a plain
# data.frame first to match the class of df2.
all.equal(df2, as.data.frame(output))
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
# Transpose so that each row is one column of df: even-positioned columns are
# the No values, odd-positioned columns the matching IDs.
no_mat <- t(df[c(FALSE, TRUE)])
id_mat <- t(df[c(TRUE, FALSE)])
# For every No column, position of its first non-NA entry.
ind <- max.col(!is.na(no_mat), ties.method = "first")
# Two-column (row, column) index matrix used to pick one cell per pair.
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = id_mat[m1], No = no_mat[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
split the data.frame into a list of chunks of columns, based on their
names. So all columns starting with A_ go to
the first element, all columns starting with B_ to the second.
Then, put these list elements on top of each other, using
data.table::rbindlist. Ignore the column names (this only works if
A_ has the same number of columns as B_, and so on for every other
prefix).
Now get the first non-NA value of each value in the first column
code
library(data.table)
# Split the columns into chunks keyed by the part of each name BEFORE the last
# underscore ("A_ID" / "A_No" -> "A", "B_ID" / "B_No" -> "B", ...).
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# Stack the chunks by position; the result keeps the first chunk's column
# names (A_ID, A_No). idcol records which chunk every row came from.
DT <- rbindlist(L, use.names = FALSE, idcol = "grp")
# Within each chunk keep the first row whose number is not NA. This gives one
# row per ID/No column pair, as requested; grouping by ID instead would return
# every ID that has a non-NA number.
DT[!is.na(A_No), .(ID = A_ID[1L], No = A_No[1L]), by = grp][, .(ID, No)]
#    ID No
# 1:  A 11
# 2:  D 12
# 3:  F 13
# 4:  M 40
# 5:  W 50
For example, here is my df:
# Example data: two grouping columns, gender, location and a numeric score.
GP_A <- rep(c("a", "b", "c"), times = c(3, 2, 2))
GP_B <- rep(c("d", "e", "f"), times = c(2, 4, 1))
GENDER <- rep(c("M", "F"), times = c(4, 3))
LOC <- rep(c("HK", "UK", "JP"), times = c(2, 3, 2))
SCORE <- c(50, 70, 80, 20, 30, 80, 90)
df <- data.frame(GP_A, GP_B, GENDER, LOC, SCORE)
> df
GP_A GP_B GENDER LOC SCORE
1 a d M HK 50
2 a d M HK 70
3 a e M UK 80
4 b e M UK 20
5 b e F UK 30
6 c e F JP 80
7 c f F JP 90
What I want is:
# Desired outcome: one summary per grouping column, stored under that column's
# name. The list is indexed with the quoted name; unquoted, GP_A would refer to
# the seven-element character vector of that name.
result[["GP_A"]] <- df %>% group_by(GP_A, GENDER, LOC) %>% summarize(SCORE = mean(SCORE))
result[["GP_B"]] <- df %>% group_by(GP_B, GENDER, LOC) %>% summarize(SCORE = mean(SCORE))
...
I have tried:
# Attempt from the question: collect one summary per grouping column in a list.
result <- list()
for (i in c("GP_A","GP_B")){
# i holds a column name as a string, but group_by() is handed the bare symbol
# i itself, which is what the "Column ... is unknown" error refers to.
result[[i]] <- df %>% group_by(i,GENDER,LOC) %>% summarize(SCORE=mean(SCORE))
}
Here is the error:
Error: Column I is unknown
I also have tried to use setNames, i.e.
... %>% group_by(setNames(nm=i),GENDER,LOC) %>% ...
But it also doesn't work...
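The group_by_at() function allows you to group by string inputs and is probably the best fit here. (In dplyr 1.0.0 and later it is superseded by group_by(across(all_of(...))), which accepts the same character vector.)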
# dplyr provides %>%, group_by_at(), summarise() and ungroup(); without loading
# it this snippet cannot run on its own.
library(dplyr)

GP_A <- c(rep("a", 3), rep("b", 2), rep("c", 2))
GP_B <- c(rep("d", 2), rep("e", 4), rep("f", 1))
GENDER <- c(rep("M", 4), rep("F", 3))
LOC <- c(rep("HK", 2), rep("UK", 3), rep("JP", 2))
SCORE <- c(50, 70, 80, 20, 30, 80, 90)
df <- data.frame(GP_A, GP_B, GENDER, LOC, SCORE)

# One summary table per grouping column, stored under that column's name.
# group_by_at() takes the grouping columns as a character vector, so the loop
# variable can be passed in directly next to the fixed columns.
result <- list()
for (i in c("GP_A", "GP_B")) {
  result[[i]] <-
    df %>%
    group_by_at(c(i, "GENDER", "LOC")) %>%
    summarise(SCORE = mean(SCORE)) %>%
    ungroup()
}
Remember that it's always best practice to ungroup() your variables once you finish. This is so that in future you don't have unwanted grouping levels.
I'm working with a large dataset and doing some calculation with the aggregate() function.
This time I need to group by two different columns and for my calculation I need a user defined function that also uses two columns of the data.frame. That's where I'm stuck.
Here's an example data set:
# Example data: two grouping columns (Kat, Sex) and two value columns.
dat <- data.frame(
  Kat = c("a", "b", "c", "a", "c", "b", "a", "c"),
  Sex = c("M", "F", "F", "F", "M", "M", "F", "M"),
  Val1 = seq(10, 80, by = 10),
  Val2 = c(2, 6, 3, 3, 1, 4, 7, 4)
)
> dat
Kat Sex Val1 Val2
a M 10 2
b F 20 6
c F 30 3
a F 40 3
c M 50 1
b M 60 4
a F 70 7
c M 80 4
Example of user defined function:
sum(Val1 * Val2) # but grouped by Kat and Sex
I tried this:
# Attempt from the question: sum Val1 * Val2 within each Kat / Sex group.
aggregate((dat$Val1),
by = list(dat$Kat, dat$Sex),
# Only Val1 is split by group (it arrives as x). y defaults to the complete
# dat$Val2 column, so each group's x is multiplied against all of Val2 rather
# than against that group's own rows.
function(x, y = dat$Val2){sum(x*y)})
Output:
Group.1 Group.2 x
a F 1710
b F 600
c F 900
a M 300
b M 1800
c M 2010
But my expected output would be:
Group.1 Group.2 x
a F 610
b F 120
c F 90
a M 20
b M 240
c M 370
Is there any way to do this with aggregate()?
As @jogo suggested:
# Formula interface: the product Val1 * Val2 is evaluated inside `dat`, and the
# products are then summed for every Kat / Sex combination.
aggregate(Val1 * Val2 ~ Kat + Sex, data = dat, FUN = sum)
Or in a tidyverse style
library(dplyr)
# Sum of Val1 * Val2 for every Kat / Sex combination.
dat %>%
  group_by(Kat, Sex) %>%
  summarize(sum(Val1 * Val2)) %>%
  # summarize() only removes the last grouping level (Sex); drop the remaining
  # one so the result is not left grouped by Kat.
  ungroup()
Or with data.table
library(data.table)
# Convert dat to a data.table by reference, then sum Val1 * Val2 per
# Kat / Sex combination.
setDT(dat)
dat[, sum(Val1 * Val2), by = .(Kat, Sex)]
I am having problems transforming my data.
I have a data frame that tells which transitions were made and how many times each sequence of transitions has occurred. The different columns correspond to the situation in periods 10, 11 and 12 (and there are more in my data). I want to summarize this data and find out how many times people went from A to C, A to D, but also C to G, etc. So basically I want to aggregate this data on each column together with the column that follows it. My ultimate goal is to turn this data into a Sankey diagram.
To illustrate:
# One row per observed sequence of states (periods 10, 11, 12) and how often
# that sequence occurred.
df <- data.frame(
  s10 = strsplit("AAAABBBBBC", "")[[1]],
  s11 = strsplit("CCDDEEFFFF", "")[[1]],
  s12 = strsplit("GHIGJKMNNN", "")[[1]],
  freq = c(10, 20, 30, 40, 50, 60, 70, 40, 20, 20)
)
s10 s11 s12 freq
1 A C G 10
2 A C H 20
3 A D I 30
4 A D G 40
5 B E J 50
6 B E K 60
7 B F M 70
8 B F N 40
9 B F N 20
10 C F N 20
And I aim to get this result:
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 C H 20
8 D G 40
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
I got this result by first aggregating the sum of frequencies for s10 and s11, and for s11 and s12, and then changing the column names and binding the results together. It works, but I intend to do this with more columns, so I'm sure there is a more efficient way to do it. See the code I used below:
# Total freq for each observed s10 -> s11 transition.
bl1 <- df %>%
  group_by(s10, s11) %>%
  summarise(freq = sum(freq)) %>%
  as.data.frame()
# Total freq for each observed s11 -> s12 transition.
bl2 <- df %>%
  group_by(s11, s12) %>%
  summarise(freq = sum(freq)) %>%
  as.data.frame()
# Give both tables the same column names so they can be stacked.
pair_names <- c("colA", "colB", "freq")
names(bl1) <- pair_names
names(bl2) <- pair_names
rbind(bl1, bl2)
Any help is much appreciated!
You can rbind the selected columns of the data.frames together and then use aggregate. The only trick is to rename the columns so that they match. For this, I use setNames.
# Rename each adjacent column pair to the common names colA / colB, stack the
# two pieces, and sum freq per (colA, colB) combination.
pair_cols <- c("colA", "colB", "freq")
first_step <- setNames(df[c("s10", "s11", "freq")], pair_cols)
second_step <- setNames(df[c("s11", "s12", "freq")], pair_cols)
aggregate(freq ~ colA + colB,
          data = rbind(first_step, second_step),
          FUN = sum)
this returns the desired result.
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 D G 40
8 C H 20
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
OK. I gave it a go and had some fun with benchmarking. An alternate approach (the one I used) was to use aggregate() itself. See fun1 for the implementation. I have made it fit this particular example, and of course it will need tweaking to work with data frames of other widths.
Edit: I have removed the data frame creation from the functions and added the benchmarking output.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at first use.
library(dplyr)
library(microbenchmark)

# Shared input for both benchmarked functions (read as a global).
df <- data.frame(
  s10 = unlist(strsplit("AAAABBBBBC", "")),
  s11 = unlist(strsplit("CCDDEEFFFF", "")),
  s12 = unlist(strsplit("GHIGJKMNNN", "")),
  freq = c(10, 20, 30, 40, 50, 60, 70, 40, 20, 20)
)
# Baseline: aggregate each adjacent column pair with dplyr, rename the columns
# to colA / colB / freq, and stack the two tables.
# Reads the global `df`; returns a data.frame.
fun0 <- function() {
  out_names <- c("colA", "colB", "freq")
  first_pair <- df %>%
    group_by(s10, s11) %>%
    summarise(freq = sum(freq)) %>%
    as.data.frame()
  second_pair <- df %>%
    group_by(s11, s12) %>%
    summarise(freq = sum(freq)) %>%
    as.data.frame()
  names(first_pair) <- out_names
  names(second_pair) <- out_names
  rbind(first_pair, second_pair)
}
# Alternative: glue each adjacent column pair into a single key ("AC", "CG",
# ...) and sum the frequencies per key with aggregate().
# Reads the global `df` by column position (columns 1-2 and 2-3 form the
# pairs, `freq` holds the weights); returns a data.frame with columns x (key)
# and f (summed frequency).
fun1 <- function() {
  # paste0() is vectorised over whole columns, so the row-wise apply() (which
  # first converts the data frame to a matrix) is not needed.
  a <- paste0(df[[1]], df[[2]])
  b <- paste0(df[[2]], df[[3]])
  z <- data.frame(x = c(a, b), f = rep(df$freq, 2))
  aggregate(f ~ x, data = z, FUN = sum)
}
fun0()
fun1()
#benchmarking
MB_res <- microbenchmark( fun0=fun0(), fun1=fun1() , times=1000)
MB_res
Results were:
Unit: milliseconds
expr min lq mean median uq max neval
fun0 2.218889 2.587820 2.773454 2.676921 2.785586 6.020277 1000
fun1 1.472971 1.737751 1.908966 1.842152 1.910118 8.915407 1000
I want to combine different column value rows into a new column row.
Example df like this:
df <- data.frame(area = c("a","b","c","a"),
d = c(1,3,6,3),
f = c(3,2,8,2),
e = c(4,7,1,8),
g = c(6,9,2,9))
Here a, b and c are values of the area column. I want to combine (sum) the rows for a and c into one row to get:
area d f e g
a+c+a 10 13 13 17
b 3 2 7 9
I have tried this:
# Attempt from the question (reported not to work).
# NOTE(review): `area` is referenced without df$, and replace() is given only
# two of its three arguments (x, list, values). `area == c("a","c")` also
# compares element by element with recycling rather than testing set
# membership; %in% is presumably what was meant. Confirm.
df <- aggregate(df, list(area=replace(area == c("a","c"), "a+c+a")), sum)
But it won't work.
Thank you.
Another solution using dplyr
library(dplyr)
# Rows whose area is one of the values to be merged into a single row.
in_merged <- df$area %in% c("a", "c")
# Column totals over those rows; the area column (first column) is dropped
# before summing. across() replaces the superseded summarize_all().
aggr <- df[in_merged, -1] %>%
  summarise(across(everything(), sum))
# Untouched rows first, then the merged row under its combined label.
rbind(df[!in_merged, ],
      bind_cols(area = "a+c+a", aggr))
# area d f e g
# 2 b 3 2 7 9
# 1 a+c+a 10 13 13 17