lapply aggregate columns in multiple dataframes R - r

I have several dataframes in a list in R. There are entries in each of those dataframes that I would like to summarise. I'm trying to get into lapply, so that would be my preferred way (though if there's a better solution, I would be happy to know it and why).
My Sample data:
df1 <- data.frame(Count = c(1,2,3), ID = c("A","A","C"))
df2 <- data.frame(Count = c(1,1,2), ID = c("C","B","C"))
dfList <- list(df1,df2)
> head(dfList)
[[1]]
Count ID
1 1 A
2 2 A
3 3 C
[[2]]
Count ID
1 1 C
2 1 B
3 2 C
I tried to implement this in lapply with
dfList_agg<-lapply(dfList, function(i) {
aggregate(i[[1:length(i)]][1L], by=list(names(i[[1:length(i)]][2L])), FUN=sum)
})
However, this gives me an error: "arguments must have same length". What am I doing wrong?
My desired output would be the sum of Column "Count" by "ID" which looks like this:
>head(dfList_agg)
[[1]]
Count ID
1 3 A
2 3 C
[[2]]
Count ID
1 3 C
2 1 B

I think you've overcomplicated it. Try this...
# For every data frame in dfList, sum the first column (Count) within each
# value of the second column (ID). aggregate() names the result columns
# Group.1 (the grouping value) and x (the sum).
dfList_agg <- lapply(dfList, function(i) {
  aggregate(i[, 1], by = list(i[, 2]), FUN = sum)
})
# R names are case-sensitive: the object is dfList_agg, not dflist_agg.
dfList_agg
[[1]]
Group.1 x
1 A 3
2 C 3
[[2]]
Group.1 x
1 B 1
2 C 3

Here is a third option
lapply(dfList, function(x) aggregate(. ~ ID, data = x, FUN = "sum"))
#[[1]]
# ID Count
#1 A 3
#2 C 3
#
#[[2]]
#ID Count
#1 B 1
#2 C 3

I guess this is what you need
# ddply() and the .() quoting helper are provided by plyr, not dplyr.
library(plyr)
# Split each data frame by ID and sum Count within each group.
lapply(dfList, function(x) ddply(x, .(ID), summarize, Count = sum(Count)))

An option with tidyverse would be
library(tidyverse)
map(dfList, ~ .x %>%
group_by(ID) %>%
summarise(Count = sum(Count)) %>%
select(names(.x)))
#[[1]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 3.00 A
#2 3.00 C
#[[2]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 1.00 B
#2 3.00 C

Related

Rename variables (columns) in datasets that are in a list

I have a list of datasets with different variables. I need to rename them according to the naming convention in the name dataframe below.
df1 <- data.frame(x1= c(1,2,3), x2=c(1,2,3))
df2 <- data.frame(x1= c(1,2,3), x3=c(1,2,3))
df3 <- data.frame(x4= c(1,2,3), x5=c(1,2,3))
mylist <- list(df1,df2,df3)
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","A","A","C"))
I can do this one by one, but I am wondering how to be more efficient and rename them all at once
newdf <- map_if(mylist, ~ "x1" %in% colnames(.x),
.f = list(. %>% rename("A"="x1")))
I was hoping something like this would work, but it doesn't:
for (i in nrow(name)) {
newdf <- map_if(mylist, ~ name[i,1] %in% colnames(.x),
.f = list(. %>% rename(name[2] = name[1])))
}
You can use setnames from data.table, which can take a list of old and new names.
library(data.table)
library(purrr)
map(mylist, ~ setnames(.x, name$old, name$new, skip_absent=TRUE))
Output
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3
Column names must be unique, so there is a typo (?) in your example (as "x1" and "x3" would both be re-labelled as "A").
If we fix the typo, here is an option using map and rename_with.
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","C","D","E"))
library(tidyverse)
mylist %>%
map(function(df) df %>% rename_with(~ name$new[match(.x, name$old)]))
#[[1]]
# A B
#1 1 1
#2 2 2
#3 3 3
#
#[[2]]
# A C
#1 1 1
#2 2 2
#3 3 3
#
#[[3]]
# D E
#1 1 1
#2 2 2
#3 3 3
You could use set_names + recode:
library(tidyverse)
map(mylist, set_names, ~ recode(.x, !!!deframe(name)))
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3

R Count duplicates between two dataframes

I have two dataframes df1 and df2. They both have a column 'ID'. For each row in df1, I would like to find out how many duplicates of its ID there are in df2 and add the count to that row. If there are no duplicates, the count should return as 0.
# # A tibble: 4 x 3
# ID a b
# <dbl> <dbl> <dbl>
# 1 1_234 1 1
# 2 1_235 1 2
# 3 2_222 1 1
# 4 2_654 1 2
# # A tibble: 4 x 3
# ID a b
# <dbl> <dbl> <dbl>
# 1 1_234 1 1
# 2 1_235 1 2
# 3 1_234 1 1
# 4 3_234 1 2
Using dplyr:
Your data:
df1 <- data.frame(ID = c("1_234","1_235","2_222","2_654"),
a = c(1,1,1,1),
b = c(1,2,1,2))
df2 <- data.frame(ID = c("1_234","1_235","1_234","3_235"),
a = c(1,1,1,1),
b = c(1,2,1,2))
Edit: considering only the IDs:
# left_join(), mutate() and %>% come from dplyr.
library(dplyr)
# table(df2$ID) counts how often each ID occurs in df2; as.data.frame() turns
# that into the columns Var1 (the ID) and Freq (the count). Keeping Var1 as
# character avoids joining a character key against a factor key.
output <- left_join(df1,
                    as.data.frame(table(df2$ID), stringsAsFactors = FALSE),
                    by = c("ID" = "Var1")) %>%
  # IDs that never occur in df2 have no match and get NA; report them as 0.
  mutate(Freq = ifelse(is.na(Freq), 0, Freq))
Output:
ID a b Freq
1 1_234 1 1 2
2 1_235 1 2 1
3 2_222 1 1 0
4 2_654 1 2 0
A base R option using subset + aggregate
subset(
aggregate(
n ~ .,
rbind(
cbind(df1, n = 1),
cbind(df2, n = 1)
), function(x) length(x) - 1
), ID %in% df1$ID
)
gives
ID a b n
1 1_234 1 1 2
2 2_222 1 1 0
3 1_235 1 2 1
4 2_654 1 2 0
I think you can do it with a simple sapply() and base r (no extra packages).
df1$count <- sapply(df1$ID, function(x) sum(df2$ID == x))
We may also use outer
df1$count <- rowSums(outer(df1$ID, df2$ID, FUN = `==`))
df1$count
[1] 2 1 0 0
We could use semi_join and n() to get the count of duplicates:
library(dplyr)
df1 %>%
semi_join(df2, by="ID") %>%
summarise(duplicates_df1_df2 = n())
Output:
duplicates_df1_df2
1 2

Drop list columns from dataframe using dplyr and select_if

Is it possible to drop all list columns from a dataframe using dplyr select, similar to dropping a single column?
df <- tibble(
a = LETTERS[1:5],
b = 1:5,
c = list('bob', 'cratchit', 'rules!','and', 'tiny tim too"')
)
df %>%
select_if(-is.list)
Error in -is.list : invalid argument to unary operator
This seems to be a workable workaround, but I wanted to know if it can be done with select_if.
df %>%
select(-which(map(df,class) == 'list'))
Use Negate
df %>%
select_if(Negate(is.list))
# A tibble: 5 x 2
a b
<chr> <int>
1 A 1
2 B 2
3 C 3
4 D 4
5 E 5
There is also purrr::negate that would give the same result.
We can use Filter from base R
Filter(Negate(is.list), df)
# A tibble: 5 x 2
# a b
# <chr> <int>
#1 A 1
#2 B 2
#3 C 3
#4 D 4
#5 E 5

count by all variables / count distinct with dplyr

Say I have this data.frame :
library(dplyr)
df1 <- data.frame(x=rep(letters[1:3],1:3),y=rep(letters[1:3],1:3))
# x y
# 1 a a
# 2 b b
# 3 b b
# 4 c c
# 5 c c
# 6 c c
I can group and count easily by mentioning the names :
df1 %>%
count(x,y)
# A tibble: 3 x 3
# x y n
# <fctr> <fctr> <int>
# 1 a a 1
# 2 b b 2
# 3 c c 3
How do I group by everything without mentioning individual column names, in the most compact/readable way?
We can pass the input itself to the ... argument and splice it with !!! :
df1 %>% count(., !!!.)
#> x y n
#> 1 a a 1
#> 2 b b 2
#> 3 c c 3
Note : see edit history to make sense of some comments
With base we could do : aggregate(setNames(df1[1],"n"), df1, length)
For those who wouldn't get the voodoo you are using in the accepted answer, if you don't need to use dplyr, you can do it with data.table:
setDT(df1)
df1[, .N, names(df1)]
# x y N
# 1: a a 1
# 2: b b 2
# 3: c c 3
Have you considered the (now superseded) group_by_all()?
df1 <- data.frame(x=rep(letters[1:3],1:3),y=rep(letters[1:3],1:3))
df1 %>% group_by_all() %>% count
df1 %>% group_by(across()) %>% count()
df1 %>% count(across()) # don't know why this returns a data.frame and not tibble
See the colwise vignette "other verbs" section for explanation... though honestly I get turned around myself sometimes.

Reorder a single column in a dataframe within each level of another column

Probably the solution to this problem is really easy but I just can't see it. Here is my sample data frame:
df <- data.frame(id=c(1,1,1,2,2,2), value=rep(1:3,2), level=rep(letters[1:3],2))
df[6,2] <- NA
And here is the desired output that I would like to create:
df$new_value <- c(3,2,1,NA,2,1)
So the order of all columns is the same, and for the new_value column the value column order is reversed within each level of the id column. Any ideas? Thanks!
As I understood your question, it's a coincidence that your data is sorted, if you just want to reverse the order without sorting:
library(dplyr)
df %>% group_by(id) %>% mutate(new_value = rev(value)) %>% ungroup
# A tibble: 6 x 4
id value level new_value
<dbl> <int> <fctr> <int>
1 1 1 a 3
2 1 2 b 2
3 1 3 c 1
4 2 1 a NA
5 2 2 b 2
6 2 NA c 1
A slightly different approach, using the parameters in the sort function:
library(dplyr)
df %>% group_by(id) %>%
mutate(value = sort(value, decreasing=TRUE, na.last=FALSE))
Output:
# A tibble: 6 x 3
# Groups: id [2]
id value level
<dbl> <int> <fctr>
1 1.00 3 a
2 1.00 2 b
3 1.00 1 c
4 2.00 NA a
5 2.00 2 b
6 2.00 1 c
Hope this helps!
We can use order on the missing values and on the column itself
library(dplyr)
df %>%
group_by(id) %>%
mutate(new_value = value[order(!is.na(value), -value)])
# A tibble: 6 x 4
# Groups: id [2]
# id value level new_value
# <dbl> <int> <fctr> <int>
#1 1.00 1 a 3
#2 1.00 2 b 2
#3 1.00 3 c 1
#4 2.00 1 a NA
#5 2.00 2 b 2
#6 2.00 NA c 1
Or using the arrange from dplyr
df %>%
arrange(id, !is.na(value), desc(value)) %>%
transmute(new_value = value) %>%
bind_cols(df, .)
Or using base R and specify the na.last option as FALSE in order
with(df, ave(value, id, FUN = function(x) x[order(-x, na.last = FALSE)]))
#[1] 3 2 1 NA 2 1

Resources