dplyr approach for sumifs like excel - r

I have a key in tableA, and in tableB I have a key and a numeric column. How can I reproduce the Excel formula sumifs(numeric,tableB.key,tableA.key,tableA.key,1)
with dplyr, without joining the two tables?
I already tried summarise_if within mutate:
mutate(newColumn = summarise_if(tableB, .predicate = tableB$Key == .$Key, .funs = sum(tableB$numeric)))
but i get this error
In tableB$Key == .$Key:
longer object length is not a multiple of shorter object length
tableA tableB
key key numeric
1 1 10
2 1 30
3
4
Expected
key newColumn
1 40
2
3
4

you could try
library(tidyverse)
tableA <- tibble(key = c(1, 2, 3, 4))
tableB <- tibble(key = c(1, 1, 2, 2),
numeric = c(10, 30, 10, 15))
(function() {
  # Sum `numeric` per key, keeping only keys that also occur in tableA.
  tmpDF <- tableB %>%
    filter(key %in% tableA$key) %>%
    group_by(key) %>%
    summarise(newColumn = sum(numeric))
  # Look every tableA key up by position with match() instead of comparing
  # the two key vectors with `==`: the vectors have different lengths, so `==`
  # would recycle the shorter one and only give the right answer when the keys
  # happen to line up. Keys without a match yield NA, which coalesce() turns
  # into 0.
  tableA %>%
    mutate(new = coalesce(tmpDF$newColumn[match(key, tmpDF$key)], 0))
})()
which gives
# A tibble: 4 x 2
# key new
# <dbl> <dbl>
# 1 40
# 2 25
# 3 0
# 4 0

Related

How to convert multiple binary columns into a single character column?

I would like to convert data frame df1 into data frame df2.
id <- c(1,2,3)
outcome_1 <- c(1,0,1)
outcome_2 <- c(1,1,0)
df1 <- data.frame(id,outcome_1,outcome_2)
id <- c(1,2,3)
outcome <- c("1,2","2","1")
df2 <- data.frame(id,outcome)
The answers to the following question almost do what I want, but in my case a row can have more than one positive outcome (e.g. first row needs to be "1,2"). Also, I would like the resulting column to be a character column.
R: Converting multiple binary columns into one factor variable whose factors are binary column names
Please kindly help. Thank you.
Subset the substrings of the outcomes with their binary values coerced as.logical.
apply(df1[-1], 1, \(x) toString(substring(names(df1)[-1], 9)[as.logical(x)]))
# [1] "1, 2" "2" "1"
or
apply(df1[-1], 1, \(x) paste(substring(names(df1)[-1], 9)[as.logical(x)], collapse=','))
# [1] "1,2" "2" "1"
Using the first method:
cbind(df1[1], outcome=apply(df1[-1], 1, \(x) toString(substring(names(df1)[-1], 9)[as.logical(x)])))
# id outcome
# 1 1 1, 2
# 2 2 2
# 3 3 1
If you want a nested list you may use list2DF.
l <- list2DF(c(df1[1],
outcome=list(apply(df1[-1], 1, \(x)
as.numeric(substring(names(df1)[-1], 9))[as.logical(x)]))))
l
# id outcome
# 1 1 1, 2
# 2 2 2
# 3 3 1
where
str(l)
# 'data.frame': 3 obs. of 2 variables:
# $ id : num 1 2 3
# $ outcome:List of 3
# ..$ : num 1 2
# ..$ : num 2
# ..$ : num 1
Data:
df1 <- structure(list(id = c(1, 2, 3), outcome_1 = c(1, 0, 1), outcome_2 = c(1,
1, 0)), class = "data.frame", row.names = c(NA, -3L))
Here is one more tidyverse approach:
library(dplyr)
library(tidyr)
df1 %>%
mutate(across(-id, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}'), .keep="unused") %>%
unite(outcome, starts_with('new'), na.rm = TRUE, sep = ', ') %>%
mutate(outcome = gsub('outcome_', '', outcome))
id outcome
1 1 1, 2
2 2 2
3 3 1
How many outcome_ columns are there? If just 2, this will work fine.
library(dplyr)
df1 %>%
rowwise() %>%
summarise(id = id,
outcome = paste(which(c(outcome_1,outcome_2)==1), collapse =","))
# A tibble: 3 x 2
id outcome
<dbl> <chr>
1 1 1,2
2 2 2
3 3 1
If there are more than 2, try this:
df1 %>%
rowwise() %>%
summarise(id=id,
outcome = paste(which(c_across(-id)== 1), collapse =","))
Another possible solution, based on dplyr and purrr::pmap:
library(tidyverse)
# Build one comma-separated string per row from the positive outcome columns.
# pmap_chr() (rather than pmap()) makes `outcome` a character column instead
# of a list column, which is what the question asks for.
df1 %>%
  transmute(id, outcome = pmap_chr(., ~ c(1*..2, 2*..3) %>% .[. != 0] %>% toString))
#> id outcome
#> 1 1 1, 2
#> 2 2 2
#> 3 3 1
Or simply:
library(tidyverse)
pmap_dfr(df1, ~ data.frame(id = ..1, outcome = c(1*..2, 2*..3) %>% .[. != 0]
%>% toString))
#> id outcome
#> 1 1 1, 2
#> 2 2 2
#> 3 3 1
outcome_col_idx <- grepl("outcome", colnames(df1))
cbind(
df1[,!outcome_col_idx, drop = FALSE],
outcome = apply(
replace(df1, df1 == 0, NA)[,outcome_col_idx],
1,
function(x){
as.factor(
toString(
gsub(
"outcome_",
"",
names(x)[complete.cases(x)]
)
)
)
}
)
)

Replacing value depending on paired column

I have a dataframe with two columns per sample (n > 1000 samples):
df <- data.frame(
"sample1.a" = 1:5, "sample1.b" = 2,
"sample2.a" = 2:6, "sample2.b" = c(1, 3, 3, 3, 3),
"sample3.a" = 3:7, "sample3.b" = 2)
If there is a zero in column .b, the corresponding value in column .a should be set to NA.
I thought of writing a function over the column names (without suffix) that filters each pair of columns and conditionally replaces values. Is there a simpler approach based on tidyverse?
We can split the data.frame into a list of data.frames and do the replacement in base R
df1 <- do.call(cbind, lapply(split.default(df,
sub("\\..*", "", names(df))), function(x) {
x[,1][x[2] == 0] <- NA
x}))
Or another option is Map
acols <- endsWith(names(df), "a")
bcols <- endsWith(names(df), "b")
df[acols] <- Map(function(x, y) replace(x, y == 0, NA), df[acols], df[bcols])
Or if the columns are alternate with 'a', 'b' columns, use a logical index for recycling, create the logical matrix with 'b' columns and assign the corresponding values in 'a' columns to NA
df[c(TRUE, FALSE)][df[c(FALSE, TRUE)] == 0] <- NA
or an option with tidyverse by reshaping into 'long' format (pivot_longer), changing the 'a' column to NA if there is a corresponding 0 in 'b', and reshaping back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
  # Row id so that pivot_wider() can put the rows back together.
  mutate(rn = row_number()) %>%
  # "sample1.a" -> group = "sample1", value goes into column `a` (same for b).
  pivot_longer(cols = -rn, names_sep = "\\.",
               names_to = c('group', '.value')) %>%
  # Blank out `a` wherever the paired `b` is 0; `a` keeps its other values.
  mutate(a = replace(a, b == 0, NA)) %>%
  pivot_wider(names_from = group, values_from = c(a, b)) %>%
  select(-rn)
# A tibble: 5 x 6
#  a_sample1 a_sample2 a_sample3 b_sample1 b_sample2 b_sample3
#      <int>     <int>     <int>     <dbl>     <dbl>     <dbl>
#1         1         2         3         2         1         2
#2         2         3         4         2         3         2
#3         3         4         5         2         3         2
#4         4         5         6         2         3         2
#5         5         6         7         2         3         2
# (the example data has no 0 in any 'b' column, so no value is set to NA here)

How to group the data by id and get unique values of all columns in R?

I have a table with ID and other columns. I want to group the data by Ids and get the unique values of all columns.
from above table group by ID and get unique(Alt1, Alt2, Alt3)
The result should be in vector form
A -> 1,2,3,5
B ->1,3,4,5,7
We can get data in long format and for each ID make a list of unique values.
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>%
summarise(value = list(unique(value))) %>%
unnest(value)
df1
# ID value
# <fct> <dbl>
# 1 A 1
# 2 A 3
# 3 A 2
# 4 A 5
# 5 B 1
# 6 B 4
# 7 B 5
# 8 B 3
# 9 B 6
#10 B 7
We can store it as a list if needed using split.
split(df1$value, df1$ID)
#$A
#[1] 1 3 2 5
#$B
#[1] 1 4 5 3 6 7
data.table equivalent of the above would be :
# Package names are case-sensitive: the package is `data.table`.
library(data.table)
# Turn df into a data.table so the `[i, j, by]` syntax below works.
setDT(df)
# Melt to long format, then collect the unique values per ID in a list column.
df2 <- melt(df, id.vars = 'ID')[, .(value = list(unique(value))), ID]
unique values are present in df2$value as a vector.
data
df <- data.frame(ID = c('A', 'A', 'B', 'B'),
Alt1 = c(1, 2, 1, 3),
Alt2 = c(3, 5, 4, 6),
Alt3 = c(1, 3, 5, 7))

How to keep all columns when concatenating rows with dplyr::summarise?

I want to aggregate one column (C) in a data frame according to one grouping variable A, and separate the individual values by a comma while keeping all the other column B. However, B can either have a character (which is always the same for all the rows) or be empty. In this case, I would like to keep the character whenever it is present on one row.
Here is a simplified example:
data <- data.frame(A = c(rep(111, 3), rep(222, 3)), B = c("", "", "", "a" , "", "a"), C = c(5:10))
data
Based on this question Collapse / concatenate / aggregate a column to a single comma separated string within each group, I have the following code:
library(dplyr)
data %>%
group_by(A) %>%
summarise(test = toString(C)) %>%
ungroup()
Here it is what I would like to obtain:
A B C
1 111 5,6,7
2 222 a 8,9,10
Use summarise_all()
To keep all your columns, you can use summarise_all():
data %>%
group_by(A) %>%
summarise_all(toString)
# A tibble: 2 x 3
A B C
<dbl> <chr> <chr>
1 111 1, 2, 1 5, 6, 7
2 222 2, 1, 2 8, 9, 10
Edit for updated question
You can add a B column to summarise to achieve the desired results:
data <- data.frame(A = c(rep(111, 3), rep(222, 3)), B = c("", "", "", "a" , "", "a"), C = c(5:10))
data
library(dplyr)
data %>%
group_by(A) %>%
summarise(B = names(sort(table(B),decreasing=TRUE))[1],
C = toString(C)) %>%
ungroup()
# A tibble: 2 x 3
A B C
<dbl> <fct> <chr>
1 111 "" 5, 6, 7
2 222 a 8, 9, 10
This will return the most frequent value in the B column (table() counts each value and sort(..., decreasing = TRUE) puts the most frequent one first).
Hope this helps.
You could write one function to return unique values
library(dplyr)
get_common_vars <- function(x) {
  # At most one distinct value (possibly the empty string): return it as is.
  if (n_distinct(x) <= 1) {
    return(unique(x))
  }
  # Several distinct values: drop the empty strings and return what is left.
  non_empty <- x[x != ""]
  unique(non_empty)
}
and then use it on all columns that you are interested :
data %>%
group_by(A) %>%
mutate(C = toString(C)) %>%
summarise_at(vars(B:C), get_common_vars)
# ^------ Include all columns here
# A tibble: 2 x 3
# A B C
# <dbl> <fct> <chr>
#1 111 "" 5, 6, 7
#2 222 a 8, 9, 10
You can also use the paste() function and leverage the collapse argument.
data %>%
group_by(A) %>%
summarise(
B = paste(unique(B), collapse = ""),
C = paste(C, collapse = ", "))
# A tibble: 2 x 3
A B C
<chr> <chr> <chr>
1 111 "" 5, 6, 7
2 222 a 8, 9, 10

Summarize all group values and a conditional subset in the same call

I'll illustrate my question with an example.
Sample data:
df <- data.frame(ID = c(1, 1, 2, 2, 3, 5), A = c("foo", "bar", "foo", "foo", "bar", "bar"), B = c(1, 5, 7, 23, 54, 202))
df
ID A B
1 1 foo 1
2 1 bar 5
3 2 foo 7
4 2 foo 23
5 3 bar 54
6 5 bar 202
What I want to do is to summarize, by ID, the sum of B and the sum of B when A is "foo". I can do this in a couple steps like:
require(magrittr)
require(dplyr)
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B))
df2 <- df %>%
filter(A == "foo") %>%
group_by(ID) %>%
summarize(sumBfoo = sum(B))
left_join(df1, df2)
ID sumB sumBfoo
1 1 6 1
2 2 30 30
3 3 54 NA
4 5 202 NA
However, I'm looking for a more elegant/faster way, as I'm dealing with 10gb+ of out-of-memory data in sqlite.
require(sqldf)
my_db <- src_sqlite("my_db.sqlite3", create = T)
df_sqlite <- copy_to(my_db, df)
I thought of using mutate to define a new Bfoo column:
df_sqlite %>%
mutate(Bfoo = ifelse(A=="foo", B, 0))
Unfortunately, this doesn't work on the database end of things.
Error in sqliteExecStatement(conn, statement, ...) :
RS-DBI driver: (error in statement: no such function: IFELSE)
You can do both sums in a single dplyr statement:
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B),
sumBfoo = sum(B[A=="foo"]))
And here is a data.table version:
library(data.table)
dt = setDT(df)
dt1 = dt[ , .(sumB = sum(B),
sumBfoo = sum(B[A=="foo"])),
by = ID]
dt1
ID sumB sumBfoo
1: 1 6 1
2: 2 30 30
3: 3 54 0
4: 5 202 0
Writing up @hadley's comment as an answer
df_sqlite %>%
group_by(ID) %>%
mutate(Bfoo = if(A=="foo") B else 0) %>%
summarize(sumB = sum(B),
sumBfoo = sum(Bfoo)) %>%
collect
If you want to do counting instead of summarizing, then the answer is somewhat different. The change in code is small, especially in the conditional counting part.
df1 <- df %>%
group_by(ID) %>%
summarize(countB = n(),
countBfoo = sum(A=="foo"))
df1
Source: local data frame [4 x 3]
ID countB countBfoo
1 1 2 1
2 2 2 2
3 3 1 0
4 5 1 0
If you wanted to count the rows, instead of summing them, can you pass a variable to the function:
df1 <- df %>%
group_by(ID) %>%
summarize(RowCountB = n(),
RowCountBfoo = n(A=="foo"))
I get an error both with n() and nrow().

Resources