Finding unique columns - r

Please help. I have a data frame like the following:
df <- data.frame("G"=c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7),
"C"=c(1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0),
"SKU"=c("a","b","c","a","c","d","a","c","d","a","b","c","a","b","c","b","c","d","a","b","c"))
df
G C SKU
1 1 a
1 0 b
1 0 c
2 0 a
2 1 c
2 0 d
3 1 a
3 0 c
3 0 d
4 0 a
4 1 b
4 0 c
5 1 a
5 0 b
5 0 c
6 0 b
6 1 c
6 0 d
7 0 a
7 1 b
7 0 c
I want to find unique "blocks" in this data frame. For example, here we have three blocks: (a,b,c) , (a,c,d) and (b,c,d). I want to create table with these unique blocks, and to sum variable "C" in all "G" for each SKU in certain block. And finally get data frame like this:
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
As I said, here three unique blocks, New_G - identificator of every block, and New_C - the sum of "C" for each SKU that in certain block. (For example, see the first row. SKU = "a", NEW_C = 2 - it means that in the old data frame SKU "a" while being in the block (a,b,c) has variable "C" = 1 two times) (Another example: see the forth row. Again SKU = "a", but NEW_C = 1 - it means that in the old data frame SKU "a" while being in the block (a,c,d) has variable "C" = 1 once)
If my question is not clear let me now, please.

You can create an index with toString and use a simple sum by group from there. The challenge is getting the index of unique groups:
ind <- df %>% group_by(G) %>% summarise(temp=toString(SKU)) %>% mutate(fac=as.numeric(as.factor(temp)))
ind <- rep(ind$fac, each=3)
df$ind <- ind
df %>% group_by(ind, SKU) %>% summarise(New_C = sum(C))
# Source: local data frame [9 x 3]
# Groups: ind [?]
#
# ind SKU New_C
# (dbl) (fctr) (dbl)
# 1 1 a 2
# 2 1 b 2
# 3 1 c 0
# 4 2 a 1
# 5 2 c 1
# 6 2 d 0
# 7 3 b 0
# 8 3 c 1
# 9 3 d 0
Edit
This is possibly quicker:
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))

Using dplyr:
library(dplyr)
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
Output:
bin SKU New_C
(chr) (fctr) (dbl)
1 a,b,c a 2
2 a,b,c b 2
3 a,b,c c 0
4 a,c,d a 1
5 a,c,d c 1
6 a,c,d d 0
7 b,c,d b 0
8 b,c,d c 1
9 b,c,d d 0

Here is a solution in base R.
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
> Newdf
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
The dplyr solutions provided by Pierre Lafortune and Edward R. Mazurek are much faster. The BuildRandomDF below builds a data frame very similar to the one the OP posted.
library(gtools)
BuildRandomDF <- function(n) {
set.seed(117)
samp1 <- sample(3:5, n, replace = TRUE)
Len5 <- length(which(samp1==5))
Len4 <- length(which(samp1==4))
Len3 <- length(which(samp1==3))
perm5 <- permutations(5,5,letters[1:5])
perm4 <- permutations(4,4,letters[1:4])
perm3 <- permutations(3,3,letters[1:3])
sampPerm5 <- sample(nrow(perm5), Len5, replace = TRUE)
sampPerm4 <- sample(nrow(perm4), Len4, replace = TRUE)
sampPerm3 <- sample(nrow(perm3), Len3, replace = TRUE)
G <- do.call(c, lapply(1:n, function(x) rep(x, samp1[x])))
i <- j <- k <- 0L
SKU <- do.call(c, lapply(1:n, function(x) {
if (samp1[x]==3) {
perm3[sampPerm3[j <<- j+1L],]
} else if (samp1[x]==4) {
perm4[sampPerm4[k <<- k+1L],]
} else {
perm5[sampPerm5[i <<- i+1L],]
}}))
C <- sample(0:1, length(SKU), replace = TRUE)
data.frame(G, C, SKU)
}
Below are the functions:
library(dplyr)
DplyrTest <- function(df) {
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
}
DplyrCheck2 <- function(df) {
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
}
BaseTest <- function(df) {
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
Newdf
}
Below are the timings:
df <- BuildRandomDF(10^4)
system.time(df1 <- DplyrCheck(df))
user system elapsed
0.43 0.00 0.43
system.time(df2 <- DplyrCheck2(df))
user system elapsed
0.39 0.00 0.39
system.time(df3 <- BaseTest(df))
user system elapsed
5.15 0.00 5.19
all(sort(unlist(df3$New_C))==sort(df1$New_C))
[1] TRUE
all(sort(df1$New_C)==sort(df2$New_C))
[1] TRUE

Related

Use a grouped field to filter the original table in the summarise

Edit.
I´ve rewritten the question hoping it makes more sense.
Given this data:
> df
Cat1 Cat2 Q
1 A B 1
2 A C 1
3 B D 1
4 B C 1
5 C C 1
6 C D 1
You can easily group by Cat1 and sum Q using dplyr:
> df %>% group_by(Cat1) %>% summarise(Sum1 = sum(Q))
# A tibble: 3 x 2
Cat1 Sum1
<fct> <dbl>
1 A 2
2 B 2
3 C 2
Now, my question is, as a next step, can you use the groups in the group by (i.e. A, B and C) to operate in the original table? For example, how could you sum Q when Cat2 equals each group?
Meaning, for A there is no match in Cat2, so the sum of Q would be 0. For B there is only a match in the first row, so the sum of Q would be 1. For C there is a match in the second, the fourth and the fifth row, so the sum of Q would be 3:
# A tibble: 3 x 3
Cat1 Sum1 Sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 1
3 C 2 3
Note that this is not what I´m asking:
> df %>% group_by(Cat1) %>% summarise(Sum1 = sum(Q), Sum2 = sum(Q[Cat1==Cat2]))
# A tibble: 3 x 3
Cat1 Sum1 Sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 0
3 C 2 1
#antoine-sac propose in the comments to duplicate df and do a left join on Cat1(Grouped) = Cat2. Of course this would solve the problem, but it´s not the question I´m trying to answer.
Code:
Cat1 <- c("A","A","B","B","C","C")
Cat2 <- c("B","C","D","C","C","D")
Cat1 <- factor(Cat1, levels = c("A","B","C","D"))
Cat2 <- factor(Cat2, levels = c("A","B","C","D"))
Q <- c(1,1,1,1,1,1)
df <- data.frame(Cat1, Cat2, Q)
I think a join is the cleanest way to do it. Think about yourself reading your code again in 6 months: you want the meaning of your code to be obvious.
library("dplyr")
df <- read.table(text = " Cat1 Cat2 Q
1 A B 1
2 A C 1
3 B D 1
4 B C 1
5 C C 1
6 C D 1", stringsAsFactor = FALSE)
df1 <- df %>%
group_by(Cat1) %>%
summarise(Sum1 = sum(Q))
df2 <- df %>%
group_by(Cat2) %>%
summarise(Sum2 = sum(Q))
full_join(df1, df2, by = c("Cat1" = "Cat2")) %>%
tidyr::replace_na(list(Sum1 = 0, Sum2 = 0))
# # A tibble: 4 x 3
# Cat1 Sum1 Sum2
# <chr> <dbl> <dbl>
# 1 A 2 0
# 2 B 2 1
# 3 C 2 3
# 4 D 0 2
With a full_join, you keep all values in Cat1 or Cat2 (A, B, C , D) but you can use a left_join (to keep A, B, C), a right_join (to keep B, C, D) or an inner_join (to keep B, C).
These are respectively the values in Cat1, in Cat2 or both in Cat1 and Cat2.
It may seem painful, especially if you have a lot of categories, but if you have to do it more than once, it is actually easy to automate in a function.
EDIT: actually it is not easy at all if you want to use dplyr due to non-standard evaluation. Here's how you'd do it:
sum_cats <- function(df, cat1, cat2, value) {
cat1 <- enquo(cat1)
cat2 <- enquo(cat2)
value <- enquo(value)
sum1 <- paste0("Sum_", quo_name(cat1))
df1 <- df %>%
rename(cat = !! cat1) %>%
group_by(cat) %>%
summarise(!! sum1 := sum(!! value))
sum2 <- paste0("Sum_", quo_name(cat2))
df2 <- df %>%
rename(cat = !! cat2) %>%
group_by(cat) %>%
summarise(!! sum2 := sum(!! value))
full_join(df1, df2, by = "cat") %>%
tidyr::replace_na(rlang::list2(!! sum1 := 0, !! sum2 := 0))
}
Now you can just call sum_cats to do all the work:
df %>%
sum_cats(Cat1, Cat2, Q)
# cat Sum_Cat1 Sum_Cat2
# <chr> <dbl> <dbl>
# 1 A 2 0
# 2 B 2 1
# 3 C 2 3
# 4 D 0 2
You can try
df %>%
group_by(Cat1) %>%
summarise(sum1 = sum(Q),
sum2 = sum(ifelse(.$Cat2 == Cat1[1], Q, 0)))
# A tibble: 3 x 3
Cat1 sum1 sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 1
3 C 2 3
By using the .$ you will compare and sum up the ungrouped original data.
You probably could construct a new column and summarise from the new column:
df %>% mutate(new_Quantity=ifelse(Start == End, Quantity,0)) %>% group_by(Start) %>% summarise(Sum = sum(new_Quantity))

repeat dataframe n times whilst adding column

This is my reproducible code:
df <- data.frame(x = c(1, 2), y = c(3, 4))
df1 <- df %>% mutate(z = 1)
df2 <- df %>% mutate(z = 2)
df3 <- df %>% mutate(z = 3)
df <- rbind(df1, df2, df3)
df
I repeat the original data frame df 3 times, whilst adding one column where the number in the column indicated the repetition. In my use case, I have to do this more than 3 times. I could use a loop but is there a neater way? I guess i cannot use expand.grid.
You can also do it with a merge:
dfz <- data.frame(z = 1:3)
merge(df, dfz)
# x y z
# 1 1 3 1
# 2 2 4 1
# 3 1 3 2
# 4 2 4 2
# 5 1 3 3
# 6 2 4 3
We can create a list column and unnest
library(tidyverse)
df %>%
mutate(z = list(1:3)) %>%
unnest %>%
arrange(z)
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
We can also do a cross join with sqldf. This creates a Cartesian Product of df and the reps tables:
library(sqldf)
reps <- data.frame(z = 1:3)
sqldf("select * from df, reps order by z")
or simply with map_dfr from purrr:
library(purrr)
map_dfr(1:3, ~cbind(df, z = .))
Output:
x y z
1 1 3 1
2 2 4 1
3 1 3 2
4 2 4 2
5 1 3 3
6 2 4 3
Yet another option using base R
n <- 3
do.call(rbind,
Map(`[<-`, replicate(n = n,
expr = df,
simplify = FALSE),
"z",
value = seq_len(n)))
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
A few other ways not covered yet:
# setup
df = data.frame(x = c(1, 2), y = c(3, 4))
n = 3
# simple row indexing, add column manually
result = df[rep(1:nrow(df), 3), ]
result$id = rep(1:n, each = nrow(df))
# cross join in base
merge(df, data.frame(id = 1:n), by = NULL)
# cross join in tidyr
tidyr::crossing(df, data.frame(id = 1:n))
# dplyr version of the row-index method above
slice(df, rep(1:n(), n)) %>% mutate(id = rep(1:n, each = nrow(df)))
Inspiration drawn heavily from an old question of mine, How can I repeat a data frame?. Basically the same question but without the id column requirement.

Consolidate flag columns into single column in R

Suppose I have a dataframe like this
testtbl <- data.frame(ID = c('1','2','3','4'),
A = c(1,0,1,1),
B = c(1,1,1,1),
C = c(0,0,0,1),
D = c(0,1,1,1))
> testtbl
ID A B C D
1 1 1 1 0 0
2 2 0 1 0 1
3 3 1 1 0 1
4 4 1 1 1 1
Where columns A-D are flags that can either be 1 or 0. I would like to consolidate these columns into 1 column, where the new dataframe would look something like:
> testtbl
ID flag
1 1 A,B
2 2 B,D
3 3 A,B,D
4 4 A,B,C,D
At little confused on how I would approach this and would appreciate any hints or help.
A solution from dplyr and tidyr.
library(dplyr)
library(tidyr)
testtbl2 <- testtbl %>%
gather(Col, Val, -ID) %>%
filter(Val == 1) %>%
select(-Val) %>%
group_by(ID) %>%
summarise(flag = toString(Col))
testtbl2
# # A tibble: 4 x 2
# ID flag
# <fctr> <chr>
# 1 1 A, B
# 2 2 B, D
# 3 3 A, B, D
# 4 4 A, B, C, D
You can also do it without any libraries with an apply:
testtbl <- data.frame(ID = c('1','2','3','4'),
A = c(1,0,1,1),
B = c(1,1,1,1),
C = c(0,0,0,1),
D = c(0,1,1,1))
test<-data.frame(ID=testtbl$ID, flag=(apply(testtbl[,-1], 1, function(x) paste0(names(x)[which(x==1)], collapse=','))))
Base R
do.call(rbind, lapply(split(testtbl, testtbl$ID), function(x)
data.frame(ID = x[1],
flag = paste(sort(names(x)[-1][x[-1] > 0]),
collapse = ","))))
# ID flag
#1 1 A,B
#2 2 B,D
#3 3 A,B,D
#4 4 A,B,C,D

Count number of values in row using dplyr

This question should have a simple, elegant solution but I can't figure it out, so here it goes:
Let's say I have the following dataset and I want to count the number of 2s present in each row using dplyr.
set.seed(1)
ID <- LETTERS[1:5]
X1 <- sample(1:5, 5,T)
X2 <- sample(1:5, 5,T)
X3 <- sample(1:5, 5,T)
df <- data.frame(ID,X1,X2,X3)
library(dplyr)
Now, the following works:
df %>%
rowwise %>%
mutate(numtwos = sum(c(X1,X2,X3) == 2))
But how do I avoid typing out all of the column names?
I know this is probably easier to do without dplyr, but more generally I want to know how I can use dplyr's mutate with multiple columns without typing out all the column names.
Try rowSums:
> set.seed(1)
> ID <- LETTERS[1:5]
> X1 <- sample(1:5, 5,T)
> X2 <- sample(1:5, 5,T)
> X3 <- sample(1:5, 5,T)
> df <- data.frame(ID,X1,X2,X3)
> df
ID X1 X2 X3
1 A 2 5 2
2 B 2 5 1
3 C 3 4 4
4 D 5 4 2
5 E 2 1 4
> rowSums(df == 2)
[1] 2 1 0 1 1
Alternatively, with dplyr:
> df %>% mutate(numtwos = rowSums(. == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
Here's another alternative using purrr:
library(purrr)
df %>%
by_row(function(x) {
sum(x[-1] == 2) },
.to = "numtwos",
.collate = "cols"
)
Which gives:
#Source: local data frame [5 x 5]
#
# ID X1 X2 X3 numtwos
# <fctr> <int> <int> <int> <int>
#1 A 2 5 2 2
#2 B 2 5 1 1
#3 C 3 4 4 0
#4 D 5 4 2 1
#5 E 2 1 4 1
As per mentioned in the NEWS, row based functionals are still maturing in dplyr:
We are still figuring out what belongs in dplyr and what belongs in
purrr. Expect much experimentation and many changes with these
functions.
Benchmark
We can see how rowwise() and do() compare to purrr::by_row() for this type of problem and how they "perform" against rowSums() and the tidy data way:
largedf <- df[rep(seq_len(nrow(df)), 10e3), ]
library(microbenchmark)
microbenchmark(
steven = largedf %>%
by_row(function(x) {
sum(x[-1] == 2) },
.to = "numtwos",
.collate = "cols"),
psidom = largedf %>%
rowwise %>%
do(data_frame(numtwos = sum(.[-1] == 2))) %>%
cbind(largedf, .),
gopala = largedf %>%
gather(key, value, -ID) %>%
group_by(ID) %>%
summarise(numtwos = sum(value == 2)) %>%
inner_join(largedf, .),
evan = largedf %>%
mutate(numtwos = rowSums(. == 2)),
times = 10L,
unit = "relative"
)
Results:
#Unit: relative
# expr min lq mean median uq max neval cld
# steven 1225.190659 1261.466936 1267.737126 1227.762573 1276.07977 1339.841636 10 b
# psidom 3677.603240 3759.402212 3726.891458 3678.717170 3728.78828 3777.425492 10 c
# gopala 2.715005 2.684599 2.638425 2.612631 2.59827 2.572972 10 a
# evan 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 10 a
Just wanted to add to the answer of #evan.oman in case you only want to sum rows for specific columns, not all of them. You can use the regular select and/or select_helpers functions. In this example, we don't want to include X1 in rowSums:
df %>%
mutate(numtwos = rowSums(select(., -X1) == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 1
2 B 2 5 1 0
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 0
One approach is to use a combination of dplyr and tidyr to convert data into long format, and do the computation:
library(dplyr)
library(tidyr)
df %>%
gather(key, value, -ID) %>%
group_by(ID) %>%
summarise(numtwos = sum(value == 2)) %>%
inner_join(df, .)
Output is as follows:
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
You can use do, which doesn't add the column to your original data frame and you need to add the column to your original data frame.
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame
numtwos
1 2
2 1
3 0
4 1
5 1
Add a cbind to bind the new column to the original data frame:
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame %>% cbind(df, .)
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1

Using 'window' functions in dplyr

I need to process rows of a data-frame in order, but need to look-back for certain rows. Here is an approximate example:
library(dplyr)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
d <- d %>%
mutate(cond = rep('', n()), num = as.integer(rep(0,n())))
for (i in 1:nrow(d)){
if(d$trial[i] == "A"){
d$num[i] <- 0
d$cond[i] <- "A"
}
else if(d$trial[i] == "B"){
d$num[i] <- 0
d$cond[i] <- "B"
}
else{
d$num[i] <- d$num[i-1] +1
d$cond[i] <- d$cond[i-1]
}
}
The resulting data-frame looks like
> d
Source: local data frame [12 x 3]
trial cond num
1 A A 0
2 a A 1
3 b A 2
4 B B 0
5 x B 1
6 y B 2
7 A A 0
8 a A 1
9 b A 2
10 B B 0
11 x B 1
12 y B 2
What is the proper way of doing this using dplyr?
dlpyr-only solution:
d %>%
group_by(i=cumsum(trial %in% c('A','B'))) %>%
mutate(cond=trial[1],num=seq(n())-1) %>%
ungroup() %>%
select(-i)
# trial cond num
# 1 A A 0
# 2 a A 1
# 3 b A 2
# 4 B B 0
# 5 x B 1
# 6 y B 2
# 7 A A 0
# 8 a A 1
# 9 b A 2
# 10 B B 0
# 11 x B 1
# 12 y B 2
Try
d %>%
mutate(cond = zoo::na.locf(ifelse(trial=="A"|trial=="B", trial, NA))) %>%
group_by(id=rep(1:length(rle(cond)$values), rle(cond)$lengths)) %>%
mutate(num = 0:(n()-1)) %>% ungroup %>%
select(-id)
Here is one way. The first thing was to add A or B in cond using ifelse. Then, I employed na.locf() from the zoo package in order to fill NA with A or B. I wanted to assign a temporary group ID before I took care of num. I borrowed rleid() in the data.table package. Grouping the data with the temporary group ID (i.e., foo), I used row_number() which is one of the window functions in the dplyr package. Note that I tried to remove foo doing select(-foo). But, the column wanted to stay. I think this is probably something to do with compatibility of the function.
library(zoo)
library(dplyr)
library(data.table)
d <- data_frame(trial = rep(c("A","a","b","B","x","y"),2))
mutate(d, cond = ifelse(trial == "A" | trial == "B", trial, NA),
cond = na.locf(cond),
foo = rleid(cond)) %>%
group_by(foo) %>%
mutate(num = row_number() - 1)
# trial cond foo num
#1 A A 1 0
#2 a A 1 1
#3 b A 1 2
#4 B B 2 0
#5 x B 2 1
#6 y B 2 2
#7 A A 3 0
#8 a A 3 1
#9 b A 3 2
#10 B B 4 0
#11 x B 4 1
#12 y B 4 2

Resources