Split, apply and combine on 2 columns of data in R

I've got a data frame consisting of a group column and 2 value columns, like this:
group val1 val2
A 5 3
A 2 4
A 3 1
B 3 6
B 2 1
B 0 2
I want to work out the number of rows where val1 > val2, split by group. Initially I hardcoded this per group with:
number_a <- nrow(subset(df, group=="A" & val1 > val2))
number_b <- nrow(subset(df, group=="B" & val1 > val2))
What's the proper way of automating this? I tried using the split() function but I couldn't work out how to pass in both the val1 and val2 columns.
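For reference, a reproducible version of the example data (treating the value columns as numeric is an assumption):
df <- data.frame(group = c("A", "A", "A", "B", "B", "B"),
                 val1 = c(5, 2, 3, 3, 2, 0),
                 val2 = c(3, 4, 1, 6, 1, 2))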

Pretty straightforward using data.table.
If you want the number of rows
library(data.table)
setDT(df)[, .(RowsNum = sum(val1 > val2)), by = group]
# group RowsNum
# 1: A 2
# 2: B 1
If you're looking for split/apply combinations in base R, you could also try
sapply(split(df[-1], df[1]), function(x) sum(x[1] > x[2]))
# A B
# 2 1
Or using tapply (also from base R)
tapply(with(df, val1 > val2), df[1], sum)
# group
# A B
# 2 1
If you want the rows themselves
setDT(df)[, .SD[val1 > val2]]
# group val1 val2
# 1: A 5 3
# 2: A 3 1
# 3: B 2 1
Or very simply with base R too
df[with(df, val1 > val2), ]
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1
Or
subset(df, val1 > val2)
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1

Another option using dplyr
library(dplyr)
filter(df, val1 > val2)
# group val1 val2
#1 A 5 3
#2 A 3 1
#3 B 2 1
If you need the number of rows
df %>%
  group_by(group) %>%
  filter(val1 > val2) %>%
  summarise(RowsNum = n())
# group RowsNum
#1 A 2
#2 B 1
Or using aggregate from base R
aggregate(cbind(RowsNum = val1 > val2) ~ group, df, sum)
# group RowsNum
#1 A 2
#2 B 1

You can also try this with aggregate, counting the qualifying rows per group (attach() is avoided here, since explicit subsetting makes it unnecessary):
aggregate(val1 ~ group, df[df$val1 > df$val2, ], length)

Related

lapply aggregate columns in multiple dataframes R

I have several data frames in a list in R. There are entries in each of those data frames that I would like to summarise. I'm trying to get into lapply, so that would be my preferred way (though if there's a better solution, I would be happy to know it and why).
My Sample data:
df1 <- data.frame(Count = c(1,2,3), ID = c("A","A","C"))
df2 <- data.frame(Count = c(1,1,2), ID = c("C","B","C"))
dfList <- list(df1,df2)
> head(dfList)
[[1]]
Count ID
1 1 A
2 2 A
3 3 C
[[2]]
Count ID
1 1 C
2 1 B
3 2 C
I tried to implement this in lapply with
dfList_agg <- lapply(dfList, function(i) {
  aggregate(i[[1:length(i)]][1L], by = list(names(i[[1:length(i)]][2L])), FUN = sum)
})
However, this gives me the error "arguments must have same length". What am I doing wrong?
My desired output would be the sum of Column "Count" by "ID" which looks like this:
> head(dfList_agg)
[[1]]
Count ID
1 3 A
2 3 C
[[2]]
Count ID
1 3 C
2 1 B
I think you've overcomplicated it. Try this...
dfList_agg <- lapply(dfList, function(i) {
  aggregate(i[, 1], by = list(i[, 2]), FUN = sum)
})
dfList_agg
[[1]]
Group.1 x
1 A 3
2 C 3
[[2]]
Group.1 x
1 B 1
2 C 3
Here is a third option
lapply(dfList, function(x) aggregate(. ~ ID, data = x, FUN = "sum"))
#[[1]]
# ID Count
#1 A 3
#2 C 3
#
#[[2]]
# ID Count
#1 B 1
#2 C 3
I guess this is what you need. Note that ddply comes from the plyr package, not dplyr:
library(plyr)
lapply(dfList, function(x) ddply(x, .(ID), summarize, Count = sum(Count)))
An option with tidyverse would be
library(tidyverse)
map(dfList, ~ .x %>%
      group_by(ID) %>%
      summarise(Count = sum(Count)) %>%
      select(names(.x)))
#[[1]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 3.00 A
#2 3.00 C
#[[2]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 1.00 B
#2 3.00 C

Fill sequence by factor

I need to fill $Year with the missing values of the sequence, split by the factor $Country. The $Count column can just be padded out with 0s.
Country Year Count
A 1 1
A 2 1
A 4 2
B 1 1
B 3 1
So I end up with
Country Year Count
A 1 1
A 2 1
A 3 0
A 4 2
B 1 1
B 2 0
B 3 1
Hope that's clear guys, thanks in advance!
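For reproducibility, the example data (referred to variously as df, d, dt, or DT in the answers below) can be constructed as follows; treating Country as character and the other columns as numeric is an assumption:
df <- data.frame(Country = c("A", "A", "A", "B", "B"),
                 Year = c(1, 2, 4, 1, 3),
                 Count = c(1, 1, 2, 1, 1),
                 stringsAsFactors = FALSE)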
This is a dplyr/tidyr solution using complete and full_seq:
library(dplyr)
library(tidyr)
df %>% group_by(Country) %>% complete(Year = full_seq(Year, 1), fill = list(Count = 0))
Country Year Count
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1
library(data.table)
# d is your original data.frame
setDT(d)
foo <- d[, .(Year = min(Year):max(Year)), Country]
res <- merge(d, foo, all.y = TRUE)[is.na(Count), Count := 0]
Similar to @PoGibas' answer:
library(data.table)
# set default values
def = list(Count = 0L)
# create table with all levels
fullDT = setkey(DT[, .(Year = seq(min(Year), max(Year))), by=Country])
# initialize to defaults
fullDT[, names(def) := def ]
# overwrite from data
fullDT[DT, names(def) := mget(sprintf("i.%s", names(def))) ]
which gives
Country Year Count
1: A 1 1
2: A 2 1
3: A 3 0
4: A 4 2
5: B 1 1
6: B 2 0
7: B 3 1
This generalizes to having more columns (besides Count). I guess similar functionality exists in the "tidyverse", with a name like "expand" or "complete".
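For instance, if DT carried a hypothetical second value column Sales that should also default to 0, only the def list would change (a sketch; Sales is an invented column name):
library(data.table)
# hypothetical: DT has columns Country, Year, Count, Sales
def = list(Count = 0L, Sales = 0)
fullDT = setkey(DT[, .(Year = seq(min(Year), max(Year))), by = Country])
fullDT[, names(def) := def ]
fullDT[DT, names(def) := mget(sprintf("i.%s", names(def))) ]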
Another base R idea is to split on Country, use setdiff to find the values missing from seq(max(Year)), and rbind them to the original data frame. Then use do.call to rbind the list back into a single data frame, i.e.
d1 <- do.call(rbind, c(lapply(split(df, df$Country), function(i){
  x <- rbind(i, data.frame(Country = i$Country[1],
                           Year = setdiff(seq(max(i$Year)), i$Year),
                           Count = 0))
  x[with(x, order(Year)), ]
}), make.row.names = FALSE))
which gives,
Country Year Count
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1
> setkey(DT,Country,Year)
> DT[setkey(DT[, .(min(Year):max(Year)), by = Country], Country, V1)]
Country Year Count
1: A 1 1
2: A 2 1
3: A 3 NA
4: A 4 2
5: B 1 1
6: B 2 NA
7: B 3 1
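The missing years come through as NA here; assuming zeros are wanted, as in the question, one extra step fills them in (the same := trick used in the answer above):
res <- DT[setkey(DT[, .(min(Year):max(Year)), by = Country], Country, V1)]
res[is.na(Count), Count := 0]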
Another dplyr and tidyr solution.
library(dplyr)
library(tidyr)
dt2 <- dt %>%
  group_by(Country) %>%
  do(data_frame(Country = unique(.$Country),
                Year = full_seq(.$Year, 1))) %>%
  full_join(dt, by = c("Country", "Year")) %>%
  replace_na(list(Count = 0))
Here is an approach in base R that uses tapply, do.call, range, and seq to calculate the year sequences. It then constructs a data.frame from the named list that is returned, merges this onto the original (which adds the desired rows), and finally fills in the missing values.
# get named list with year sequences
temp <- tapply(dat$Year, dat$Country, function(x) do.call(seq, as.list(range(x))))
# construct data.frame
mydf <- data.frame(Year=unlist(temp), Country=rep(names(temp), lengths(temp)))
# merge onto original
mydf <- merge(dat, mydf, all=TRUE)
# fill in missing values
mydf[is.na(mydf)] <- 0
This returns
mydf
Country Year Count
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1

R: how to select rows with two conditions (bought both products)

I have a dataset which is similar to the following:
ID = c(1,2,3,4,1,2,3)
Product = c("a", "b", "c", "a","b","a","a")
Quantity = c(1,1,1,1,1,1,1)
df = data.frame(ID, Product, Quantity)
# ID Product Quantity
#1 1 a 1
#2 2 b 1
#3 3 c 1
#4 4 a 1
#5 1 b 1
#6 2 a 1
#7 3 a 1
I want to select the people who purchased both product a and product b. In the case of the above example, the desired result I want is:
ID Product Quantity
1 a 1
2 b 1
1 b 1
2 a 1
I cannot recall a function that does this for me. What I can think of is a loop, but I am hoping to find a more succinct solution.
With ave:
df[with(df, ave(as.character(Product), ID,
                FUN = function(x) all(c("a", "b") %in% x))) == "TRUE", ]
# ID Product Quantity
#1 1 a 1
#2 2 b 1
#5 1 b 1
#6 2 a 1
You could do the following with dplyr
library(dplyr)
df %>%
filter(Product %in% c('a','b')) %>% # Grab only desired products
group_by(ID) %>% # For each ID...
filter(n() > 1) %>% # Only grab IDs where the count >1
ungroup # Remove grouping.
## # A tibble: 4 x 3
## ID Product Quantity
## <dbl> <fctr> <dbl>
## 1 1 a 1
## 2 2 b 1
## 3 1 b 1
## 4 2 a 1
Edit
Here is a slightly more concise dplyr version using all (similar to how Psidom used it in the data.table solution):
df %>%
  group_by(ID) %>%
  filter(all(c('a', 'b') %in% as.character(Product))) %>%
  ungroup
Another option using data.table:
library(data.table)
setDT(df)[, .SD[all(c("a", "b") %in% Product)], ID]
# ID Product Quantity
#1: 1 a 1
#2: 1 b 1
#3: 2 b 1
#4: 2 a 1
Here is an option using data.table
library(data.table)
setDT(df, key = "Product")[c("a", "b")][, if(uniqueN(Product)==2) .SD , ID]
# ID Product Quantity
#1: 1 a 1
#2: 1 b 1
#3: 2 a 1
#4: 2 b 1

Remove duplicates based on 2nd column condition

I am trying to remove duplicate rows from a data frame, based on the max value in a different column.
So, for the data frame:
df <- data.frame(rbind(c("a", 2, 3), c("a", 3, 4), c("a", 3, 5),
                       c("b", 1, 3), c("b", 2, 6), c("r", 4, 5)))
colnames(df) <- c("id", "val1", "val2")
id val1 val2
a 2 3
a 3 4
a 3 5
b 1 3
b 2 6
r 4 5
I would like to remove all duplicates by id, dropping the corresponding rows that do not have the maximum value for val2.
Thus the data frame should become:
a 3 5
b 2 6
r 4 5
-> i.e., remove all "a" duplicates but keep the row with the max value of df$val2 within subset(df, df$id == "a").
Using base R. Here the columns are factors, so make sure to convert val2 to numeric first:
df$val2 <- as.numeric(as.character(df$val2))
df[with(df, ave(val2, id, FUN=max)==val2),]
# id val1 val2
#3 a 3 5
#5 b 2 6
#6 r 4 5
Or using dplyr
library(dplyr)
df %>%
group_by(id) %>%
filter(val2==max(val2))
# id val1 val2
#1 a 3 5
#2 b 2 6
#3 r 4 5
One possible way is to use data.table
library(data.table)
setDT(df)[, .SD[which.max(val2)], by = id]
## id val1 val2
## 1: a 3 5
## 2: b 2 6
## 3: r 4 5
Here's how I hope your data is really set up
df <- data.frame(id = c(rep("a", 3), rep("b", 2), "r"),
                 val1 = c(2, 3, 3, 1, 2, 4), val2 = c(3, 4, 5, 3, 6, 5))
You could do a split-unsplit
> unsplit(lapply(split(df, df$id), function(x) {
    if (nrow(x) > 1) {
      x[duplicated(x$id) & x$val2 == max(x$val2), ]
    } else {
      x
    }
  }), levels(df$id))
# id val1 val2
# 3 a 3 5
# 5 b 2 6
# 6 r 4 5
You can also use Reduce(rbind, ...) or do.call(rbind, ...) in place of unsplit
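For example, the do.call(rbind, ...) variant of the same idea might look like this (a sketch, assuming the numeric df defined above; the row names will be prefixed by the group):
do.call(rbind, lapply(split(df, df$id), function(x) {
  if (nrow(x) > 1) {
    x[duplicated(x$id) & x$val2 == max(x$val2), ]
  } else {
    x
  }
}))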
Another one:
df %>%
  group_by(id) %>%
  slice(which.max(val2))
id val1 val2
a 3 5
b 2 6
r 4 5

Normalizing a multivalued column in a large table

I'm having an issue converting the VBA code seen in this post to an R script.
The problem is as follows, I have a column (from a source database, not by choice) that contains multiple values for the attribute. I'd like to normalize this table and retain the order in which each value occurs in each cell.
An example dataset:
dat <- data.frame(
  ID = c(1:3),
  Multi = c("VAL1 VAL2 VAL3", "VAL2 VAL3", "VAL2 VAL3 VAL1"),
  stringsAsFactors = FALSE)
ID Multi
1 1 VAL1 VAL2 VAL3
2 2 VAL2 VAL3
3 3 VAL2 VAL3 VAL1
The pseudocode would be something like:
Loop over each row
Split string Multi with a space as a separator
For each split string, append a new row to a separate data.frame with the ID, the order within the full string, and the value.
The result would look like:
ID Order Multi
1 1 1 VAL1
2 1 2 VAL2
3 1 3 VAL3
4 2 1 VAL2
5 2 2 VAL3
6 3 1 VAL2
7 3 2 VAL3
8 3 3 VAL1
I'm currently looking at doing this with a data.frame, though I'm thinking data.table would be more appropriate, as my table will have approximately 400,000 of these rows.
I apologize for not having any code ready; I'm still contemplating whether I need to use the apply family, data.table, or a simple for loop. I'll keep this post updated with any progress I make.
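For concreteness, a direct loop translation of the pseudocode might look like this (a sketch only; the vectorized answers below scale far better to 400,000 rows):
out <- data.frame()
for (i in seq_len(nrow(dat))) {
  parts <- strsplit(dat$Multi[i], " ", fixed = TRUE)[[1]]
  out <- rbind(out, data.frame(ID = dat$ID[i],
                               Order = seq_along(parts),
                               Multi = parts,
                               stringsAsFactors = FALSE))
}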
Here are a couple of ways...
In base R:
X <- setNames(strsplit(as.character(dat$Multi), " "), dat$ID)
X1 <- stack(X)
X1$order <- ave(X1$ind, X1$ind, FUN = seq_along)
X1
# values ind order
# 1 VAL1 1 1
# 2 VAL2 1 2
# 3 VAL3 1 3
# 4 VAL2 2 1
# 5 VAL3 2 2
# 6 VAL2 3 1
# 7 VAL3 3 2
# 8 VAL1 3 3
OR (better):
X <- strsplit(as.character(dat[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(dat[, "ID"], len), order = sequence(len),
           Multi = unlist(X, use.names = FALSE))
Using concat.split.multiple from my "splitstackshape" package (probably not too efficient on 400,000 rows though).
library(splitstackshape)
out <- concat.split.multiple(dat, "Multi", " ", "long")
out[order(out$ID, out$time), ]
# ID time Multi
# 1 1 1 VAL1
# 4 1 2 VAL2
# 7 1 3 VAL3
# 2 2 1 VAL2
# 5 2 2 VAL3
# 8 2 3 <NA>
# 3 3 1 VAL2
# 6 3 2 VAL3
# 9 3 3 VAL1
And, since you requested "data.table":
library(data.table)
DT <- data.table(dat)
DTL <- DT[, list(unlist(strsplit(as.character(Multi), " "))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# 6: 3 VAL2 1
# 7: 3 VAL3 2
# 8: 3 VAL1 3
Update: Some timings
I didn't bother testing my "splitstackshape" approach, because it uses read.table under the hood, so I know it won't stand up to the performance demands.
However, base R still seems to win out!
Sample "big" data:
dat_big <- do.call(rbind, replicate(floor(4000/3), dat, simplify = FALSE))
dat_big <- do.call(rbind, replicate(100, dat_big, simplify = FALSE))
dat_big$ID <- make.unique(as.character(dat_big$ID))
DT <- data.table(dat)
DT_big <- data.table(dat_big)
Functions to test:
fun1 <- function(inDF) {
  X <- strsplit(as.character(inDF[, "Multi"]), " ", fixed = TRUE)
  len <- vapply(X, length, 1L)
  data.frame(ID = rep(inDF[, "ID"], len), order = sequence(len),
             Multi = unlist(X, use.names = FALSE))
}
fun2 <- function(inDT) {
  DTL <- inDT[, list(unlist(strsplit(as.character(Multi), " ", fixed = TRUE))), by = ID]
  DTL[, order := sequence(.N), by = ID]
  DTL
}
The results for base R:
system.time(outDF <- fun1(dat_big))
# user system elapsed
# 6.418 0.000 6.454
dim(outDF)
# [1] 1066400 3
head(outDF)
# ID order Multi
# 1 1 1 VAL1
# 2 1 2 VAL2
# 3 1 3 VAL3
# 4 2 1 VAL2
# 5 2 2 VAL3
# 6 3 1 VAL2
tail(outDF)
# ID order Multi
# 1066395 1.133299 3 VAL3
# 1066396 2.133299 1 VAL2
# 1066397 2.133299 2 VAL3
# 1066398 3.133299 1 VAL2
# 1066399 3.133299 2 VAL3
# 1066400 3.133299 3 VAL1
The results for "data.table":
system.time(outDT <- fun2(DT_big))
# user system elapsed
# 14.035 0.000 14.057
dim(outDT)
# [1] 1066400 3
outDT
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# ---
# 1066396: 2.133299 VAL2 1
# 1066397: 2.133299 VAL3 2
# 1066398: 3.133299 VAL2 1
# 1066399: 3.133299 VAL3 2
# 1066400: 3.133299 VAL1 3
