Subset a data frame based on column entry (or rank) - r

I have a data.frame as simple as this one:
id group idu value
1 1 1_1 34
2 1 2_1 23
3 1 3_1 67
4 2 4_2 6
5 2 5_2 24
6 2 6_2 45
1 3 1_3 34
2 3 2_3 67
3 3 3_3 76
from which I want to retrieve a subset with the first entry of each group; something like:
id group idu value
1 1 1_1 34
4 2 4_2 6
1 3 1_3 34
id is not unique so the approach should not rely on it.
Can I achieve this avoiding loops?
dput() of data:
structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L), group = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), idu = structure(c(1L, 3L, 5L,
7L, 8L, 9L, 2L, 4L, 6L), .Label = c("1_1", "1_3", "2_1", "2_3",
"3_1", "3_3", "4_2", "5_2", "6_2"), class = "factor"), value = c(34L,
23L, 67L, 6L, 24L, 45L, 34L, 67L, 76L)), .Names = c("id", "group",
"idu", "value"), class = "data.frame", row.names = c(NA, -9L))

Using Gavin's million row df:
DF3 <- data.frame(id = sample(1000, 1000000, replace = TRUE),
group = factor(rep(1:1000, each = 1000)),
value = runif(1000000))
DF3 <- within(DF3, idu <- factor(paste(id, group, sep = "_")))
I think the fastest way is to reorder the data frame and then use duplicated:
system.time({
DF4 <- DF3[order(DF3$group), ]
out2 <- DF4[!duplicated(DF4$group), ]
})
# user system elapsed
# 0.335 0.107 0.441
This compares to 7 seconds for Gavin's fastest lapply + split method on my computer.
Generally, when working with data frames, the fastest approach is usually to generate all the indices and then do a single subset.
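For instance, on the OP's small data frame (DF2 in the answers below), the first-row indices can be computed once and the subset done in a single step; a minimal sketch:
# first row index of each group, computed once, then a single subset
idx <- match(unique(DF2$group), DF2$group)
DF2[idx, ]
#   id group idu value
# 1  1     1 1_1    34
# 4  4     2 4_2     6
# 7  1     3 1_3    34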

Update in light of OP's comment
If doing this on a million-plus rows, all of the options supplied so far will be slow. Here are some comparison timings on a dummy data set of 100,000 rows:
set.seed(12)
DF3 <- data.frame(id = sample(1000, 100000, replace = TRUE),
group = factor(rep(1:100, each = 1000)),
value = runif(100000))
DF3 <- within(DF3, idu <- factor(paste(id, group, sep = "_")))
> system.time(out1 <- do.call(rbind, lapply(split(DF3, DF3["group"]), `[`, 1, )))
user system elapsed
19.594 0.053 19.984
> system.time(out3 <- aggregate(DF3[,-2], DF3["group"], function (x) x[1]))
user system elapsed
12.419 0.141 12.788
I gave up doing them with a million rows. Far faster, believe it or not, is:
out2 <- matrix(unlist(lapply(split(DF3[, -4], DF3["group"]), `[`, 1,)),
byrow = TRUE, nrow = (lev <- length(levels(DF3$group))))
colnames(out2) <- names(DF3)[-4]
rownames(out2) <- seq_len(lev)
out2 <- as.data.frame(out2)
out2$group <- factor(out2$group)
out2$idu <- factor(paste(out2$id, out2$group, sep = "_"),
levels = levels(DF3$idu))
The outputs are (effectively) the same:
> all.equal(out1, out2)
[1] TRUE
> all.equal(out1, out3[, c(2,1,3,4)])
[1] "Attributes: < Component 2: Modes: character, numeric >"
[2] "Attributes: < Component 2: target is character, current is numeric >"
(the difference between out1 (or out2) and out3 (the aggregate() version) is just in the rownames of the components.)
with a timing of:
user system elapsed
0.163 0.001 0.168
on the 100,000 row problem, and on this million row problem:
set.seed(12)
DF3 <- data.frame(id = sample(1000, 1000000, replace = TRUE),
group = factor(rep(1:1000, each = 1000)),
value = runif(1000000))
DF3 <- within(DF3, idu <- factor(paste(id, group, sep = "_")))
with a timing of
user system elapsed
11.916 0.000 11.925
Working with the matrix version (the one that produces out2) is quicker on the million-row problem than the other versions are on the 100,000-row problem. This just shows that working with matrices is very quick indeed, and that the bottleneck in my do.call() version is rbind()-ing the result together.
The million row problem timing was done with:
system.time({out4 <- matrix(unlist(lapply(split(DF3[, -4], DF3["group"]),
`[`, 1,)),
byrow = TRUE,
nrow = (lev <- length(levels(DF3$group))))
colnames(out4) <- names(DF3)[-4]
rownames(out4) <- seq_len(lev)
out4 <- as.data.frame(out4)
out4$group <- factor(out4$group)
out4$idu <- factor(paste(out4$id, out4$group, sep = "_"),
levels = levels(DF3$idu))})
Original
If your data are in DF, say, then:
do.call(rbind, lapply(with(DF, split(DF, group)), head, 1))
will do what you want:
> do.call(rbind, lapply(with(DF, split(DF, group)), head, 1))
idu group
1 1 1
2 4 2
3 7 3
If the new data are in DF2 then we get:
> do.call(rbind, lapply(with(DF2, split(DF2, group)), head, 1))
id group idu value
1 1 1 1_1 34
2 4 2 4_2 6
3 1 3 1_3 34
But for speed, we probably want to subset instead of using head(), and we can gain a bit by not using with(), e.g.:
do.call(rbind, lapply(split(DF2, DF2$group), `[`, 1, ))
> system.time(replicate(1000, do.call(rbind, lapply(split(DF2, DF2$group), `[`, 1, ))))
user system elapsed
3.847 0.040 4.044
> system.time(replicate(1000, do.call(rbind, lapply(split(DF2, DF2$group), head, 1))))
user system elapsed
4.058 0.038 4.111
> system.time(replicate(1000, aggregate(DF2[,-2], DF2["group"], function (x) x[1])))
user system elapsed
3.902 0.042 4.106

One solution using plyr, assuming your data is in an object named zzz:
ddply(zzz, "group", function(x) x[1 ,])
Another option takes the difference between consecutive group values and should prove faster, but it relies on the object being ordered beforehand. It also assumes you don't have a group value of 0:
zzz <- zzz[order(zzz$group) ,]
zzz[ diff(c(0,zzz$group)) != 0, ]
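If those two assumptions are a concern, a small sketch that avoids both (same zzz object):
# first occurrence of each group; no prior ordering or non-zero group values required
zzz[!duplicated(zzz$group), ]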

I think this will do the trick:
aggregate(data["idu"], data["group"], function (x) x[1])
For your updated question, I'd recommend using ddply from the plyr package:
ddply(data, .(group), function (x) x[1,])
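If you prefer dplyr (plyr's successor), a roughly equivalent sketch, assuming the data frame is named data:
library(dplyr)
data %>%
  group_by(group) %>%
  slice(1) %>%
  ungroup()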

Related

Finding different max between specified rows of different fields

I'm looking to find the max values of different columns, based on specified rows of each column.
My actual data frame has 50K columns and 1K+ rows, so I can't use a loop without greatly increasing the run time.
Data Frame:
row V1 V2 V3 V4
  1  5  2  4  5
  2  3  5  1  6
  3  7  3  2  6
  4  2  5  3 10
  5  6  9  1  2
beg_row <- c(2, 1, 2, 3)
end_row <- c(4, 3, 3, 5)
output:
c(7, 5, 2, 10)
You can try mapply (but I suspect that it won't speed up the runtime if you have a massive number of columns):
> mapply(function(x, y, z) max(x[y:z]), df[-1], beg_row, end_row)
V1 V2 V3 V4
7 5 2 10
Data
df <- structure(list(row = 1:5, V1 = c(5L, 3L, 7L, 2L, 6L), V2 = c(
2L,
5L, 3L, 5L, 9L
), V3 = c(4L, 1L, 2L, 3L, 1L), V4 = c(
5L, 6L, 6L,
10L, 2L
)), class = "data.frame", row.names = c(NA, -5L))
beg_row <- c(2, 1, 2, 3)
end_row <- c(4, 3, 3, 5)
An option with dplyr
library(dplyr)
df1 %>%
summarise(across(-row, ~ {
i1 <- match(cur_column(), names(df1)[-1])
max(.x[beg_row[i1]:end_row[i1]])}))
V1 V2 V3 V4
1 7 5 2 10
Or another option is to create NA outside the specified row range of each column and then use colMaxs:
library(matrixStats)
colMaxs(as.matrix((NA^!(row(df1[-1]) >= beg_row[col(df1[-1])] &
row(df1[-1]) <= end_row[col(df1[-1])])) * df1[-1]), na.rm = TRUE)
[1] 7 5 2 10
The fastest approach that I have found is to use data.table and a for loop. I have tested it with a dataframe of 2K rows and 50K columns.
library(data.table)
beg_row <- sample(1:50, 49999, replace = T)
end_row <- sample(100:150, 49999, replace = T)
df <- matrix(sample(1:50, 50000*2000, replace = T), 2000, 50000)
df <- as.data.frame(df)
dt <- setDT(df)
vmax <- rep(0, ncol(dt)-1)
for (i in 2:ncol(dt)) {
vmax[i-1] <- max(dt[[i]][beg_row[i-1]:end_row[i-1]])
}
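For comparison, a plain base R sketch of the same column-by-column loop using vapply (same df, beg_row and end_row as above; whether it is competitive at 50K columns would need benchmarking):
# one max per data column; column 1 is skipped, as in the loop above
vmax2 <- vapply(seq_along(beg_row),
                function(i) max(df[[i + 1]][beg_row[i]:end_row[i]]),
                numeric(1))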
Another possible solution, based on purrr::pmap_dbl:
library(purrr)
pmap_dbl(list(beg_row, end_row, 2:ncol(df)), ~ max(df[..1:..2, ..3]))
#> [1] 7 5 2 10

column names to matrix format in r

I have the following dataset:
A..B A..C B..C
value 2 5 9
and I would like to reshape it so that I get the following output:
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Any ideas on how I can do this in R?
Maybe you can try the base R code below
# split each column name on the ".." separator to get the pairs of labels
dn <- strsplit(names(df), "..", fixed = TRUE)
# identity matrix with the unique labels as both row and column names
mat <- `dimnames<-`(diag(rep(1, ncol(df))), replicate(2, list(unique(unlist(dn)))))
# index matrix holding each label pair and its reverse, so the fill is symmetric
inds <- do.call(rbind, lapply(dn, function(x) rbind(x, rev(x))))
# each value is written twice, once above and once below the diagonal
mat[inds] <- rep(unlist(df), each = 2)
or
dn <- strsplit(names(df), "..", fixed = TRUE)
mat <- `dimnames<-`(diag(rep(1, ncol(df))), replicate(2, list(unique(unlist(dn)))))
for (k in seq_along(dn)) {
mat[do.call(cbind, as.list(dn[[k]]))] <- df[, k]
}
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]
such that
> mat
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Data
> dput(df)
structure(list(A..B = 2L, A..C = 5L, B..C = 9L), class = "data.frame", row.names = "value")
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
df %>%
pivot_longer(cols = everything()) %>%
separate(name, into = c('name1', 'name2')) %>%
complete(name1 = LETTERS[1:3], name2 = LETTERS[1:3],
fill = list(value = 0)) %>%
pivot_wider(names_from = name2, values_from = value) %>%
column_to_rownames('name1') %>%
as.matrix %>%
{. + t(.)} %>%
`diag<-`(., 1)
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
data
df <- structure(list(A..B = 2L, A..C = 5L, B..C = 9L),
class = "data.frame", row.names = "value")
Here's another option that uses matrix indexing to fill in the values:
library(splitstackshape)
# stack your dataset and split the names into two columns
x <- cSplit(stack(df), "ind", "..")
# ij is going to be your index of row and column combinations
ij <- as.matrix(x[, 2:3])
u <- unique(c(ij))
# initialize a matrix of 1s
m <- matrix(1, nrow = length(u), ncol = length(u),
dimnames = list(u, u))
# replace the relevant indices with values
m[rbind(ij, ij[, 2:1])] <- x$values
m
# A B C
# A 1 2 5
# B 2 1 9
# C 5 9 1
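The key idiom here is indexing a matrix with a two-column character matrix of (row name, column name) pairs; a tiny standalone illustration:
m0 <- matrix(0, 2, 2, dimnames = list(c("A", "B"), c("A", "B")))
# each row of the index matrix names one cell: ("A", "B") and its mirror ("B", "A")
m0[rbind(c("A", "B"), c("B", "A"))] <- c(2, 2)
m0
#   A B
# A 0 2
# B 2 0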
In base R you can use strsplit to split the names, use unique to get all the levels, and create a matrix of 1Ls with dimensions given by the number of levels. Then you can fill in the matrix by using the names to look up the positions of the values.
i <- do.call(rbind, strsplit(names(x), "..", TRUE))
u <- unique(as.vector(i))
m <- matrix(1L, length(u), length(u), dimnames = list(u, u))
m[rbind(i, i[,2:1])] <- unlist(x)
#m[rbind(i, i[,2:1])] <- x #Alternative in case of a vector
m
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
Data:
x <- data.frame(A..B = 2L, A..C = 5L, B..C = 9L, row.names = "value")
#x <- c(A..B = 2L, A..C = 5L, B..C = 9L) #Alternative as a vector

Using a vector as a grep pattern

I am new to R. I am trying to search the column names using grep multiple times within an apply loop. I use grep to specify which columns are summed, based on the vector individuals:
individuals <-c("ID1","ID2".....n)
bcdata_total <- sapply(individuals, function(x) {
apply(bcdata_clean[,grep(individuals, colnames(bcdata_clean))], 1, sum)
})
bcdata is of arbitrary size and contains arbitrary data, but its column names contain the elements of individuals as part of the string:
>head(bcdata)
ID1-4 ID1-3 ID2-5
A 3 2 1
B 2 2 3
C 4 5 5
grep(individuals[1], colnames(bcdata_clean)) returns a vector that looks like [1] 1 2, i.e. the positions of the columns whose names contain ID1. That vector is used to select the columns of bcdata_clean to be summed. This should happen n times, once for each element of individuals.
However this returns the error
In grep(individuals, colnames(bcdata)) :
argument 'pattern' has length > 1 and only the first element will be used
and results in all the columns of the output being identical.
Ideally, the index into individuals would increment on each iteration, so that in effect each run of the function does
apply(bcdata_clean[,grep(individuals[1,2....n], colnames(bcdata_clean))], 1, sum)
and would result in something like this
>head(bcdata_total)
ID1 ID2
A 5 1
B 4 3
C 9 5
But I'm not sure how to increment individuals. What is the best way to do this within the function?
You can use split.default to split data on similarly named columns and sum them row-wise.
sapply(split.default(df, sub('-.*', '', names(df))), rowSums, na.rm = TRUE)
# ID1 ID2
#A 5 1
#B 4 3
#C 9 5
data
df <- structure(list(`ID1-4` = c(3L, 2L, 4L), `ID1-3` = c(2L, 2L, 5L
), `ID2-5` = c(1L, 3L, 5L)), class = "data.frame", row.names = c("A", "B", "C"))
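To see what the grouping key and the resulting split look like on this df (purely illustrative):
sub('-.*', '', names(df))
# [1] "ID1" "ID1" "ID2"
split.default(df, sub('-.*', '', names(df)))
# $ID1
#   ID1-4 ID1-3
# A     3     2
# B     2     2
# C     4     5
#
# $ID2
#   ID2-5
# A     1
# B     3
# C     5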
Naming the function argument individuals (so that the grep call inside picks up the element passed in by sapply rather than the whole vector) fixed my issue:
bcdata_total <- sapply(individuals, function(individuals) {
apply(bcdata_clean[,grep(individuals, colnames(bcdata_clean))], 1, sum)
})
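If speed matters, rowSums() avoids calling sum() once per row; a rough sketch under the same setup (bcdata_clean and individuals as defined in the question):
bcdata_total <- sapply(individuals, function(ind) {
  cols <- grep(ind, colnames(bcdata_clean))
  # drop = FALSE keeps a data frame even when an ID matches only one column
  rowSums(bcdata_clean[, cols, drop = FALSE], na.rm = TRUE)
})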
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
df %>%
rownames_to_column('rn') %>%
pivot_longer(cols = -rn, names_to = c(".value", "grp"), names_sep="-") %>%
group_by(rn) %>%
summarise(across(starts_with('ID'), sum, na.rm = TRUE), .groups = 'drop') %>%
column_to_rownames('rn')
# ID1 ID2
#A 5 1
#B 4 3
#C 9 5
data
df <- structure(list(`ID1-4` = c(3L, 2L, 4L), `ID1-3` = c(2L, 2L, 5L
), `ID2-5` = c(1L, 3L, 5L)), class = "data.frame", row.names = c("A", "B", "C"))

R - Fastest way to loop subsets and get outputs using datatables (calculating monthly measures)

I am hoping to calculate some monthly measures for different entities, but the code I am currently using is very slow. I am wondering if you may know of a better solution.
A simplified version of my dataset is below. The problem is that one of my datasets contains some 6m individual daily observations, and my current method does not scale well to it.
date event id return
2000-07-06 2 1 0.1
2000-07-07 1 1 0.2
2000-07-09 0 1 0.6
2000-07-10 0 1 0.4
2000-07-15 2 1 0.7
2000-07-16 1 1 0.3
2000-07-20 0 1 0.1
2000-07-21 1 1 0.2
2000-07-06 1 2 0.3
2000-07-07 2 2 0.4
2000-07-15 0 2 0.6
2000-07-16 0 2 0.8
2000-07-17 2 2 0.9
2000-07-18 1 2 0.1
To calculate these measures I am running code that looks like the following:
for (j in 1:length(list.of.ids)) {
for (i in 1:(number.of.months)) {
temp <- subset(data, data$date < FirstDayMonth[i+1] & data$date >= FirstDayMonth[i] & data$id == list.of.ids[j])
total[i,j+1] <- sum(temp$return, na.rm = TRUE)
}
}
Note: total is a matrix with a time column plus one column for each id, and one row for every month in the dataset. I am hoping to end up with a matrix that stores all my monthly measures by id and month. This loop calculates the monthly sum of returns for each id and stores it in that matrix.
Again, the code above allows me to subset on a month period (by restricting my observations to be between the first day of two consecutive months) and on ids. The problem is, for my larger datasets this is very slow.
Are there any improvements to the code that will allow me to get my desired output faster?
Improvements that should yield speedup:
for (j in 1:length(list.of.ids)) {
id1 <- data$id == list.of.ids[j]
# computed outside the inner loop so it is not redone for every month
for (i in 1:(number.of.months)) {
id2 <- data$date < FirstDayMonth[i+1] & data$date >= FirstDayMonth[i]
total[i, j+1] <- sum(data$return[id1 & id2], na.rm = TRUE)
}
}
(This is probably a big improvement already, as we no longer create a new data.frame object on every iteration; we only compute logical indexes for the elements whose sum we need.)
But I would use data.table:
require(data.table)
data <- as.data.table(data)
data[, ym := format(date, '%Y-%m')]
res <- data[, sum(return, na.rm = T), keyby = .(ym, id)]
res
# ym id V1
# 1: 2000-07 1 2.6
# 2: 2000-07 2 3.1
If needed, the end result can be transformed into a matrix:
m <- matrix(res$V1, nrow = length(unique(res$ym)))
m
# [,1] [,2]
# [1,] 2.6 3.1
Update:
Faster version:
setDT(data) # converts data to data.table
x <- data[, .(date = unique(date))][, .(date, ym = format(date, '%Y-%m'))]
data[x, ym := i.ym, on = 'date']
res <- data[, sum(return, na.rm = T), keyby = .(ym, id)]
res
(format(date, '%Y-%m') is slow, so we compute ym only for the unique dates and then merge it back into data. This should be considerably faster if you have lots of duplicated dates.)
Update 2:
Conversion to matrix can be obtained with:
resdt <- dcast(res, ym ~ id, value.var = 'V1') # change data structure
resdt[1:2, 1:3]
# ym 1 2
# 1: 2000-01 6.824182 2.535805
# 2: 2000-02 3.825659 6.769578
resdt[, ym := NULL] # deletes ym
setcolorder(resdt, neworder = list.of.ids) # reorder columns
m <- as.matrix(resdt)
m[1:2, 1:3]
# 1 2 3
# [1,] 6.824182 2.535805 -1.193692
# [2,] 3.825659 6.769578 -1.117223
This should be considerably faster:
for (i in 1:number.of.months) {
  inds <- dat$date < FirstDayMonth[i+1] & dat$date >= FirstDayMonth[i]
  total[i,] <- rowsum(dat$return[inds], dat$id[inds], na.rm = TRUE)
}
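rowsum() can also do the whole job in a single call if the year-month and the id are pasted into one grouping key; a sketch on the same dat (defined in the Data block below):
ym <- substr(dat$date, 1, 7)
rowsum(dat$return, paste(ym, dat$id, sep = "_"))
#           [,1]
# 2000-07_1  2.6
# 2000-07_2  3.1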
Using aggregate: the year-month variable ym can be created by taking the substring of the first through seventh characters of the date column.
m <- with(dat, aggregate(list(return=return),
by=list(ym=substr(date, 1, 7), id=id), sum))
m
# ym id return
# 1 2000-07 1 2.6
# 2 2000-07 2 3.1
Or tapply.
m <- with(dat, tapply(return, list(ym=substr(date, 1, 7), id=id), sum))
m
# id
# ym 1 2
# 2000-07 2.6 3.1
Data
dat <- structure(list(date = c("2000-07-06", "2000-07-07", "2000-07-09",
"2000-07-10", "2000-07-15", "2000-07-16", "2000-07-20", "2000-07-21",
"2000-07-06", "2000-07-07", "2000-07-15", "2000-07-16", "2000-07-17",
"2000-07-18"), event = c(2L, 1L, 0L, 0L, 2L, 1L, 0L, 1L, 1L,
2L, 0L, 0L, 2L, 1L), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), return = c(0.1, 0.2, 0.6, 0.4, 0.7, 0.3,
0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, 0.1)), row.names = c(NA, -14L
), class = "data.frame")

Multiple uses of setdiff() on consecutive groups without for looping

I would like to apply setdiff between consecutive groups without a for loop, ideally in a data.table way or with a function from the apply family.
Data frame df:
id group
1 L1 1
2 L2 1
3 L1 2
4 L3 2
5 L4 2
6 L3 3
7 L5 3
8 L6 3
9 L1 4
10 L4 4
11 L2 5
I want to know how many new ids there are between consecutive groups. So, for example, if we compare groups 1 and 2, there are two new ids, L3 and L4, so it returns 2 (not with setdiff directly but with length()); if we compare groups 2 and 3, L5 and L6 are the new ids, so it returns 2; and so on.
Expected results :
new_id
2
2
2
1
Data :
structure(list(id = structure(c(1L, 2L, 1L, 3L, 4L, 3L, 5L, 6L,
1L, 4L, 2L), .Label = c("L1", "L2", "L3", "L4", "L5", "L6"), class = "factor"),
group = c(1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5)), class = "data.frame", row.names = c(NA,
-11L), .Names = c("id", "group"))
Here is an option with mapply:
lst <- with(df, split(id, group))
mapply(function(x, y) length(setdiff(y, x)), head(lst, -1), tail(lst, -1))
#1 2 3 4
#2 2 2 1
Here is a data.table way with merge. Suppose the original data.frame is named dt:
library(data.table)
setDT(dt)
dt2 <- copy(dt)[, group := group + 1]
merge(
dt, dt2, by = 'group', allow.cartesian = T
)[, .(n = length(setdiff(id.x, id.y))), by = group]
# group n
# 1: 2 2
# 2: 3 2
# 3: 4 2
# 4: 5 1
You could use Reduce to run a comparison function on pairwise elements in a list. For example
xx <- Reduce(function(a, b) {
  x <- setdiff(b$id, a$id)
  list(id = b$id, new = x, newcount = length(x))
}, split(df, df$group),
accumulate = TRUE)[-1]
Then you can get the counts of new elements out with
sapply(xx, '[[', "newcount")
and you can get the new values with
sapply(xx, '[[', "new")
L <- split(d, d$group) # split data ('d') by group and create a list
# use sapply to access 'id' for each pair of consecutive sub-groups and take the setdiff
sapply(2:length(L), function(i)
  setNames(length(setdiff(L[[i]][, 1], L[[i - 1]][, 1])),
           nm = paste(names(L)[i], names(L)[i - 1], sep = "-")))
#2-1 3-2 4-3 5-4
# 2 2 2 1
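Since the question also asks for a data.table way, here is a sketch using a list column (assuming the question's data frame is named df, as in the mapply answer):
library(data.table)
dt <- as.data.table(df)
# one row per group, holding that group's ids in a list column
grp <- dt[, .(ids = list(unique(id))), keyby = group]
# compare each group's ids with the previous group's ids
prev <- c(list(NULL), head(grp$ids, -1))
mapply(function(cur, p) length(setdiff(cur, p)), grp$ids, prev)[-1]
# [1] 2 2 2 1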
