Subsetting top 4 observations of each unique ID - r

I have a dataframe of 4 columns and a few thousands rows. I am ordering the dataframe according to thier 4th column-which is their ID-(descending) then to the second column (ascending). Here's what my data looks like:
X1 X2 X3 X4
24 1 23 25
21 3 19 25
19 6 20 25
11 12 14 25
14 9 21 24
3 12 25 24
24 15 23 24
8 1 4 23
17 4 12 23
16 11 23 23
20 19 21 23
24 19 16 23
19 20 7 23
19 22 22 22
11 2 18 21
15 9 19 21
10 14 9 21
17 15 19 21
16 20 6 21
I am trying to keep the highest 4 values of each ID (if available), my desired output would be
X1 X2 X3 X4
24 1 23 25
21 3 19 25
19 6 20 25
11 12 14 25
14 9 21 24
3 12 25 24
24 15 23 24
8 1 4 23
17 4 12 23
16 11 23 23
20 19 21 23
19 22 22 22
11 2 18 21
15 9 19 21
10 14 9 21
17 15 19 21
# note that 2 of the 23 ID observations and one of the 21 ID observations were removed.
I was wondering if there is there some short command that can do the job for me? I can think of a command that is around 1 page long! which is subsetting the data according to the 4th column, taking the top 5, then rbind them again. But that sounds so unprofessional!
Here's a command to generate similar example:
m0 <- matrix(0, 100, 4)
df <- data.frame(apply(m0, c(1,2), function(x) sample(c(0:25),1)))
##fix(df)
odf <- df[order(-as.numeric(df$X4), as.numeric(df$X2)), ]
Thanks all.

maybe data.table:
require(data.table)
df<-read.table(header=T,text=" X1 X2 X3 X4
24 1 23 25
21 3 19 25
19 6 20 25
11 12 14 25
14 9 21 24
3 12 25 24
24 15 23 24
8 1 4 23
17 4 12 23
16 11 23 23
20 19 21 23
24 19 16 23
19 20 7 23
19 22 22 22
11 2 18 21
15 9 19 21
10 14 9 21
17 15 19 21
16 20 6 21")
data.table(df)[,.SD[order(X2)][1:4,],by="X4"][!is.na(X3)][,list(X1,X2,X3,X4)]
X1 X2 X3 X4
1: 24 1 23 25
2: 21 3 19 25
3: 19 6 20 25
4: 11 12 14 25
5: 14 9 21 24
6: 3 12 25 24
7: 24 15 23 24
8: 8 1 4 23
9: 17 4 12 23
10: 16 11 23 23
11: 20 19 21 23
12: 19 22 22 22
13: 11 2 18 21
14: 15 9 19 21
15: 10 14 9 21
16: 17 15 19 2
here's what's happening in the data.table call:
data.table(df)[ # data.table of df
,.SD[ # for each by=X4, .SD is the sub-table
order(X2)][1:4,], # first four entries ordered by X2
by="X4"][ # X4 is the grouping variable
!is.na(X3)][ # filter out NAs (i.e. less than 4 entries per row)
,list(X1,X2,X3,X4)] # order the columns

I think that Thomas's solution is fine, but can be improved. I would guess that the splitting, recombining, and reordering might be time consuming.
Instead, I would create a vector from which we can subset.
This is easily done with ave and should work since the data are already ordered.
Continuing from:
odf <- df[order(-as.numeric(df$X4), as.numeric(df$X2)), ]
we can do:
out <- odf[ave(odf$X4, odf$X4, FUN = seq_along) <= 4, ]
head(out)
# X1 X2 X3 X4
# 24 3 4 13 25
# 6 23 5 13 25
# 19 9 11 24 25
# 40 10 13 11 25
# 93 16 2 25 24
# 26 10 11 13 24
tail(out)
# X1 X2 X3 X4
# 61 23 7 13 2
# 2 9 9 5 2
# 17 18 18 16 2
# 67 12 1 1 1
# 52 22 14 24 1
# 9 16 24 6 1
Update: New alternatives and benchmarks
The "dplyr" package would be great for this, and the syntax is pretty compact. But first, let's set some things up to see how fast these options are:
Functions to benchmark
fun1 <- function() {
odf <- df[order(-as.numeric(df$X4), as.numeric(df$X2)), ]
out <- do.call(rbind, lapply(split(odf, odf$X4), function(z) head(z[order(z$X2),],4) ))
out[order(out$X4, decreasing=TRUE),]
}
fun2 <- function() {
odf <- df[order(-as.numeric(df$X4), as.numeric(df$X2)), ]
odf[ave(odf$X4, odf$X4, FUN = seq_along) <= 4, ]
}
fun3 <- function() {
DT <- data.table(df)
DT[, X := -X4]
setkey(DT, X, X2)
DT[, .SD[sequence(min(.N, 4))], by = X][, X:=NULL][]
}
fun4 <- function() {
group_by(arrange(df, desc(X4), X2), X4) %.%
mutate(vals = seq_along(X4)) %.%
filter(vals <= 4)
}
A bigger version of your sample data
set.seed(1)
df <- data.frame(matrix(sample(0:1000, 1000000 * 4, replace = TRUE), ncol = 4))
The necessary packages
library(data.table)
library(dplyr)
library(microbenchmark)
The first two approaches (Thomas's and my first approach) take a fair amount of time, so instead of benchmarking, I'll just time them once.
system.time(fun1())
# user system elapsed
# 6.645 0.007 6.670
system.time(fun2())
# user system elapsed
# 4.053 0.004 4.186
Here's the "dplyr" and "data.table" results.
microbenchmark(fun3(), fun4(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# fun3() 2.157956 2.221746 2.303286 2.343951 2.392391 20
# fun4() 1.169212 1.180780 1.194994 1.206651 1.369922 20
Compare the output of the "dplyr" and "data.table" approaches:
out_DT <- fun3()
out_DP <- fun4()
out_DT
# X1 X2 X3 X4
# 1: 340 0 708 1000
# 2: 144 1 667 1000
# 3: 73 2 142 1000
# 4: 79 2 826 1000
# 5: 169 0 870 999
# ---
# 4000: 46 4 2 1
# 4001: 88 0 809 0
# 4002: 535 0 522 0
# 4003: 75 3 234 0
# 4004: 983 3 492 0
head(out_DP, 5)
# Source: local data frame [5 x 5]
# Groups: X4
#
# X1 X2 X3 X4 vals
# 1 340 0 708 1000 1
# 2 144 1 667 1000 2
# 3 73 2 142 1000 3
# 4 79 2 826 1000 4
# 5 169 0 870 999 1
tail(out_DP, 5)
# Source: local data frame [5 x 5]
# Groups: X4
#
# X1 X2 X3 X4 vals
# 4000 46 4 2 1 4
# 4001 88 0 809 0 1
# 4002 535 0 522 0 2
# 4003 75 3 234 0 3
# 4004 983 3 492 0 4

I include your code again with a set.seed call, so that this is exactly reproducible.
set.seed(1)
m0 <- matrix(0, 100, 4)
df <- data.frame(apply(m0, c(1,2), function(x) sample(c(0:25),1)))
odf <- df[order(-as.numeric(df$X4), as.numeric(df$X2)), ]
Here's the code you need using a split-apply-combine strategy:
out <- do.call(rbind, lapply(split(odf, odf$X4), function(z) head(z[order(z$X2),],4) ))
out <- out[order(out$X4, decreasing=TRUE),]
Result:
> dim(out)
[1] 79 4
> head(out)
X1 X2 X3 X4
25.24 3 4 13 25
25.6 23 5 13 25
25.19 9 11 24 25
25.40 10 13 11 25
24.93 16 2 25 24
24.26 10 11 13 24

Related

Combination of all pairs of rows using R

Here is my dataset:
data <- read.table(header = TRUE, text = "
group index group_index x y z
a 1 a1 12 13 14
a 2 a2 15 20 22
b 1 b1 24 17 28
b 2 b2 12 19 30
b 3 b3 31 32 33 ")
For each case in group "a" and each case in group "b", I wanna combine their x, y, z values in a row, so the data matrix or dataframe I want will look like:
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] a1_b1 12 13 14 24 17 28 # x,y,z for a1, follows by x,y,z for b1
[2,] a1_b2 12 13 14 12 19 30 # x,y,z for a1, follows by x,y,z for b2
[3,] a1_b3 12 13 14 31 32 33
[4,] a2_b1 15 20 22 24 17 28 # x,y,z for a2, follows by x,y,z for b1
[5,] a2_b2 15 20 22 12 19 30
[6,] a2_b3 15 20 22 31 32 33
I'm wondering how to achieve this goal? Thanks so much!
We can split data based on group and take a cartesian product using merge
list_df <- split(data[c("x", "y", "z")], data$group)
out <- merge(list_df[[1]], list_df[[2]], by = NULL)
out[do.call(order, out), ]
# x.x y.x z.x x.y y.y z.y
#3 12 13 14 12 19 30
#1 12 13 14 24 17 28
#5 12 13 14 31 32 33
#4 15 20 22 12 19 30
#2 15 20 22 24 17 28
#6 15 20 22 31 32 33
You could also do a join on non-matching group values (< instead of != to avoid repeating pairs)
library(data.table)
setDT(data)
data[data, on = .(group < group),
.(g = paste0(group_index, '_', i.group_index),
x, y, z, i.x, i.y, i.z),
nomatch = NULL]
# g x y z i.x i.y i.z
# 1: a1_b1 12 13 14 24 17 28
# 2: a2_b1 15 20 22 24 17 28
# 3: a1_b2 12 13 14 12 19 30
# 4: a2_b2 15 20 22 12 19 30
# 5: a1_b3 12 13 14 31 32 33
# 6: a2_b3 15 20 22 31 32 33
A simple solution using dplyr:
library(tidyverse)
dcross <- left_join(data, data, by=character(), suffix=c("1", "2")) |>
filter(group1 != group2)
# index1 group_index1 x1 y1 index2 group_index2 x2 y2
# 1 1 a1 12 13 1 b1 24 17
# 2 1 a1 12 13 2 b2 12 19
# 3 1 a1 12 13 3 b3 31 32
# 4 2 a2 15 20 1 b1 24 17
# 5 2 a2 15 20 2 b2 12 19
# 6 2 a2 15 20 3 b3 31 32
And to get the described matrix from the dataframe
dcross |>
select(matches("^[xyz]\\d")) |>
as.matrix()
# x1 y1 z1 x2 y2 z2
# [1,] 12 13 14 24 17 28
# [2,] 12 13 14 12 19 30
# [3,] 12 13 14 31 32 33
# [4,] 15 20 22 24 17 28
# [5,] 15 20 22 12 19 30
# [6,] 15 20 22 31 32 33

Assigning Values based on row value

I have a large vector (column of a data frame) where values containing integers 1 to 30. I want to replace numbers from 1 to 5 with 1, 6 to 10 with 5, 11 to 15 with 9...
> x3 <- sample(1:30, 100, rep=TRUE)
> x3
[1] 13 24 16 30 10 6 15 10 3 17 18 22 11 13 29 7 25 28 17 27 1 5 6 20 15 15 8 10 13 26 27 24 3 24 5 7 10 6 28 27 1 4 22 25 14 13 2 10 4 29 23 24 30 24 29 11 2 28 23 1 1 2
[63] 3 23 13 26 21 22 11 4 8 26 17 11 20 23 6 14 24 5 15 21 11 13 6 14 20 11 22 9 6 29 4 30 20 30 4 24 23 29
As I mentioned this is a column in a data frame and with above assignment I want to create a different column. If I do the following I have to do this 30 times.
myFrame$NewColumn[myFrame$oldColumn==1] <- 1
myFrame$NewColumn[myFrame$oldColumn==2] <- 1
myFrame$NewColumn[myFrame$oldColumn==3] <- 1
...
Whats a better way to do this?
We can do this with cut (suppose what you mean by '...' is 10, 11, 12):
x4 <- cut(x3,
breaks = c(seq(1, 30, 5), 30), right = F, include.lowest = T, # generate correct intervals
labels = 4 * (0:5) + 1) # number to fill
# x4 is factor. We should convert it to character first then to the number
x4 <- as.numeric(as.character(x4))
Did you try:
myFrame$NewColumn[myFrame$oldColumn > 0 & myFrame$oldColumn< 6] <- 1
myFrame$NewColumn[myFrame$oldColumn > 5 & myFrame$oldColumn< 11] <- 1
...
Or even better:
myFrame$NewColumn <- as.integer((myFrame$oldColumn - 1)/5)) * 4 + 1

Assign weights in lpSolveAPI to prioritise variables

I am trying to set up a linear programming solution using lpSolveAPI and R to solve a scheduling problem. Below is a small sample of the data; the minutes required for each session id, and their 'preferred' order/weight.
id <- 1:100
min <- sample(0:500, 100)
weight <- (1:100)/sum(1:100)
data <- data.frame(id, min, weight)
What I want to do is arrange/schedule these session IDs so that there are maximum number sessions in a day, preferably by their weight and each day is capped by a total of 400 minutes.
This is how I have set it up currently in R:
require(lpSolveAPI)
#Set up matrix to hold results; each row represents day
r <- 5
c <- 10
row <- 1
results <- matrix(0, nrow = r, ncol = c)
rownames(results) <- format(seq(Sys.Date(), by = "days", length.out = r), "%Y-%m-%d")
for (i in 1:r){
for(j in 1:c){
lp <- make.lp(0, nrow(data))
set.type(lp, 1:nrow(data), "binary")
set.objfn(lp, rep(1, nrow(data)))
lp.control(lp, sense = "max")
add.constraint(lp, data$min, "<=", 400)
set.branch.weights(lp, data$weight)
solve(lp)
a <- get.variables(lp)*data$id
b <- a[a!=0]
tryCatch(results[row, 1:length(b)] <- b, error = function(x) 0)
if(dim(data[!data$id == a,])[1] > 0) {
data <- data[!data$id== a,]
row <- row + 1
}
break
}
}
sum(results > 0)
barplot(results) #View of scheduled IDs
A quick look at the results matrix tells me that while the setup works to maximise number of sessions so that the total minutes in a day are close to 400 as possible, the setup doesn't follow the weights given. I expect my results matrix to be filled with increasing session IDs.
I have tried assigning different weights, weights in reverse order etc. but for some reason my setup doesn't seem to enforce "set.branch.weights".
I have read the documentation for "set.branch.weights" from lpSolveAPI but I think I am doing something wrong here.
Example - Data:
id min weight
1 67 1
2 72 2
3 36 3
4 91 4
5 80 5
6 44 6
7 76 7
8 58 8
9 84 9
10 96 10
11 21 11
12 1 12
13 41 13
14 66 14
15 89 15
16 62 16
17 11 17
18 42 18
19 68 19
20 25 20
21 44 21
22 90 22
23 4 23
24 33 24
25 31 25
Should be
Day 1 67 72 36 91 80 44 76
Day 2 58 84 96 21 1 41 66 89
Day 3 62 11 42 68 25 44 90 4 33 31
Each day has a cumulative sum of <= 480m.
My simple minded approach:
df = read.table(header=T,text="
id min weight
1 67 1
2 72 2
3 36 3
4 91 4
5 80 5
6 44 6
7 76 7
8 58 8
9 84 9
10 96 10
11 21 11
12 1 12
13 41 13
14 66 14
15 89 15
16 62 16
17 11 17
18 42 18
19 68 19
20 25 20
21 44 21
22 90 22
23 4 23
24 33 24
25 31 25")
# assume sorted by weight
daynr = 1
daymax = 480
dayusd = 0
for (i in 1:nrow(df))
{
v = df$min[i]
dayusd = dayusd + v
if (dayusd>daymax)
{
daynr = daynr + 1
dayusd = v
}
df$day[[i]] = daynr
}
This will give:
> df
id min weight day
1 1 67 1 1
2 2 72 2 1
3 3 36 3 1
4 4 91 4 1
5 5 80 5 1
6 6 44 6 1
7 7 76 7 1
8 8 58 8 2
9 9 84 9 2
10 10 96 10 2
11 11 21 11 2
12 12 1 12 2
13 13 41 13 2
14 14 66 14 2
15 15 89 15 2
16 16 62 16 3
17 17 11 17 3
18 18 42 18 3
19 19 68 19 3
20 20 25 20 3
21 21 44 21 3
22 22 90 22 3
23 23 4 23 3
24 24 33 24 3
25 25 31 25 3
>
I will concentrate on the first solve. We basically solve a knapsack problem (objective + one constraint):
When I run this model as is I get:
> solve(lp)
[1] 0
> x <- get.variables(lp)
> weightx <- data$weight * x
> sum(x)
[1] 14
> sum(weightx)
[1] 0.5952381
Now when I change the objective to
I get:
> solve(lp)
[1] 0
> x <- get.variables(lp)
> weightx <- data$weight * x
> sum(x)
[1] 14
> sum(weightx)
[1] 0.7428571
I.e. the count stayed at 14, but the weight improved.

Add data frames row wise with [d]plyr

I have two data frames
df1
# a b
# 1 10 20
# 2 11 21
# 3 12 22
# 4 13 23
# 5 14 24
# 6 15 25
df2
# a b
# 1 4 8
I want the following output:
df3
# a b
# 1 14 28
# 2 15 29
# 3 16 30
# 4 17 31
# 5 18 32
# 6 19 33
i.e. add df2 to each row of df1.
Is there a way to get the desired output using plyr (mdplyr??) or dplyr?
I see no reason for "dplyr" for something like this. In base R you could just do:
df1 + unclass(df2)
# a b
# 1 14 28
# 2 15 29
# 3 16 30
# 4 17 31
# 5 18 32
# 6 19 33
Which is the same as df1 + list(4, 8).
One liner with dplyr.
mutate_each(df1, funs(.+ df2$.), a:b)
# a b
#1 14 28
#2 15 29
#3 16 30
#4 17 31
#5 18 32
#6 19 33
A base R solution using sweet function sweep:
sweep(df1, 2, unlist(df2), '+')
# a b
#1 14 28
#2 15 29
#3 16 30
#4 17 31
#5 18 32
#6 19 33

Combine two dataframes one above the other

I have two dataframes and I want to put one above the other "with" column names of second as a row of the new dataframe. Column names are different and one dataframe has more columns.
For example:
mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
mydf1
V1 V2
1 1 21
2 2 22
3 3 23
4 4 24
5 5 25
mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
mydf2
C1 C2 C3
1 1 21 41
2 2 22 42
3 3 23 43
4 4 24 44
5 5 25 45
6 6 26 46
7 7 27 47
8 8 28 48
9 9 29 49
10 10 30 50
Result:
mydf
V1 V2
1 1 21 NA
2 2 22 NA
3 3 23 NA
4 4 24 NA
5 5 25 NA
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
I dont care if all numeric values treated like characters.
Many thanks
You can do this easily without any packages:
mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
mydf1[,3] <- NA
names(mydf1) <- c("one", "two", "three")
mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
names <- t(as.data.frame(names(mydf2)))
names <- as.data.frame(names)
names(mydf2) <- c("one", "two", "three")
names(names) <- c("one", "two", "three")
mydf3 <- rbind(mydf1, names)
mydf4 <- rbind(mydf3, mydf2)
> mydf4
one two three
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
>
Of course, you can edit the <- c("one", "two", "three") to make the final column names whatever you'd like. For example:
> mydf1 <- data.frame(V1=c(1:5), V2=c(21:25))
> mydf1[,3] <- NA
> names(mydf1) <- c("V1", "V2", "NA")
> mydf2 <- data.frame(C1=c(1:10), C2=c(21:30),C3=c(41:50))
> names <- t(as.data.frame(names(mydf2)))
> names <- as.data.frame(names)
> names(mydf2) <- c("V1", "V2", "NA")
> names(names) <- c("V1", "V2", "NA")
> mydf3 <- rbind(mydf1, names)
> mydf4 <- rbind(mydf3, mydf2)
> row.names(mydf4) <- NULL
> mydf4
V1 V2 NA
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
If you need to resort a package for any reason when scaling this up to your real use case, then try melt from reshape2 or the package plyr. However, use of a package shouldn't be necessary.
I don't know what you tried with write.table, but that seems to me like the way to go.
I would create a function something like this:
myFun <- function(...) {
L <- list(...)
temp <- tempfile()
maxCol <- max(vapply(L, ncol, 1L))
lapply(L, function(x)
suppressWarnings(
write.table(x, file = temp, row.names = FALSE,
sep = ",", append = TRUE)))
read.csv(temp, header = FALSE, fill = TRUE,
col.names = paste0("New_", sequence(maxCol)),
stringsAsFactors = FALSE)
}
Usage would then simply be:
myFun(mydf1, mydf2)
# New_1 New_2 New_3
# 1 V1 V2
# 2 1 21
# 3 2 22
# 4 3 23
# 5 4 24
# 6 5 25
# 7 C1 C2 C3
# 8 1 21 41
# 9 2 22 42
# 10 3 23 43
# 11 4 24 44
# 12 5 25 45
# 13 6 26 46
# 14 7 27 47
# 15 8 28 48
# 16 9 29 49
# 17 10 30 50
The function is written such that you can specify more than two data.frames as input:
mydf3 <- data.frame(matrix(1:8, ncol = 4))
myFun(mydf1, mydf2, mydf3)
# New_1 New_2 New_3 New_4
# 1 V1 V2
# 2 1 21
# 3 2 22
# 4 3 23
# 5 4 24
# 6 5 25
# 7 C1 C2 C3
# 8 1 21 41
# 9 2 22 42
# 10 3 23 43
# 11 4 24 44
# 12 5 25 45
# 13 6 26 46
# 14 7 27 47
# 15 8 28 48
# 16 9 29 49
# 17 10 30 50
# 18 X1 X2 X3 X4
# 19 1 3 5 7
# 20 2 4 6 8
Here's one approach with the rbind.fill function (part of the plyr package).
library(plyr)
setNames(rbind.fill(setNames(mydf1, names(mydf2[seq(mydf1)])),
rbind(names(mydf2), mydf2)), names(mydf1))
V1 V2 NA
1 1 21 <NA>
2 2 22 <NA>
3 3 23 <NA>
4 4 24 <NA>
5 5 25 <NA>
6 C1 C2 C3
7 1 21 41
8 2 22 42
9 3 23 43
10 4 24 44
11 5 25 45
12 6 26 46
13 7 27 47
14 8 28 48
15 9 29 49
16 10 30 50
Give this a try.
Assign the column names from the second data set to a vector, and then replace the second set's names with the names from the first set. Then create a list where the middle element is the vector you assigned. Now when you call rbind, it should be fine since everything is in the right order.
d1$V3 <- NA
nm <- names(d2)
names(d2) <- names(d1)
dc <- do.call(rbind, list(d1,nm,d2))
rownames(dc) <- NULL
dc

Resources