Split duplicated rows of matrices in lists with base R - r

I have a list of matrices, with duplicated values in column id. How can I split the duplicates in all list elements?
The way I do it with data.frames is with lapply + split + duplicated, but that doesnt work with matrices, as they are also split up in numerics. I would like to keep the matrix structure.
## Data.frame - all good
df <- data.frame(
id = rep(1:10, each = 2),
val = rep(10, each = 20)
)
df_list <- rep(list(df), 2);
lapply(df_list, function(x){split(x, duplicated(x[,'id']))$'FALSE'})
## Matrix - Here's my problem
mt <- as.matrix(data.frame(
id = rep(seq(1,10,1), each = 2),
val = rep(10, each = 20)
))
mt_list <- rep(list(mt), 2)
lapply(mt_list, function(x){split(x, duplicated(x[,'id']))$'FALSE'})

Maybe try
split(df,ave(df$id, df$id, FUN= function(x) seq_along(x)))
$`1`
id val
1 1 10
3 2 10
5 3 10
7 4 10
9 5 10
11 6 10
13 7 10
15 8 10
17 9 10
19 10 10
$`2`
id val
2 1 10
4 2 10
6 3 10
8 4 10
10 5 10
12 6 10
14 7 10
16 8 10
18 9 10
20 10 10

While writing that question and fiddling around with the code, I came up with a solution.
Since i didnt find anything about this specific setup, I though I'll post it anyway.
The functions subset / subset.matrix work:
lapply(mt_list, function(x){subset.matrix(x, !duplicated(x[,'id']))})
I benchmarked the different options; subset.matrix seems to be slightly faster than just subset.
mt <- as.matrix(data.frame(
id = rep(seq(1,1000,1), each = 2),
val = rep(1000, each = 20)
))
mt_list <- rep(list(mt), 50)
mc <- microbenchmark::microbenchmark(
subset = lapply(mt_list, function(x){subset(x, !duplicated(x[,'id']))}),
subset.matrix = lapply(mt_list, function(x){subset.matrix(x, !duplicated(x[,'id']))}),
split = lapply(mt_list, function(x){matrix(split(x, duplicated(x[,'id']))$'FALSE', ncol = 2)}),
unique = lapply( mt_list, unique )
)
mc
Unit: milliseconds
expr min lq mean median uq max neval cld
subset 3.758708 3.862849 4.256363 3.900580 3.981629 9.713416 100 a
subset.matrix 3.583632 3.700450 4.174137 3.729881 3.821947 9.611992 100 a
split 32.630604 33.061503 34.535531 33.262841 33.726039 77.531039 100 b
unique 144.832487 148.408874 155.099591 150.326865 155.456601 430.992916 100 c

Related

How to store replicate runs in a dataframe

I have a function (call it random_func) that generates random numbers according to some rules using parameters. I'm trying to repeatedly call that function and store the results in a dataframe.
df <- lapply(c(1,2,3,4,5), FUN = function(x) replicate(100, expr = random_func(n=10, param=x)))
Right now, the output is a list of 5 vectors each with 100 elements. What R voodoo do I need to do in order to get it to look something like:
param, result
1, 5
1, 6
1, 8
...
5, 10
set.seed(42)
do.call(rbind, #rbind results for different x together
lapply(c(1,2), FUN = function(x)
data.frame(param = x, #will be recycled
result = do.call(what = c, #concatenate results of replicate
replicate(n = 2,
expr = rnorm(n = 3, mean = x), #replace with random_func
simplify = FALSE))))) #when FALSE, replicate returns list
# param result
# 1 1 2.3709584
# 2 1 0.4353018
# 3 1 1.3631284
# 4 1 1.6328626
# 5 1 1.4042683
# 6 1 0.8938755
# 7 2 3.5115220
# 8 2 1.9053410
# 9 2 4.0184237
# 10 2 1.9372859
# 11 2 3.3048697
# 12 2 4.2866454
rerun and map_df solution
from purrr
library(dplyr)
library(purrr)
Random function
random_func <- function(n, param) {
rnorm(n)+(param*10)
}
solution
myfun <- function() {
df <- 100 %>%
rerun(x=10, y=1:5) %>%
map_df(~data.frame(param=.x$y, result=random_func(n=.x$x, param=.x$y)))
}
Output
df <- myfun()
head(df)
param result
1 1 10.15325
2 2 19.52867
3 3 30.08218
4 4 40.06418
5 5 48.39804
6 1 11.00435
Additional validation
df %>%
group_by(param) %>%
summarise(mean = mean(result))
param mean
1 1 10.00634
2 2 20.03874
3 3 30.11093
4 4 40.06166
5 5 50.02632
Performance
library(microbenchmark)
microbenchmark(myfun())
expr min lq mean median uq max neval
myfun() 65.93166 66.80521 69.42349 68.5152 69.57185 90.77295 100

How can I add a row manually to a dataframe? [duplicate]

The following code combines a vector with a dataframe:
newrow = c(1:4)
existingDF = rbind(existingDF,newrow)
However this code always inserts the new row at the end of the dataframe.
How can I insert the row at a specified point within the dataframe? For example, lets say the dataframe has 20 rows, how can I insert the new row between rows 10 and 11?
Here's a solution that avoids the (often slow) rbind call:
existingDF <- as.data.frame(matrix(seq(20),nrow=5,ncol=4))
r <- 3
newrow <- seq(4)
insertRow <- function(existingDF, newrow, r) {
existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
existingDF[r,] <- newrow
existingDF
}
> insertRow(existingDF, newrow, r)
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 1 2 3 4
4 3 8 13 18
5 4 9 14 19
6 5 10 15 20
If speed is less important than clarity, then #Simon's solution works well:
existingDF <- rbind(existingDF[1:r,],newrow,existingDF[-(1:r),])
> existingDF
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 3 8 13 18
4 1 2 3 4
41 4 9 14 19
5 5 10 15 20
(Note we index r differently).
And finally, benchmarks:
library(microbenchmark)
microbenchmark(
rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
insertRow(existingDF,newrow,r)
)
Unit: microseconds
expr min lq median uq max
1 insertRow(existingDF, newrow, r) 660.131 678.3675 695.5515 725.2775 928.299
2 rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 801.161 831.7730 854.6320 881.6560 10641.417
Benchmarks
As #MatthewDowle always points out to me, benchmarks need to be examined for the scaling as the size of the problem increases. Here we go then:
benchmarkInsertionSolutions <- function(nrow=5,ncol=4) {
existingDF <- as.data.frame(matrix(seq(nrow*ncol),nrow=nrow,ncol=ncol))
r <- 3 # Row to insert into
newrow <- seq(ncol)
m <- microbenchmark(
rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
insertRow(existingDF,newrow,r),
insertRow2(existingDF,newrow,r)
)
# Now return the median times
mediansBy <- by(m$time,m$expr, FUN=median)
res <- as.numeric(mediansBy)
names(res) <- names(mediansBy)
res
}
nrows <- 5*10^(0:5)
benchmarks <- sapply(nrows,benchmarkInsertionSolutions)
colnames(benchmarks) <- as.character(nrows)
ggplot( melt(benchmarks), aes(x=Var2,y=value,colour=Var1) ) + geom_line() + scale_x_log10() + scale_y_log10()
#Roland's solution scales quite well, even with the call to rbind:
5 50 500 5000 50000 5e+05
insertRow2(existingDF, newrow, r) 549861.5 579579.0 789452 2512926 46994560 414790214
insertRow(existingDF, newrow, r) 895401.0 905318.5 1168201 2603926 39765358 392904851
rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 787218.0 814979.0 1263886 5591880 63351247 829650894
Plotted on a linear scale:
And a log-log scale:
insertRow2 <- function(existingDF, newrow, r) {
existingDF <- rbind(existingDF,newrow)
existingDF <- existingDF[order(c(1:(nrow(existingDF)-1),r-0.5)),]
row.names(existingDF) <- 1:nrow(existingDF)
return(existingDF)
}
insertRow2(existingDF,newrow,r)
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 1 2 3 4
4 3 8 13 18
5 4 9 14 19
6 5 10 15 20
microbenchmark(
+ rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
+ insertRow(existingDF,newrow,r),
+ insertRow2(existingDF,newrow,r)
+ )
Unit: microseconds
expr min lq median uq max
1 insertRow(existingDF, newrow, r) 513.157 525.6730 531.8715 544.4575 1409.553
2 insertRow2(existingDF, newrow, r) 430.664 443.9010 450.0570 461.3415 499.988
3 rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 606.822 625.2485 633.3710 653.1500 1489.216
The .before argument in dplyr::add_row can be used to specify the row.
dplyr::add_row(
cars,
speed = 0,
dist = 0,
.before = 3
)
#> speed dist
#> 1 4 2
#> 2 4 10
#> 3 0 0
#> 4 7 4
#> 5 7 22
#> 6 8 16
#> ...
You should try dplyr package
library(dplyr)
a <- data.frame(A = c(1, 2, 3, 4),
B = c(11, 12, 13, 14))
system.time({
for (i in 50:1000) {
b <- data.frame(A = i, B = i * i)
a <- bind_rows(a, b)
}
})
Output
user system elapsed
0.25 0.00 0.25
In contrast with using rbind function
a <- data.frame(A = c(1, 2, 3, 4),
B = c(11, 12, 13, 14))
system.time({
for (i in 50:1000) {
b <- data.frame(A = i, B = i * i)
a <- rbind(a, b)
}
})
Output
user system elapsed
0.49 0.00 0.49
There is some performance gain.
Insert blank row after five row in data frame and use this library package.
library(berryFunctions)
df <- insertRows(df, 5 , new = "")

add values that are the same within a vector

I have a vector
x <- c(1,2,5,4,3,1,1,4,2,6,7,2,4,1,5)
and I want to add the values that are the same, giving me the new vector
x <- c(4, 6, 3, 12, 10, 6, 7)
It sounds quite simple but I am stuck.
You can use sapply to iterate over the vector of the unique values, and then sum each one of the corresponding entries, like so:
> sapply(unique(x), function(i) sum(x[x == i]))
[1] 4 6 10 12 3 6 7
If the order is relevant, please indicate which order do you want.
In this solution, the order is the same as the output of unique, which you can use to know what is the sum of what value.
> unique(x)
[1] 1 2 5 4 3 6 7
Edit
It looks like you want the ascending order of unique values. In that case, you can do like this:
> sapply(sort(unique(x)), function(i) sum(x[x == i]))
[1] 4 6 3 12 10 6 7
aggregate(x, list(number = x), FUN = sum )
# number x
#1 1 4
#2 2 6
#3 3 3
#4 4 12
#5 5 10
#6 6 6
#7 7 7
The result is a data.frame and you can extract the second column as usual.
Here's another option, for fun:
with(rle(sort(x)), lengths * values)
# [1] 4 6 3 12 10 6 7
Benchmarks
library(microbenchmark)
x <- c(1,2,5,4,3,1,1,4,2,6,7,2,4,1,5)
x <- rep(x, length.out=1000)
matthew <- function() with(rle(sort(x)), lengths * values)
iled <- function() sapply(sort(unique(x)), function(i) sum(x[x == i]))
kota <- function() as.numeric(table(x) * as.integer(names(table(x))))
deena <- function() {
freqTable = as.data.frame(table(x))
as.numeric(as.character(freqTable$x)) * freqTable$Freq
}
roland <- function() aggregate(x, list(number = x), FUN = sum )$x
microbenchmark(matthew(), iled(), kota(), deena(), roland())
# Unit: microseconds
# expr min lq mean median uq max neval
# matthew() 105.5 116.9 167.5 122.5 131.3 1466 100
# iled() 111.2 125.6 160.3 131.4 138.8 1449 100
# kota() 1821.5 1899.3 1960.4 1915.9 1940.7 3031 100
# deena() 1124.7 1175.6 1221.1 1187.9 1207.7 2700 100
# roland() 1912.2 1967.9 2116.6 1995.5 2078.5 3610 100
One way to do that would be multiplying each element with its frequency. The table function does a good with that :
freqTable = as.data.frame(table(x))
requiredResult = as.numeric(as.character(freqTable$x)) * freqTable$Freq
You want to compute #n * n for each n, where #n is the number of occurrence of n.
Just an alternative approach to the sapply above.
table(x) * as.integer(names(table(x)))
# x
# 1 2 3 4 5 6 7
# 4 6 3 12 10 6 7

Apply a correction factor to one column based on the value of a second column

Example Data
A<-c(1,4,5,6,2,3,4,5,6,7,8,7)
B<-c(4,6,7,8,2,2,2,3,8,8,7,8)
DF<-data.frame(A,B)
What I would like to do is apply a correction factor to column A, based on the values of column B. The rules would be something like this
If B less than 4 <- Multiply A by 1
If B equal to 4 and less than 6 <- Multiply A by 2
If B equal or greater than 6 <- Multiply by 4
I suppose I could write an "if" statement (and I'd be glad to see a good example), but I'd also be interested in using square bracket indexing to speed things up.
The end result would look like this
A B
2 4
16 6
20 7
24 8
ect
Use this:
within(DF, A <- ifelse(B>=6, 4, ifelse(B<4, 1, 2)) * A)
Or this (corrected by #agstudy):
within(DF, {A[B>=6] <- A[B>=6]*4; A[B>=4 & B<6] <- A[B>=4 & B<6]*2})
Benchmarking:
DF <- data.frame(A=rpois(1e4, 5), B=rpois(1e4, 5))
a <- function(DF) within(DF, A <- ifelse(B>=6, 4, ifelse(B<4, 1, 2)) * A)
b <- function(DF) within(DF, {A[B>=6] <- A[B>=6]*4; A[B>=4 & B<6] <- A[B>=4 & B<6]*2})
identical(a(DF), b(DF))
#[1] TRUE
microbenchmark(a(DF), b(DF), times=1000)
#Unit: milliseconds
# expr min lq median uq max neval
# a(DF) 8.603778 10.253799 11.07999 11.923116 53.91140 1000
# b(DF) 3.763470 3.889065 5.34851 5.480294 39.72503 1000
Similar to #Ferdinand solution but using transform
transform(DF, newcol = ifelse(B<4, A,
ifelse(B>=6,4*A,2*A)))
A B newcol
1 1 4 2
2 4 6 16
3 5 7 20
4 6 8 24
5 2 2 2
6 3 2 3
7 4 2 4
8 5 3 5
9 6 8 24
10 7 8 28
11 8 7 32
12 7 8 28
I prefer to use findInterval as an index into a set of factors for such operations. The proliferation of nested test-conditional and consequent vectors with multiple ifelse calls offends my efficiency sensibilities:
DF$A <- DF$A * c(1,2,4)[findInterval(DF$B, c(-Inf,4,6,Inf) ) ]
DF
A B
1 2 4
2 16 6
3 20 7
4 24 8
snipped ....
Benchmark:
DF <- data.frame(A=rpois(1e4, 5), B=rpois(1e4, 5))
a <- function(DF) within(DF, A <- ifelse(B>=6, 4, ifelse(B<4, 1, 2)) * A)
b <- function(DF) within(DF, {A[B>=6] <- A[B>=6]*4; A[B>=4 & B<6] <- A[B>=4 & B<6]*2})
ccc <- function(DF) within(DF, {A * c(1,2,4)[findInterval(B, c(-Inf,4,6,Inf) ) ]})
microbenchmark(a(DF), b(DF), ccc(DF), times=1000)
#-----------
Unit: microseconds
expr min lq median uq max neval
a(DF) 7616.107 7843.6320 8105.0340 8322.5620 93549.85 1000
b(DF) 2638.507 2789.7330 2813.8540 3072.0785 92389.57 1000
ccc(DF) 604.555 662.5335 676.0645 698.8665 85375.14 1000
Note: I would not have done this using within if I were coding my own function, but thought for fairness to the earlier effort, I would make it apples <-> apples.

Add new row to dataframe, at specific row-index, not appended?

The following code combines a vector with a dataframe:
newrow = c(1:4)
existingDF = rbind(existingDF,newrow)
However this code always inserts the new row at the end of the dataframe.
How can I insert the row at a specified point within the dataframe? For example, lets say the dataframe has 20 rows, how can I insert the new row between rows 10 and 11?
Here's a solution that avoids the (often slow) rbind call:
existingDF <- as.data.frame(matrix(seq(20),nrow=5,ncol=4))
r <- 3
newrow <- seq(4)
insertRow <- function(existingDF, newrow, r) {
existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
existingDF[r,] <- newrow
existingDF
}
> insertRow(existingDF, newrow, r)
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 1 2 3 4
4 3 8 13 18
5 4 9 14 19
6 5 10 15 20
If speed is less important than clarity, then #Simon's solution works well:
existingDF <- rbind(existingDF[1:r,],newrow,existingDF[-(1:r),])
> existingDF
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 3 8 13 18
4 1 2 3 4
41 4 9 14 19
5 5 10 15 20
(Note we index r differently).
And finally, benchmarks:
library(microbenchmark)
microbenchmark(
rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
insertRow(existingDF,newrow,r)
)
Unit: microseconds
expr min lq median uq max
1 insertRow(existingDF, newrow, r) 660.131 678.3675 695.5515 725.2775 928.299
2 rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 801.161 831.7730 854.6320 881.6560 10641.417
Benchmarks
As #MatthewDowle always points out to me, benchmarks need to be examined for the scaling as the size of the problem increases. Here we go then:
benchmarkInsertionSolutions <- function(nrow=5,ncol=4) {
existingDF <- as.data.frame(matrix(seq(nrow*ncol),nrow=nrow,ncol=ncol))
r <- 3 # Row to insert into
newrow <- seq(ncol)
m <- microbenchmark(
rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
insertRow(existingDF,newrow,r),
insertRow2(existingDF,newrow,r)
)
# Now return the median times
mediansBy <- by(m$time,m$expr, FUN=median)
res <- as.numeric(mediansBy)
names(res) <- names(mediansBy)
res
}
nrows <- 5*10^(0:5)
benchmarks <- sapply(nrows,benchmarkInsertionSolutions)
colnames(benchmarks) <- as.character(nrows)
ggplot( melt(benchmarks), aes(x=Var2,y=value,colour=Var1) ) + geom_line() + scale_x_log10() + scale_y_log10()
#Roland's solution scales quite well, even with the call to rbind:
5 50 500 5000 50000 5e+05
insertRow2(existingDF, newrow, r) 549861.5 579579.0 789452 2512926 46994560 414790214
insertRow(existingDF, newrow, r) 895401.0 905318.5 1168201 2603926 39765358 392904851
rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 787218.0 814979.0 1263886 5591880 63351247 829650894
Plotted on a linear scale:
And a log-log scale:
insertRow2 <- function(existingDF, newrow, r) {
existingDF <- rbind(existingDF,newrow)
existingDF <- existingDF[order(c(1:(nrow(existingDF)-1),r-0.5)),]
row.names(existingDF) <- 1:nrow(existingDF)
return(existingDF)
}
insertRow2(existingDF,newrow,r)
V1 V2 V3 V4
1 1 6 11 16
2 2 7 12 17
3 1 2 3 4
4 3 8 13 18
5 4 9 14 19
6 5 10 15 20
microbenchmark(
+ rbind(existingDF[1:r,],newrow,existingDF[-(1:r),]),
+ insertRow(existingDF,newrow,r),
+ insertRow2(existingDF,newrow,r)
+ )
Unit: microseconds
expr min lq median uq max
1 insertRow(existingDF, newrow, r) 513.157 525.6730 531.8715 544.4575 1409.553
2 insertRow2(existingDF, newrow, r) 430.664 443.9010 450.0570 461.3415 499.988
3 rbind(existingDF[1:r, ], newrow, existingDF[-(1:r), ]) 606.822 625.2485 633.3710 653.1500 1489.216
The .before argument in dplyr::add_row can be used to specify the row.
dplyr::add_row(
cars,
speed = 0,
dist = 0,
.before = 3
)
#> speed dist
#> 1 4 2
#> 2 4 10
#> 3 0 0
#> 4 7 4
#> 5 7 22
#> 6 8 16
#> ...
You should try dplyr package
library(dplyr)
a <- data.frame(A = c(1, 2, 3, 4),
B = c(11, 12, 13, 14))
system.time({
for (i in 50:1000) {
b <- data.frame(A = i, B = i * i)
a <- bind_rows(a, b)
}
})
Output
user system elapsed
0.25 0.00 0.25
In contrast with using rbind function
a <- data.frame(A = c(1, 2, 3, 4),
B = c(11, 12, 13, 14))
system.time({
for (i in 50:1000) {
b <- data.frame(A = i, B = i * i)
a <- rbind(a, b)
}
})
Output
user system elapsed
0.49 0.00 0.49
There is some performance gain.
Insert blank row after five row in data frame and use this library package.
library(berryFunctions)
df <- insertRows(df, 5 , new = "")

Resources