Averaging every n columns while keeping the first two columns in the new data.frame in R

I have a data frame of a daily time series with 4 observations per day (every 6 hours) for each x and y (I have 202,552 cells).
> head(tab,10)
x y X1990.05.01.01.00.00 X1990.05.01.07.00.00 X1990.05.01.13.00.00 X1990.05.01.19.00.00 X1990.05.02.01.00.00 X1990.05.02.07.00.00 X1990.05.02.13.00.00
1 5.000 60 276.9105 277.8516 278.9908 279.2422 279.6751 279.8078 280.4396
2 5.125 60 276.8863 277.8682 278.9966 279.2543 279.6863 279.7885 280.4033
3 5.250 60 276.8621 277.8830 279.0006 279.2659 279.6989 279.7688 280.3661
4 5.375 60 276.8379 277.8969 279.0029 279.2772 279.7123 279.7477 280.3289
5 5.500 60 276.8142 277.9094 279.0033 279.2879 279.7257 279.7244 280.2909
6 5.625 60 276.7913 277.9224 279.0033 279.2987 279.7396 279.6993 280.2523
7 5.750 60 276.7707 277.9363 279.0020 279.3094 279.7531 279.6715 280.2142
8 5.875 60 276.7537 277.9520 279.0002 279.3202 279.7656 279.6406 280.1770
9 6.000 60 276.7416 277.9713 278.9980 279.3314 279.7773 279.6070 280.1407
10 6.125 60 276.7357 277.9946 278.9953 279.3435 279.7871 279.5707 280.1071
X1990.05.02.19.00.00 X1990.05.03.01.00.00 X1990.05.03.07.00.00 X1990.05.03.13.00.00 X1990.05.03.19.00.00 X1990.05.04.01.00.00 X1990.05.04.07.00.00
1 280.5674 280.3316 280.3796 280.2308 280.6216 280.6216 280.1842
2 280.5414 280.3106 280.3697 280.2133 280.6220 280.6368 280.2053
3 280.5145 280.2886 280.3594 280.1927 280.6184 280.6503 280.2227
4 280.4858 280.2653 280.3482 280.1703 280.6113 280.6619 280.2380
5 280.4562 280.2420 280.3379 280.1466 280.6010 280.6722 280.2492
6 280.4262 280.2192 280.3280 280.1219 280.5880 280.6816 280.2572
7 280.3957 280.1981 280.3209 280.0973 280.5732 280.6910 280.2613
8 280.3661 280.1793 280.3159 280.0748 280.5571 280.7009 280.2626
9 280.3384 280.1640 280.3155 280.0542 280.5414 280.7112 280.2599
10 280.3128 280.1542 280.3195 280.0385 280.5270
I'd like to compute the daily average over every 4 columns (as each day has 4 measurements). I was able to use the function below, but I need to keep x and y for each row.
### daily mean
byapply <- function(x, by, fun, ...)
{
  # Create index list
  if (length(by) == 1)
  {
    nc <- ncol(x)
    split.index <- rep(1:ceiling(nc / by), each = by, length.out = nc)
  } else # 'by' is a vector of groups
  {
    nc <- length(by)
    split.index <- by
  }
  index.list <- split(seq(from = 1, to = nc), split.index)
  # Pass index list to fun using sapply() and return object
  sapply(index.list, function(i)
  {
    do.call(fun, list(x[, i], ...))
  })
}
DM<- data.frame(byapply(tab[3:2800], 4, rowMeans))
> head(DM, 10)
X1 X2 X3 X4 X5
1 278.2488 280.1225 280.3909 279.4138 276.6809
2 278.2514 280.1049 280.3789 279.4395 276.7141
3 278.2529 280.0871 280.3648 279.4634 276.7437
4 278.2537 280.0687 280.3488 279.4858 276.7691
5 278.2537 280.0493 280.3319 279.5066 276.7909
6 278.2539 280.0294 280.3143 279.5264 276.8090
7 278.2546 280.0086 280.2974 279.5453 276.8244
8 278.2565 279.9873 280.2818 279.5639 276.8377
9 278.2605 279.9658 280.2688 279.5819 276.8495
10 278.2673 279.9444 280.2598 279.5998 276.8611
Then I can use cbind to link daily means with each x and y
lonlat<-tab[-(3:2800)]
DMxy<- data.frame(cbind(lonlat, DM))
But I am looking for a way to compute the daily average directly while keeping the first two columns (x and y) in the new data frame (without deleting x and y), to minimize any possible error in the cbind step.

Instead of
DM<- data.frame(byapply(tab[3:2800], 4, rowMeans))
try
DM2 <- cbind(byapply(tab[-(1:2)], 4, rowMeans), tab[1:2])
That will get you the desired result in a single step. Also, you minimize the chance of a mistake because you don't need to know the number of columns in your data frame; tab[-(1:2)] means "every column in tab except the first two". (Note that with this call x and y end up as the last two columns; swap the two cbind() arguments if you want them first.)
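If you also want the daily-mean columns labelled by date, a small extension is possible. This is a sketch, not part of the answer above; it assumes the time columns follow the X1990.05.01.01.00.00 naming pattern shown in the question, with exactly four 6-hourly columns per day in order.
# Put x and y first, then the daily means, and name the mean columns by date.
DM2 <- cbind(tab[1:2], byapply(tab[-(1:2)], 4, rowMeans))
day_labels <- unique(format(as.Date(sub("^X", "", names(tab)[-(1:2)]),
                                    format = "%Y.%m.%d"), "%Y-%m-%d"))
names(DM2)[-(1:2)] <- day_labels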

This is a classic textbook case for not storing data in wide format when you need grouped operations such as aggregation, specifically averaging. Consider melting your data into long format and aggregating by day for each x and y grouping:
DATA (the OP's posted example, with the missing last two values of row 10 filled in)
txt= ' x y X1990.05.01.01.00.00 X1990.05.01.07.00.00 X1990.05.01.13.00.00 X1990.05.01.19.00.00 X1990.05.02.01.00.00 X1990.05.02.07.00.00 X1990.05.02.13.00.00 X1990.05.02.19.00.00 X1990.05.03.01.00.00 X1990.05.03.07.00.00 X1990.05.03.13.00.00 X1990.05.03.19.00.00 X1990.05.04.01.00.00 X1990.05.04.07.00.00
1 5.000 60 276.9105 277.8516 278.9908 279.2422 279.6751 279.8078 280.4396 280.5674 280.3316 280.3796 280.2308 280.6216 280.6216 280.1842
2 5.125 60 276.8863 277.8682 278.9966 279.2543 279.6863 279.7885 280.4033 280.5414 280.3106 280.3697 280.2133 280.6220 280.6368 280.2053
3 5.250 60 276.8621 277.8830 279.0006 279.2659 279.6989 279.7688 280.3661 280.5145 280.2886 280.3594 280.1927 280.6184 280.6503 280.2227
4 5.375 60 276.8379 277.8969 279.0029 279.2772 279.7123 279.7477 280.3289 280.4858 280.2653 280.3482 280.1703 280.6113 280.6619 280.2380
5 5.500 60 276.8142 277.9094 279.0033 279.2879 279.7257 279.7244 280.2909 280.4562 280.2420 280.3379 280.1466 280.6010 280.6722 280.2492
6 5.625 60 276.7913 277.9224 279.0033 279.2987 279.7396 279.6993 280.2523 280.4262 280.2192 280.3280 280.1219 280.5880 280.6816 280.2572
7 5.750 60 276.7707 277.9363 279.0020 279.3094 279.7531 279.6715 280.2142 280.3957 280.1981 280.3209 280.0973 280.5732 280.6910 280.2613
8 5.875 60 276.7537 277.9520 279.0002 279.3202 279.7656 279.6406 280.1770 280.3661 280.1793 280.3159 280.0748 280.5571 280.7009 280.2626
9 6.000 60 276.7416 277.9713 278.9980 279.3314 279.7773 279.6070 280.1407 280.3384 280.1640 280.3155 280.0542 280.5414 280.7112 280.2599
10 6.125 60 276.7357 277.9946 278.9953 279.3435 279.7871 279.5707 280.1071 280.3128 280.1542 280.3195 280.0385 280.5270 280.6581 280.3139'
df <- read.table(text=txt, header=TRUE)
CODE
library(reshape2)
mdf <- melt(df, id.vars = c('x', 'y'), variable.name = "day")
mdf$day <- gsub("X", "", mdf$day)
mdf$datetime <- as.POSIXct(mdf$day, format="%Y.%m.%d.%H.%M.%S")
mdf$day <- format(mdf$datetime, "%Y-%m-%d")
head(mdf)
# x y day value datetime
# 1 5.000 60 1990-05-01 276.9105 1990-05-01 01:00:00
# 2 5.125 60 1990-05-01 276.8863 1990-05-01 01:00:00
# 3 5.250 60 1990-05-01 276.8621 1990-05-01 01:00:00
# 4 5.375 60 1990-05-01 276.8379 1990-05-01 01:00:00
# 5 5.500 60 1990-05-01 276.8142 1990-05-01 01:00:00
# 6 5.625 60 1990-05-01 276.7913 1990-05-01 01:00:00
aggdf <- aggregate(value ~ x + y + day, mdf, FUN=mean)
aggdf <- with(aggdf, aggdf[order(x,y),]) # RE-ORDER BY X
row.names(aggdf) <- NULL # RESET ROW NAMES
head(aggdf, 12)
# x y day value
# 1 5.000 60 1990-05-01 278.2488
# 2 5.000 60 1990-05-02 280.1225
# 3 5.000 60 1990-05-03 280.3909
# 4 5.000 60 1990-05-04 280.4029
# 5 5.125 60 1990-05-01 278.2514
# 6 5.125 60 1990-05-02 280.1049
# 7 5.125 60 1990-05-03 280.3789
# 8 5.125 60 1990-05-04 280.4211
# 9 5.250 60 1990-05-01 278.2529
# 10 5.250 60 1990-05-02 280.0871
# 11 5.250 60 1990-05-03 280.3648
# 12 5.250 60 1990-05-04 280.4365
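For completeness, the same long-format aggregation can be written with tidyr and dplyr instead of reshape2. This is just an alternative sketch of the approach above, not part of the original answer.
library(dplyr)
library(tidyr)

df %>%
  # one row per x/y/timestamp
  pivot_longer(-c(x, y), names_to = "day", values_to = "value") %>%
  # strip the leading "X", parse the timestamp, and keep only the date part
  mutate(day = format(as.POSIXct(sub("^X", "", day),
                                 format = "%Y.%m.%d.%H.%M.%S"), "%Y-%m-%d")) %>%
  group_by(x, y, day) %>%
  summarise(value = mean(value), .groups = "drop")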

Related

R: Counting rows in a dataframe in which all values fall within individual ranges

I have a dataframe (A rows x K columns). For each column, I get the 5th and 95th percentile value. I want to know how many rows in the df have all K of their values within these K sets of 5th and 95th percentile values.
The example code below works: it removes rows that do not fall within the bounds (1 and 9 here, but in practice these will be percentiles) and then counts what remains. But A will be 10K and K will be 40, and I am simulating this dataframe 10K times, so I am wondering if there is code that will run faster.
data <- rbind(c(1,2,3,4,5), c(3,5,7,8,5), c(2,8,9,5,9), c(9,1,1,8,9),
              c(3,5,6,7,5))
Lower_Bound <- rbind(1,1,1,1,1)
Upper_Bound <- rbind(9,9,9,9,9)
for (i in c(1:5)) {
  data <- data[data[,i] > Lower_Bound[i,],]
  data <- data[data[,i] < Upper_Bound[i,],]
}
N <- nrow(data)
If I understand correctly, the OP is only interested in the number of rows which fulfill the condition. So, there is no need to actually remove rows from data that do not fall within the bounds. It is sufficient to count the number of rows which do fall within the bounds.
This answer contains solutions for
matrices
data.frames
and a benchmark which compares
OP's approach,
apply() with matrices and data.frames,
an approach using purrr's map() and reduce() functions.
apply() with matrices
Let's start with the provided sample data and the fixed Lower_Bound and Upper_Bound. Please note that all three objects are matrices created by rbind(). This is in contrast to the text of the question, which refers to a dataframe (A rows x K columns). Anyhow, we will provide solutions for both cases.
apply(data, 1, function(x) all(x > Lower_Bound & x < Upper_Bound))
returns a vector of type logical
[1] FALSE TRUE FALSE FALSE TRUE
The number of rows which fulfill the condition can be derived by
N <- sum(apply(data, 1, function(x) all(x > Lower_Bound & x < Upper_Bound)))
N
[1] 2
because TRUE is coerced to 1L and FALSE to 0L.
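A fully vectorized alternative (a sketch, not part of the original answer or the benchmark below) avoids apply() over rows altogether: compare the whole matrix against the per-column bounds with sweep() and count the rows where every column passes.
# Logical matrix: TRUE where the value lies strictly inside its column's bounds.
ok <- sweep(data, 2, Lower_Bound[, 1], ">") & sweep(data, 2, Upper_Bound[, 1], "<")
# A row qualifies if all of its columns pass.
N  <- sum(rowSums(ok) == ncol(data))
N
# [1] 2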
The next step is to also compute the bounds for each column as the 5th and 95th percentiles. For this, we create a new sample dataset mat, again as a matrix.
# create sample data
n_col <- 5
n_row <- 10
set.seed(42) # required for reproducible results
mat <- sapply(1:n_col, function(x) rnorm(n_row, mean = x))
mat
[,1] [,2] [,3] [,4] [,5]
[1,] 2.3709584 3.3048697 2.693361 4.455450 5.205999
[2,] 0.4353018 4.2866454 1.218692 4.704837 4.638943
[3,] 1.3631284 0.6111393 2.828083 5.035104 5.758163
[4,] 1.6328626 1.7212112 4.214675 3.391074 4.273295
[5,] 1.4042683 1.8666787 4.895193 4.504955 3.631719
[6,] 0.8938755 2.6359504 2.569531 2.282991 5.432818
[7,] 2.5115220 1.7157471 2.742731 3.215541 4.188607
[8,] 0.9053410 -0.6564554 1.236837 3.149092 6.444101
[9,] 3.0184237 -0.4404669 3.460097 1.585792 4.568554
[10,] 0.9372859 3.3201133 2.360005 4.036123 5.655648
For demonstration, each column has a different mean.
# count number of rows
probs <- c(0.05, 0.95)
bounds <- apply(mat, 2, quantile, probs)
idx <- apply(mat, 1, function(x) all(x > bounds[1, ] & x < bounds[2, ]))
N <- sum(idx)
N
[1] 5
If required, the subset of mat which fulfills the condition can be derived by
mat[idx, ]
[,1] [,2] [,3] [,4] [,5]
[1,] 2.3709584 3.304870 2.693361 4.455450 5.205999
[2,] 1.6328626 1.721211 4.214675 3.391074 4.273295
[3,] 0.8938755 2.635950 2.569531 2.282991 5.432818
[4,] 2.5115220 1.715747 2.742731 3.215541 4.188607
[5,] 0.9372859 3.320113 2.360005 4.036123 5.655648
The bounds are
bounds
[,1] [,2] [,3] [,4] [,5]
5% 0.641660 -0.5592606 1.226857 1.899532 3.882318
95% 2.790318 3.8517060 4.588960 4.886484 6.135429
apply() with data.frames
In case the dataset is a data.frame we can use the same code, i.e.,
df <- as.data.frame(mat)
probs <- c(0.05, 0.95)
bounds <- apply(df, 2, quantile, probs)
idx <- apply(df, 1, function(x) all(x > bounds[1, ] & x < bounds[2, ]))
N <- sum(idx)
Benchmark
The OP is looking for code which is faster than OP's own approach because the OP wants to replicate the simulation 10000 times.
So, here is a benchmark which compares
OP1: OP's own approach using matrices
OP2: a slightly modified version of OP1
apply_mat: the apply() function with matrices
apply_df: the apply() function with data.frames
purrr: using map(), pmap(), and reduce() from the purrr package
(Note that the list of methods is not exhaustive)
The benchmark is repeated for varying problem sizes, i.e., 5, 10, and 40 columns as well as 100, 1000, and 10000 rows. The largest problem size corresponds to the size of the OP's simulations. As some of the code modifies the input dataset, all runs start with a fresh copy of the input data.
library(bench)
library(purrr)
library(ggplot2)
bm <- press(
n_col = c(5L, 10L, 40L)
, n_row = 10L^(2:4)
, {
set.seed(42)
mat0 <- sapply(1:n_col, function(x) rnorm(n_row, mean = x))
df0 <- as.data.frame(mat0)
mark(
OP1 = {
data <- data.table::copy(mat0)
Lower_Bound <- as.matrix(apply(data, 2, quantile, probs = 0.05), ncol = 1L)
Upper_Bound <- as.matrix(apply(data, 2, quantile, probs = 0.95), ncol = 1L)
for (i in seq_len(ncol(data))) {
data <- data[data[, i] > Lower_Bound[i, ], ]
data <- data[data[, i] < Upper_Bound[i, ], ]
}
nrow(data)
},
OP2 = {
data <- data.table::copy(mat0)
Lower_Bound <- as.matrix(apply(data, 2, quantile, probs = 0.05), ncol = 1L)
Upper_Bound <- as.matrix(apply(data, 2, quantile, probs = 0.95), ncol = 1L)
for (i in seq_len(ncol(data))) {
data <- data[data[, i] > Lower_Bound[i, ] & data[, i] < Upper_Bound[i, ], ]
}
nrow(data)
},
apply_mat = {
mat <- data.table::copy(mat0)
probs <- c(0.05, 0.95)
bounds <- apply(mat, 2, quantile, probs)
idx <- apply(mat, 1, function(x) all(x > bounds[1, ] & x < bounds[2, ]))
sum(idx)
},
apply_df = {
df <- data.table::copy(df0)
probs <- c(0.05, 0.95)
bounds <- apply(df, 2, quantile, probs)
idx <- apply(df, 1, function(x) all(x > bounds[1, ] & x < bounds[2, ]))
sum(idx)
},
purrr = {
data.table::copy(df0) %>%
map2(map_dfc(., quantile, probs), ~ (.x > .y[1L] & .x < .y[2L])) %>%
pmap(all) %>%
reduce(`+`)
}
)
}
)
autoplot(bm)
Note the logarithmic time scale
print(bm[, 1:11], n = Inf)
# A tibble: 45 x 11
expression n_col n_row min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <int> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 OP1 5 100 1.46ms 1.93ms 493. 88.44KB 0 248 0 503ms
2 OP2 5 100 1.34ms 1.78ms 534. 71.56KB 0 267 0 500ms
3 apply_mat 5 100 1.16ms 1.42ms 621. 26.66KB 2.17 286 1 461ms
4 apply_df 5 100 1.41ms 1.8ms 526. 34.75KB 0 263 0 500ms
5 purrr 5 100 2.34ms 2.6ms 374. 17.86KB 0 187 0 500ms
6 OP1 10 100 2.42ms 2.78ms 344. 205.03KB 0 172 0 500ms
7 OP2 10 100 2.37ms 2.71ms 354. 153.38KB 2.07 171 1 484ms
8 apply_mat 10 100 1.76ms 2.12ms 457. 51.64KB 0 229 0 501ms
9 apply_df 10 100 2.31ms 2.63ms 367. 67.78KB 0 184 0 501ms
10 purrr 10 100 3.44ms 4.1ms 222. 34.89KB 2.09 106 1 477ms
11 OP1 40 100 9.4ms 10.57ms 92.9 955.41KB 0 47 0 506ms
12 OP2 40 100 9.18ms 10.08ms 96.8 638.92KB 0 49 0 506ms
13 apply_mat 40 100 5.44ms 6.46ms 146. 429.95KB 2.12 69 1 472ms
14 apply_df 40 100 6.12ms 6.75ms 141. 608.66KB 0 71 0 503ms
15 purrr 40 100 10.43ms 11.8ms 84.9 149.53KB 0 43 0 507ms
16 OP1 5 1000 1.75ms 1.94ms 478. 837.55KB 2.10 228 1 477ms
17 OP2 5 1000 1.69ms 1.94ms 487. 674.36KB 0 244 0 501ms
18 apply_mat 5 1000 4.84ms 5.62ms 176. 255.17KB 0 89 0 506ms
19 apply_df 5 1000 6.37ms 7.66ms 122. 333.58KB 0 62 0 506ms
20 purrr 5 1000 9.86ms 11.22ms 87.7 165.52KB 2.14 41 1 467ms
21 OP1 10 1000 3.35ms 3.91ms 253. 1.89MB 0 127 0 503ms
22 OP2 10 1000 3.33ms 3.72ms 256. 1.41MB 2.06 124 1 484ms
23 apply_mat 10 1000 5.86ms 6.93ms 142. 491.09KB 0 72 0 508ms
24 apply_df 10 1000 7.74ms 10.08ms 99.2 647.86KB 0 50 0 504ms
25 purrr 10 1000 14.55ms 15.44ms 62.5 323.17KB 2.23 28 1 448ms
26 OP1 40 1000 13.8ms 16.28ms 58.8 8.68MB 2.18 27 1 459ms
27 OP2 40 1000 13.29ms 14.72ms 67.9 5.84MB 0 34 0 501ms
28 apply_mat 40 1000 12.17ms 13.85ms 68.5 4.1MB 2.14 32 1 467ms
29 apply_df 40 1000 14.61ms 15.86ms 62.9 5.78MB 0 32 0 509ms
30 purrr 40 1000 41.85ms 43.66ms 22.7 1.25MB 0 12 0 529ms
31 OP1 5 10000 5.57ms 6.55ms 147. 8.15MB 2.07 71 1 482ms
32 OP2 5 10000 5.38ms 6.27ms 157. 6.55MB 2.06 76 1 485ms
33 apply_mat 5 10000 43.98ms 46.9ms 20.7 2.48MB 0 11 0 532ms
34 apply_df 5 10000 53.59ms 56.53ms 17.8 3.24MB 3.57 5 1 280ms
35 purrr 5 10000 86.32ms 88.83ms 11.1 1.6MB 0 6 0 540ms
36 OP1 10 10000 12.03ms 13.63ms 72.3 18.97MB 2.07 35 1 484ms
37 OP2 10 10000 11.66ms 12.97ms 76.5 14.07MB 4.25 36 2 471ms
38 apply_mat 10 10000 50.31ms 51.77ms 18.5 4.77MB 0 10 0 541ms
39 apply_df 10 10000 62.09ms 65.17ms 15.1 6.3MB 0 8 0 528ms
40 purrr 10 10000 125.82ms 128.3ms 7.35 3.13MB 2.45 3 1 408ms
41 OP1 40 10000 53.38ms 56.34ms 16.2 87.79MB 5.41 6 2 369ms
42 OP2 40 10000 46.24ms 47.43ms 20.3 58.82MB 2.25 9 1 444ms
43 apply_mat 40 10000 78.25ms 83.79ms 11.4 40.94MB 2.85 4 1 351ms
44 apply_df 40 10000 95.66ms 97.02ms 10.3 57.58MB 2.06 5 1 486ms
45 purrr 40 10000 361.26ms 373.23ms 2.68 12.31MB 0 2 0 746ms
Conclusions
To my surprise, the OP's approach performs quite well despite the repeated copy operations. In fact, for the OP's problem size of 10000 rows and 40 columns the modified version OP2 is nearly two times faster than apply_mat.
A possible explanation (which needs to be verified, though) is that the OP's approach is somewhat recursive, in that the number of rows to be checked is reduced while iterating over the columns.
Interestingly, the purrr variant has the lowest memory requirements.
Taking the median run time of about 50 ms for the OP2 method from this benchmark, 10000 repetitions of the simulation may take less than 10 minutes.

time between max and min of cycles

I have a series of 60,000 data points; part of the data looks like figure 1 (the whole curve is not as nice and uniform as this image; some other parts of the data look like the second image), and there are many cycles with different periods in my data.
I need to calculate the time spans of the red, green and purple rectangles for each cycle (the time between each maximum and minimum, and the total time of each cycle).
Can you give me some ideas on how to do it in R ... is there any special command or package that I can use?
The premise is that the mean value of the data range is used to split the data into peak and non-peak categories. Then a running id is generated to group each set of data so an appropriate min or max value can be determined. half_cycle provides the red and green boxes, while full_cycle provides the purple box for max-to-max and min-to-min. There is likely room for improvement, but it gives a method that can be adjusted as needed.
This sample uses random data since no sample data was provided.
set.seed(7)
wave <- c(seq(20, 50, 10), seq(50, 60, 0.5), seq(50, 20, -10))
df1 <- data.frame(time = seq_len(length(wave) * 5),
                  data = as.vector(replicate(5, wave + rnorm(length(wave), sd = 5))))
library(dplyr)
df1 %>%
mutate(peak = data > mean(range(df1$data))) %>%
mutate(run = cumsum(peak != lag(peak, default = TRUE))) %>%
group_by(run) %>%
mutate(max = max(data), min = min(data)) %>%
filter((peak == TRUE & data == max) | (peak == FALSE & data == min)) %>%
mutate(max = if_else(data == max, max, NULL), min = if_else(data == min, min , NULL)) %>%
ungroup() %>%
mutate(half_cycle = time - lag(time), full_cycle = time - lag(time, n = 2L))
# A tibble: 11 x 8
time data peak run max min half_cycle full_cycle
<int> <dbl> <lgl> <int> <dbl> <dbl> <int> <int>
1 2 24.0 FALSE 1 NA 24.0 NA NA
2 12 67.1 TRUE 2 67.1 NA 10 NA
3 29 15.1 FALSE 3 NA 15.1 17 27
4 54 68.5 TRUE 4 68.5 NA 25 42
5 59 20.8 FALSE 5 NA 20.8 5 30
6 80 70.6 TRUE 6 70.6 NA 21 26
7 87 18.3 FALSE 7 NA 18.3 7 28
8 108 63.1 TRUE 8 63.1 NA 21 28
9 117 13.8 FALSE 9 NA 13.8 9 30
10 140 64.5 TRUE 10 64.5 NA 23 32
11 145 22.4 FALSE 11 NA 22.4 5 28
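As a possible follow-up (a sketch; `res` is a hypothetical name for the tibble above, which would need to be assigned first), the average half-cycle and full-cycle durations can then be summarized directly:
# Mean durations across all detected cycles, ignoring the leading NAs.
res %>%
  summarise(mean_half_cycle = mean(half_cycle, na.rm = TRUE),
            mean_full_cycle = mean(full_cycle, na.rm = TRUE))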

Outputting percentiles by filtering a data frame

Note that, as requested in the comments, this question has been revised.
Consider the following example:
df <- data.frame(FILTER = rep(1:10, each = 10), VALUE = 1:100)
I would like to, for each value of FILTER, create a data frame which contains the 1st, 2nd, ..., 99th percentiles of VALUE. The final product should be
PERCENTILE df_1 df_2 ... df_10
1 [first percentiles]
2 [second percentiles]
etc., where df_i is based on FILTER == i.
Note that FILTER, although it contains numbers, is actually categorical.
The way I have been doing this is by using dplyr:
nums <- 1:10
library(dplyr)
for (i in nums){
df_temp <- filter(df, FILTER == i)$VALUE
assign(paste0("df_", i), quantile(df_temp, probs = (1:99)/100))
}
and then I would have to cbind these (with 1:99 in the first column), but I would rather not type in every single df name. I have considered using a loop on the names of these data frames, but this would involve using eval(parse()).
Here's a basic outline of a possibly smoother approach. I have not included every single aspect of your desired output, but the modification should be fairly straightforward.
df <- data.frame(FILTER = rep(1:10, each = 10), VALUE = 1:100)
df_s <- lapply(split(df,df$FILTER),
FUN = function(x) quantile(x$VALUE,probs = c(0.25,0.5,0.75)))
out <- do.call(cbind,df_s)
colnames(out) <- paste0("df_",colnames(out))
> out
df_1 df_2 df_3 df_4 df_5 df_6 df_7 df_8 df_9 df_10
25% 3.25 13.25 23.25 33.25 43.25 53.25 63.25 73.25 83.25 93.25
50% 5.50 15.50 25.50 35.50 45.50 55.50 65.50 75.50 85.50 95.50
75% 7.75 17.75 27.75 37.75 47.75 57.75 67.75 77.75 87.75 97.75
I did this for just 3 quantiles to keep things simple, but it obviously extends. And you can add the 1:99 column afterwards as well.
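Extending that to the full 1st through 99th percentiles, with the PERCENTILE column in front, is direct. A sketch following the same split/lapply pattern:
# One quantile vector per FILTER level, then bind into columns.
df_s <- lapply(split(df, df$FILTER),
               FUN = function(x) quantile(x$VALUE, probs = (1:99)/100))
out <- data.frame(PERCENTILE = 1:99, do.call(cbind, df_s))
colnames(out)[-1] <- paste0("df_", names(df_s))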
I suggest that you use a list.
list_of_dfs <- list()
nums <- 1:10
for (i in nums){
list_of_dfs[[i]] <- nums*i
}
df <- data.frame(list_of_dfs[[1]])
df <- do.call("cbind",args=list(df,list_of_dfs))
colnames(df) <- paste0("df_",1:10)
You'll get the result you want:
df_1 df_2 df_3 df_4 df_5 df_6 df_7 df_8 df_9 df_10
1 1 2 3 4 5 6 7 8 9 10
2 2 4 6 8 10 12 14 16 18 20
3 3 6 9 12 15 18 21 24 27 30
4 4 8 12 16 20 24 28 32 36 40
5 5 10 15 20 25 30 35 40 45 50
6 6 12 18 24 30 36 42 48 54 60
7 7 14 21 28 35 42 49 56 63 70
8 8 16 24 32 40 48 56 64 72 80
9 9 18 27 36 45 54 63 72 81 90
10 10 20 30 40 50 60 70 80 90 100
How about using get?
df <- data.frame(1:10)
for (i in nums) {
df <- cbind(df, get(paste0("df_", i)))
}
# get rid of first useless column
df <- df[, -1]
# get names
names(df) <- paste0("df_", nums)
df
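A related option (a sketch, assuming df_1 through df_10 are the 99-element quantile vectors already created in the workspace by the OP's loop) is mget(), which fetches all of them in one call rather than cbind-ing inside a loop:
# mget() returns a named list; data.frame() splices it into columns.
df <- data.frame(PERCENTILE = 1:99, mget(paste0("df_", nums)))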

Automate regression by rows

I have a data.frame
set.seed(100)
exp <- data.frame(exp = c(rep(LETTERS[1:2], each = 10)), re = c(rep(seq(1, 10, 1), 2)),
                  age1 = seq(10, 29, 1), age2 = seq(30, 49, 1),
                  h = c(runif(20, 10, 40)), h2 = c(40 + runif(20, 4, 9)))
I'd like to fit an lm for each row in the data set (h and h2 ~ age1 and age2).
I do it with a loop:
exp$modelh <- 0
for (i in 1:length(exp$exp)){
age = c(exp$age1[i], exp$age2[i])
h = c(exp$h[i], exp$h2[i])
model = lm(age ~ h)
exp$modelh[i] = coef(model)[1] + 100 * coef(model)[2]
}
It works well but takes some time with very large files. I would be grateful for a faster solution, e.g. with dplyr.
Using dplyr, we can try rowwise() and do(). Inside the do(), we concatenate (c) 'age1' and 'age2' to create 'age'; likewise, we create 'h', apply lm, and extract the coef to create the column 'modelh'.
library(dplyr)
exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )
gives the output
# exp re age1 age2 h h2 modelh
#1 A 1 10 30 19.23298 46.67906 68.85506
#2 A 2 11 31 17.73018 47.55402 66.17050
#3 A 3 12 32 26.56967 46.69174 84.98486
#4 A 4 13 33 11.69149 47.74486 61.98766
#5 A 5 14 34 24.05648 46.10051 82.90167
#6 A 6 15 35 24.51312 44.85710 89.21053
#7 A 7 16 36 34.37208 47.85151 113.37492
#8 A 8 17 37 21.10962 48.40977 74.79483
#9 A 9 18 38 26.39676 46.74548 90.34187
#10 A 10 19 39 15.10786 45.38862 75.07002
#11 B 1 20 40 28.74989 46.44153 100.54666
#12 B 2 21 41 36.46497 48.64253 125.34773
#13 B 3 22 42 18.41062 45.74346 81.70062
#14 B 4 23 43 21.95464 48.77079 81.20773
#15 B 5 24 44 32.87653 47.47637 115.95097
#16 B 6 25 45 30.07065 48.44727 101.10688
#17 B 7 26 46 16.13836 44.90204 84.31080
#18 B 8 27 47 20.72575 47.14695 87.00805
#19 B 9 28 48 20.78425 48.94782 84.25406
#20 B 10 29 49 30.70872 44.65144 128.39415
We could do this with the devel version of data.table i.e. v1.9.5. Instructions to install the devel version are here.
We convert the 'data.frame' to 'data.table' (setDT), create a column 'rn' with the option keep.rownames=TRUE. We melt the dataset by specifying the patterns in the measure to convert from 'wide' to 'long' format. Grouped by 'rn', we do the lm and get the coef. This can be assigned as a new column in the original dataset ('exp') while removing the unwanted 'rn' column by assigning (:=) it to NULL.
library(data.table)#v1.9.5+
modelh <- melt(setDT(exp, keep.rownames=TRUE), measure=patterns('^age', '^h'),
value.name=c('age', 'h'))[, {model <- lm(age ~h)
coef(model)[1] + 100 * coef(model)[2]},rn]$V1
exp[, modelh:= modelh][, rn := NULL]
exp
# exp re age1 age2 h h2 modelh
# 1: A 1 10 30 19.23298 46.67906 68.85506
# 2: A 2 11 31 17.73018 47.55402 66.17050
# 3: A 3 12 32 26.56967 46.69174 84.98486
# 4: A 4 13 33 11.69149 47.74486 61.98766
# 5: A 5 14 34 24.05648 46.10051 82.90167
# 6: A 6 15 35 24.51312 44.85710 89.21053
# 7: A 7 16 36 34.37208 47.85151 113.37492
# 8: A 8 17 37 21.10962 48.40977 74.79483
# 9: A 9 18 38 26.39676 46.74548 90.34187
#10: A 10 19 39 15.10786 45.38862 75.07002
#11: B 1 20 40 28.74989 46.44153 100.54666
#12: B 2 21 41 36.46497 48.64253 125.34773
#13: B 3 22 42 18.41062 45.74346 81.70062
#14: B 4 23 43 21.95464 48.77079 81.20773
#15: B 5 24 44 32.87653 47.47637 115.95097
#16: B 6 25 45 30.07065 48.44727 101.10688
#17: B 7 26 46 16.13836 44.90204 84.31080
#18: B 8 27 47 20.72575 47.14695 87.00805
#19: B 9 28 48 20.78425 48.94782 84.25406
#20: B 10 29 49 30.70872 44.65144 128.39415
Great (double) answer from @akrun.
Just a suggestion for your future analysis, as you mentioned "it's an example of a bigger problem". Obviously, if you are really interested in building models rowwise, you'll create more and more columns as your age and h observations increase. If you get N observations you'll have to use 2xN columns for those two variables alone.
I'd suggest using a long data format in order to increase your rows instead of your columns.
Something like:
exp[1,] # how your first row (model building info) looks like
# exp re age1 age2 h h2
# 1 A 1 10 30 19.23298 46.67906
reshape(exp[1,], # how your model building info is transformed
varying = list(c("age1","age2"),
c("h","h2")),
v.names = c("age_value","h_value"),
direction = "long")
# exp re time age_value h_value id
# 1.1 A 1 1 10 19.23298 1
# 1.2 A 1 2 30 46.67906 1
Apologies if the "bigger problem" refers to something else and this answer is irrelevant.
With base R, the function sprintf can help us create formulas. And lapply carries out the calculation.
strings <- sprintf("c(%f,%f) ~ c(%f,%f)", exp$age1, exp$age2, exp$h, exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
exp$modelh <- unlist(lst)
exp
# exp re age1 age2 h h2 modelh
# 1 A 1 10 30 19.23298 46.67906 68.85506
# 2 A 2 11 31 17.73018 47.55402 66.17050
# 3 A 3 12 32 26.56967 46.69174 84.98486
# 4 A 4 13 33 11.69149 47.74486 61.98766
# 5 A 5 14 34 24.05648 46.10051 82.90167
# 6 A 6 15 35 24.51312 44.85710 89.21053
# 7 A 7 16 36 34.37208 47.85151 113.37493
# 8 A 8 17 37 21.10962 48.40977 74.79483
# 9 A 9 18 38 26.39676 46.74548 90.34187
# 10 A 10 19 39 15.10786 45.38862 75.07002
# 11 B 1 20 40 28.74989 46.44153 100.54666
# 12 B 2 21 41 36.46497 48.64253 125.34773
# 13 B 3 22 42 18.41062 45.74346 81.70062
# 14 B 4 23 43 21.95464 48.77079 81.20773
# 15 B 5 24 44 32.87653 47.47637 115.95097
# 16 B 6 25 45 30.07065 48.44727 101.10688
# 17 B 7 26 46 16.13836 44.90204 84.31080
# 18 B 8 27 47 20.72575 47.14695 87.00805
# 19 B 9 28 48 20.78425 48.94782 84.25406
# 20 B 10 29 49 30.70872 44.65144 128.39416
In the lapply function the expression as.formula(x) is what converts the formulas created in the first line into a format usable by the lm function.
Benchmark
library(dplyr)
library(microbenchmark)
set.seed(100)
big.exp <- data.frame(age1=sample(30, 1e4, T),
age2=sample(30:50, 1e4, T),
h=runif(1e4, 10, 40),
h2= 40 + runif(1e4,4,9))
microbenchmark(
plafort = {strings <- sprintf("c(%f,%f) ~ c(%f,%f)", big.exp$age1, big.exp$age2, big.exp$h, big.exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
big.exp$modelh <- unlist(lst)},
akdplyr = {big.exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )}
,times=5)
Unit: seconds
expr min lq mean median uq max neval cld
plafort 13.00605 13.41113 13.92165 13.56927 14.53814 15.08366 5 a
akdplyr 26.95064 27.64240 29.40892 27.86258 31.02955 33.55940 5 b
(Note: I downloaded the newest 1.9.5 devel version of data.table today, but continued to receive errors when trying to test it.
The results also differ fractionally (1.93 x 10^-8). Rounding likely accounts for the difference.)
all.equal(pl, ak)
[1] "Attributes: < Component “class”: Lengths (1, 3) differ (string compare on first 1) >"
[2] "Attributes: < Component “class”: 1 string mismatch >"
[3] "Component “modelh”: Mean relative difference: 1.933893e-08"
Conclusion
The lapply approach seems to perform well compared to dplyr with respect to speed, but its 5-digit rounding may be an issue. Improvements may be possible. Perhaps using apply after converting to a matrix would increase speed and efficiency.
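Since every row's model is a straight line through exactly two points, the coefficients can also be computed in closed form without calling lm() at all. This is a sketch, not one of the answers above, but it is fully vectorized over the rows:
# Line through (h, age1) and (h2, age2): per-row slope and intercept.
slope      <- (exp$age2 - exp$age1) / (exp$h2 - exp$h)
intercept  <- exp$age1 - slope * exp$h
exp$modelh <- intercept + 100 * slope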

I am trying to change the sign of a number to negative if it is below a specific target, and have it remain positive if it is above.

I put the section of code in bold that seems to be the problem. Here is the code:
## price impact analysis
rm(list=ls())
### import data from excel spreadsheets
chtr_trades <- read.csv("F:/FRE 6951 Mkt Micro Struc/CHTRTRADES.csv")
chtr_quotes <- read.csv("F:/FRE 6951 Mkt Micro Struc/CHTRQUOTES.csv")
## initialize bid ask
max_bid <- NULL
min_ask <- NULL
### cleans data
maxrm <- function(x) {
max(x, na.rm=TRUE)
}
minrm <- function(x) {
min(x, na.rm=TRUE)
}
## retrieve max bid and ask for each iteration
max_bid<- tapply(chtr_quotes[,4],chtr_quotes[,3], maxrm)
min_ask<- tapply(chtr_quotes[,5],chtr_quotes[,3], minrm)
time <- levels(chtr_quotes[,3])
## calculate previous second midpoint
midpoint <- (min_ask + max_bid)/2
askbidtime <- data.frame(midpoint,time,max_bid,min_ask)
row.names(askbidtime) <- seq(nrow(askbidtime))
askbidtime[,2] <- as.POSIXct(askbidtime[,2], format="%H:%M:%S")
ordered.askbidtime <- askbidtime[order(askbidtime$time),]
row.names(ordered.askbidtime) <- seq(nrow(ordered.askbidtime))
chtr_trades_revised <-chtr_trades[which(as.POSIXct(chtr_trades[,3],format="%H:%M:%S") %in% ordered.askbidtime[,2]),]
midpoint<-NULL
midpoint[1:5] <- NA
for(i in 6:3917) {
midpoint[i] <- as.numeric(ordered.askbidtime[which(ordered.askbidtime[,2]==as.POSIXct(chtr_trades_revised[i,3],format="%H:%M:%S"))-1,1])
}
## sign trades (this is the section that was marked in bold as the problem)
chtr_trades_revised$midpoint
chtr_trades_revised$midpoint <- midpoint
for(i in 6:3917) {
  if((!is.na(chtr_trades_revised$midpoint[i])) & (chtr_trades_revised$midpoint[i] > chtr_trades_revised$PRICE[i])) {
    chtr_trades_revised$signed_volume <- -chtr_trades_revised$SIZE
  }
  if((!is.na(chtr_trades_revised$midpoint[i])) & (chtr_trades_revised$midpoint[i] < chtr_trades_revised$PRICE[i])) {
    chtr_trades_revised$signed_volume <- chtr_trades_revised$SIZE
  }
}
Here are the results. In the last column, rows 4062 and 4054 should be positive, but the loop makes the entire column negative:
SYMBOL DATE TIME PRICE SIZE midpoint signed_volume
4060 CHTR 20130718 2014-08-26 15:59:44 124.46 100 124.485 -100
4061 CHTR 20130718 2014-08-26 15:59:52 124.46 100 124.495 -100
4062 CHTR 20130718 2014-08-26 15:59:55 124.52 100 124.490 -100
4063 CHTR 20130718 2014-08-26 15:59:58 124.53 100 124.410 -100
4064 CHTR 20130718 2014-08-26 16:00:00 124.57 7951 124.550 -7951
4065 CHTR 20130718 2014-08-26 16:00:00 124.53 100 124.550 -100
Here's a cute way:
foo<- 1:10
threshold <- 5
foo<- foo*(-1)^(foo < threshold)
foo
[1] -1 -2 -3 -4 5 6 7 8 9 10
Another method:
foo = 1:10 ; threshold = 5
foo
[1] 1 2 3 4 5 6 7 8 9 10
foo = ifelse(foo>=threshold, foo, -foo)
foo
[1] -1 -2 -3 -4 5 6 7 8 9 10
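Applied back to the question's data, the whole loop can be replaced by one vectorized assignment; the original loop overwrites the entire signed_volume column on every iteration because it assigns without the [i] index. A minimal sketch, assuming the column names shown in the OP's output:
# Positive size when the trade price is above the previous midpoint,
# negative when below, 0 when equal, NA when the midpoint is missing.
chtr_trades_revised$signed_volume <-
  chtr_trades_revised$SIZE * sign(chtr_trades_revised$PRICE - chtr_trades_revised$midpoint)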
