Getting output in combination form in r - r

I have 3 variables. Hence, I will get 6 combinations. I want to produce the 2nd column in conbination form. I got the folloWing result.
P<- matrix(c (
0.427, -0.193, 0.673,
-0.193, 0.094, -0.428,
0.673, -0.428, 224.099
), c( 3,3))
G <-matrix(c(
0.238, -0.033, 0.468,
-0.033, 0.084, -0.764,
0.468, -0.764, 205.144
), c(3,3))
A<-rep(1, each=nrow(P))
df<-do.call(rbind, lapply(1:ncol(P), function(x) {
do.call(rbind, combn(ncol(P), x, function(y) {
data.frame(comb = paste(y, collapse = ""),
B = ((solve(P)%*%G)%*%A),
stringsAsFactors = FALSE)
}, simplify = FALSE))
}))
>df
comb B
1 1 -19.7814149
2 1 -44.1515387
3 1 0.8891786
4 2 -19.7814149
5 2 -44.1515387
6 2 0.8891786
7 3 -19.7814149
8 3 -44.1515387
9 3 0.8891786
10 12 -19.7814149
11 12 -44.1515387
12 12 0.8891786
13 13 -19.7814149
14 13 -44.1515387
15 13 0.8891786
16 23 -19.7814149
17 23 -44.1515387
18 23 0.8891786
19 123 -19.7814149
20 123 -44.1515387
21 123 0.8891786
Here I got only 3 (-19.7814149,-44.1515387,0.8891786) values But I wanted 12 values.
comb B
1 0.5574
2 0.8936
3 0.9154
12 10.0772, 21.233
13 0.2083 , 0.9169
23 -3.1085, 0.9061
123 -19.7814, -44.1515, 0.8892
I can't manage this. Furthermore, I want to use these B values to calculate my desired result (GA), where my formula is
b<-t(B)
gain<-do.call(rbind, lapply(1:ncol(P), function(x) {
do.call(rbind, combn(ncol(P), x, function(y) {
data.frame(GA = abs(round(1.76*(sum(G[y,y] %*% B[y] * A[y]))/ sqrt((b[y] %*%P[y,y])%*%B[y]),2)),
stringsAsFactors = FALSE)
}, simplify = FALSE))
}))
My final output is
comb B GA
1 0.5574 0.641
2 0.8936 0.4822
3 0.9154 24.1186
12 10.0772, 21.233 3.123
13 0.2083 , 0.9169 24.1748
23 -3.1085, 0.9061 24.0867
123 -19.7814, -44.1515, 0.8892 24.9097
Is there any solution?

Related

Find the mid point where the sum of upper half and lower half of data.table is almost equal

I have a data.table in which I want to find the column ID at which the sum of all rows above and below that ID is equal or almost equal.
dt = x <- structure(list(value = c(7.496, 11.073, 11.329, 9.282, 8.748, 12.515, 7.46, 9.189, 9.62, 5.815, 5.945,
7.778, 10.077, 15.311, 8.591, 6.048, 7.568, 6.14, 6.591, 5.376,
8.038, 7.496, 7.983, 6.591, 6.591, 7.44, 6.453, 11.589, 5.751,
8.464, 7.577, 6.014, 12.733, 7.108, 14.857, 15.503, 12.468, 13.39,
10.796, 10.923, 7.215, 13.72, 7.574, 11.77, 10.409, 7.591, 6.174,
6.748, 10.091, 9.8, 6.527, 9.251, 6.622, 13.742, 4.454, 8.331,
7.702, 7.197, 9.629, 9.76, 3.663, 19.55, 8.107, 9.637, 10.146,
9.564, 6.947, 14.45, 10.266, 5.457, 10.629, 6.275, 2.48, 4.513,
6.755, 2.885, 5.773, 2.855, 2.429, 2.955, 2.486, 3.239, 4.29,
3.043, 3.501, 3.276, 4.018, 2.727, 5.199, 2.371, 3.732, 2.533,
4.482, 3.215, 7.782, 3.435, 4.201, 3.074, 3.475, 2.923, 3.025,
4.308, 3.932, 2.923, 3.491, 2.852, 3.916), ID = 1:107), row.names = c(NA,
-107L), class = "data.frame")
> dt
value ID
1 7.496 1
2 11.073 2
3 11.329 3
4 9.282 4
5 8.748 5
6 12.515 6
7 7.460 7
8 9.189 8
9 9.620 9
10 5.815 10
11 5.945 11
12 7.778 12
13 10.077 13
14 15.311 14
15 8.591 15
16 6.048 16
17 7.568 17
18 6.140 18
19 6.591 19
20 5.376 20
21 8.038 21
22 7.496 22
23 7.983 23
24 6.591 24
25 6.591 25
26 7.440 26
27 6.453 27
28 11.589 28
29 5.751 29
30 8.464 30
31 7.577 31
32 6.014 32
33 12.733 33
34 7.108 34
35 14.857 35
...
...
One of the solutions is to split the data.table into two half's and then manually check the split point
split(dt, cumsum(dt$value) <= sum(dt$value)/2)
However, I just need the ID where the sum of upper and lower values is almost equal. Is there a solution without using the split as using the split function is very inefficient on a large dataset?
As given in the example dt, the number of rows is even, so the expected result in this particular case should be 43.5.
With which.min:
which.min(abs(cumsum(dt$value) - (sum(dt$value) / 2)))
#[1] 43
In the case where IDs are different from row numbers, you can index:
dt$ID[which.min(abs(cumsum(dt$value) - (sum(dt$value) / 2)))]
#[1] 43

Basic use of approx() for lookup table with linear interpolation

I have a vector, my_points and a dataframe that describes the almost linear relationship between points and values.
How do I obtain the vector, my_values, from the relationship described in the dataframe and my_points using linear interpolation?
Assume the relationship beyond the last point in the data frame remains linear.
my_points <- c(4400, 8800, 13200, 37600, 42000, 46400, 50800, 55200, 59600,
64000, 68400, 72800, 77200, 81600, 86000, 90400, 94800, 99200,
103600, 108000, 112400, 116800, 121200, 125600)
df <- structure(list(points = c(3000, 4500, 7500, 11000, 14500, 21500,
43000, 71500), values = c(20, 30, 50, 75, 100, 150, 300, 500),
points_per_value = c(150, 150, 150, 146.666666666667, 145,
143.333333333333, 143.333333333333, 143)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
You said "interpolation", in which case you can get:
cbind(
data.frame(my_points),
lapply(df[-1], function(z) approx(df$points, z, xout = my_points)$y)
)
# my_points values points_per_value
# 1 4400 29.33333 150.0000
# 2 8800 59.28571 148.7619
# 3 13200 90.71429 145.6190
# 4 37600 262.32558 143.3333
# 5 42000 293.02326 143.3333
# 6 46400 323.85965 143.2936
# 7 50800 354.73684 143.2421
# 8 55200 385.61404 143.1906
# 9 59600 416.49123 143.1392
# 10 64000 447.36842 143.0877
# 11 68400 478.24561 143.0363
# 12 72800 NA NA
# 13 77200 NA NA
# 14 81600 NA NA
# 15 86000 NA NA
# 16 90400 NA NA
# 17 94800 NA NA
# 18 99200 NA NA
# 19 103600 NA NA
# 20 108000 NA NA
# 21 112400 NA NA
# 22 116800 NA NA
# 23 121200 NA NA
# 24 125600 NA NA
But you also said "beyond the last point", suggesting you want "extrapolation":
cbind(
data.frame(my_points), lapply(df[-1], function(z)
Hmisc::approxExtrap(df$points, z, xout = my_points)$y)
)
# my_points values points_per_value
# 1 4400 29.33333 150.0000
# 2 8800 59.28571 148.7619
# 3 13200 90.71429 145.6190
# 4 37600 262.32558 143.3333
# 5 42000 293.02326 143.3333
# 6 46400 323.85965 143.2936
# 7 50800 354.73684 143.2421
# 8 55200 385.61404 143.1906
# 9 59600 416.49123 143.1392
# 10 64000 447.36842 143.0877
# 11 68400 478.24561 143.0363
# 12 72800 509.12281 142.9848
# 13 77200 540.00000 142.9333
# 14 81600 570.87719 142.8819
# 15 86000 601.75439 142.8304
# 16 90400 632.63158 142.7789
# 17 94800 663.50877 142.7275
# 18 99200 694.38596 142.6760
# 19 103600 725.26316 142.6246
# 20 108000 756.14035 142.5731
# 21 112400 787.01754 142.5216
# 22 116800 817.89474 142.4702
# 23 121200 848.77193 142.4187
# 24 125600 879.64912 142.3673
If all you need is the vector of one of these columns, then
Hmisc::approxExtrap(df$points, df$my_values, xout = my_points)$y

Rolling prediction in a data frame using dplyr and rollapply

My first question here :)
My goal is: Given a data frame with predictors (each column a predictor / rows observations) fit a regression using lm and then predict the value using the last observation using a rolling window.
The data frame looks like:
> DfPredictor[1:40,]
Y X1 X2 X3 X4 X5
1 3.2860 192.5115 2.1275 83381 11.4360 8.7440
2 3.2650 190.1462 2.0050 88720 11.4359 8.8971
3 3.2213 192.9773 2.0500 74130 11.4623 8.8380
4 3.1991 193.7058 2.1050 73930 11.3366 8.7536
5 3.2224 193.5407 2.0275 80875 11.3534 8.7555
6 3.2000 190.6049 2.0950 86606 11.3290 8.8555
7 3.1939 191.1390 2.0975 91402 11.2960 8.8433
8 3.1971 192.2921 2.2700 88181 11.2930 8.8681
9 3.1873 194.9700 2.3300 115959 1.9477 8.5245
10 3.2182 194.5396 2.4200 134754 11.3200 8.4990
11 3.2409 194.5396 2.2025 136685 1.9649 8.4192
12 3.2112 195.1362 2.1900 136316 1.9750 8.3752
13 3.2231 193.3560 2.2475 140295 1.9691 8.3546
14 3.2015 192.9649 2.2575 139474 1.9500 8.3116
15 3.1744 194.0154 2.1900 146202 1.8476 8.2225
16 3.1646 194.4423 2.2650 142983 1.8600 8.1948
17 3.1708 194.9473 2.2425 141377 1.8522 8.2589
18 3.1675 193.9788 2.2400 141377 1.8600 8.2600
19 3.1744 194.2563 2.3000 149875 1.8718 8.2899
20 3.1410 193.4316 2.2300 129561 1.8480 8.2395
21 3.1266 191.2633 2.2550 122636 1.8440 8.2396
22 3.1486 192.0354 2.3600 130996 1.8570 8.8640
23 3.1282 194.3351 2.4825 92430 1.7849 8.1291
24 3.1214 193.5196 2.4750 94814 1.7624 8.1991
25 3.1230 193.2017 2.3725 87590 1.7660 8.2310
26 3.1182 192.1642 2.4475 87715 1.6955 8.2414
27 3.1203 191.3744 2.3775 89857 1.6539 8.2480
28 3.1156 192.2646 2.3725 92159 1.5976 8.1676
29 3.1270 192.7555 2.3675 97425 1.5896 8.1162
30 3.1154 194.0375 2.3725 87598 1.5277 8.2640
31 3.1104 192.0596 2.3850 93236 1.5132 7.9999
32 3.0846 192.2792 2.2900 94608 1.4990 8.1600
33 3.0569 193.2573 2.3050 84663 1.4715 8.2200
34 3.0893 192.7632 2.2550 67149 1.4955 7.9590
35 3.0991 192.1229 2.3050 75519 1.4280 7.9183
36 3.0879 192.1229 2.3100 76756 1.3839 7.9133
37 3.0965 192.0502 2.2175 61748 1.3130 7.8750
38 3.0655 191.2274 2.2300 41490 1.2823 7.8656
39 3.0636 191.6342 2.1925 51049 1.1492 7.7447
40 3.1097 190.9312 2.2150 21934 1.1626 7.6895
For instance using the rolling window with width = 10 the regression should be estimate and then predict the 'Y' correspondent to the X1,X2,...,X5.
The predictions should be included in a new column 'Ypred'.
There's some way to do that using rollapply + lm/predict + mudate??
Many thanks!!
Using the data in the Note at the end and assuming that in a window of width 10 we want to predict the last Y (i..e. the 10th), then:
library(zoo)
pred <- function(x) tail(fitted(lm(Y ~., as.data.frame(x))), 1)
transform(DF, pred = rollapplyr(DF, 10, pred, by.column = FALSE, fill = NA))
giving:
Y X1 X2 X3 X4 X5 pred
1 3.2860 192.5115 2.1275 83381 11.4360 8.7440 NA
2 3.2650 190.1462 2.0050 88720 11.4359 8.8971 NA
3 3.2213 192.9773 2.0500 74130 11.4623 8.8380 NA
4 3.1991 193.7058 2.1050 73930 11.3366 8.7536 NA
5 3.2224 193.5407 2.0275 80875 11.3534 8.7555 NA
6 3.2000 190.6049 2.0950 86606 11.3290 8.8555 NA
7 3.1939 191.1390 2.0975 91402 11.2960 8.8433 NA
8 3.1971 192.2921 2.2700 88181 11.2930 8.8681 NA
9 3.1873 194.9700 2.3300 115959 1.9477 8.5245 NA
10 3.2182 194.5396 2.4200 134754 11.3200 8.4990 3.219764
11 3.2409 194.5396 2.2025 136685 1.9649 8.4192 3.241614
12 3.2112 195.1362 2.1900 136316 1.9750 8.3752 3.225423
13 3.2231 193.3560 2.2475 140295 1.9691 8.3546 3.217797
14 3.2015 192.9649 2.2575 139474 1.9500 8.3116 3.205856
15 3.1744 194.0154 2.1900 146202 1.8476 8.2225 3.177928
16 3.1646 194.4423 2.2650 142983 1.8600 8.1948 3.156405
17 3.1708 194.9473 2.2425 141377 1.8522 8.2589 3.176243
18 3.1675 193.9788 2.2400 141377 1.8600 8.2600 3.177165
19 3.1744 194.2563 2.3000 149875 1.8718 8.2899 3.177211
20 3.1410 193.4316 2.2300 129561 1.8480 8.2395 3.145533
21 3.1266 191.2633 2.2550 122636 1.8440 8.2396 3.127410
22 3.1486 192.0354 2.3600 130996 1.8570 8.8640 3.148792
23 3.1282 194.3351 2.4825 92430 1.7849 8.1291 3.124913
24 3.1214 193.5196 2.4750 94814 1.7624 8.1991 3.124992
25 3.1230 193.2017 2.3725 87590 1.7660 8.2310 3.117981
26 3.1182 192.1642 2.4475 87715 1.6955 8.2414 3.117679
27 3.1203 191.3744 2.3775 89857 1.6539 8.2480 3.119898
28 3.1156 192.2646 2.3725 92159 1.5976 8.1676 3.121039
29 3.1270 192.7555 2.3675 97425 1.5896 8.1162 3.123903
30 3.1154 194.0375 2.3725 87598 1.5277 8.2640 3.119438
31 3.1104 192.0596 2.3850 93236 1.5132 7.9999 3.113963
32 3.0846 192.2792 2.2900 94608 1.4990 8.1600 3.101229
33 3.0569 193.2573 2.3050 84663 1.4715 8.2200 3.076817
34 3.0893 192.7632 2.2550 67149 1.4955 7.9590 3.083266
35 3.0991 192.1229 2.3050 75519 1.4280 7.9183 3.089377
36 3.0879 192.1229 2.3100 76756 1.3839 7.9133 3.084225
37 3.0965 192.0502 2.2175 61748 1.3130 7.8750 3.075252
38 3.0655 191.2274 2.2300 41490 1.2823 7.8656 3.063025
39 3.0636 191.6342 2.1925 51049 1.1492 7.7447 3.068808
40 3.1097 190.9312 2.2150 21934 1.1626 7.6895 3.091819
Note: Input DF in reproducible form is:
Lines <- " Y X1 X2 X3 X4 X5
1 3.2860 192.5115 2.1275 83381 11.4360 8.7440
2 3.2650 190.1462 2.0050 88720 11.4359 8.8971
3 3.2213 192.9773 2.0500 74130 11.4623 8.8380
4 3.1991 193.7058 2.1050 73930 11.3366 8.7536
5 3.2224 193.5407 2.0275 80875 11.3534 8.7555
6 3.2000 190.6049 2.0950 86606 11.3290 8.8555
7 3.1939 191.1390 2.0975 91402 11.2960 8.8433
8 3.1971 192.2921 2.2700 88181 11.2930 8.8681
9 3.1873 194.9700 2.3300 115959 1.9477 8.5245
10 3.2182 194.5396 2.4200 134754 11.3200 8.4990
11 3.2409 194.5396 2.2025 136685 1.9649 8.4192
12 3.2112 195.1362 2.1900 136316 1.9750 8.3752
13 3.2231 193.3560 2.2475 140295 1.9691 8.3546
14 3.2015 192.9649 2.2575 139474 1.9500 8.3116
15 3.1744 194.0154 2.1900 146202 1.8476 8.2225
16 3.1646 194.4423 2.2650 142983 1.8600 8.1948
17 3.1708 194.9473 2.2425 141377 1.8522 8.2589
18 3.1675 193.9788 2.2400 141377 1.8600 8.2600
19 3.1744 194.2563 2.3000 149875 1.8718 8.2899
20 3.1410 193.4316 2.2300 129561 1.8480 8.2395
21 3.1266 191.2633 2.2550 122636 1.8440 8.2396
22 3.1486 192.0354 2.3600 130996 1.8570 8.8640
23 3.1282 194.3351 2.4825 92430 1.7849 8.1291
24 3.1214 193.5196 2.4750 94814 1.7624 8.1991
25 3.1230 193.2017 2.3725 87590 1.7660 8.2310
26 3.1182 192.1642 2.4475 87715 1.6955 8.2414
27 3.1203 191.3744 2.3775 89857 1.6539 8.2480
28 3.1156 192.2646 2.3725 92159 1.5976 8.1676
29 3.1270 192.7555 2.3675 97425 1.5896 8.1162
30 3.1154 194.0375 2.3725 87598 1.5277 8.2640
31 3.1104 192.0596 2.3850 93236 1.5132 7.9999
32 3.0846 192.2792 2.2900 94608 1.4990 8.1600
33 3.0569 193.2573 2.3050 84663 1.4715 8.2200
34 3.0893 192.7632 2.2550 67149 1.4955 7.9590
35 3.0991 192.1229 2.3050 75519 1.4280 7.9183
36 3.0879 192.1229 2.3100 76756 1.3839 7.9133
37 3.0965 192.0502 2.2175 61748 1.3130 7.8750
38 3.0655 191.2274 2.2300 41490 1.2823 7.8656
39 3.0636 191.6342 2.1925 51049 1.1492 7.7447
40 3.1097 190.9312 2.2150 21934 1.1626 7.6895"
DF <- read.table(text = Lines, header = TRUE)

Automate regression by rows

I have a data.frame
set.seed(100)
exp <- data.frame(exp = c(rep(LETTERS[1:2], each = 10)), re = c(rep(seq(1, 10, 1), 2)), age1 = seq(10, 29, 1), age2 = seq(30, 49, 1),
h = c(runif(20, 10, 40)), h2 = c(40 + runif(20, 4, 9)))
I'd like to make a lm for each row in a data set (h and h2 ~ age1 and age2)
I do it by loop
exp$modelh <- 0
for (i in 1:length(exp$exp)){
age = c(exp$age1[i], exp$age2[i])
h = c(exp$h[i], exp$h2[i])
model = lm(age ~ h)
exp$modelh[i] = coef(model)[1] + 100 * coef(model)[2]
}
and it works well but takes some time with very large files. Will be grateful for the faster solution f.ex. dplyr
Using dplyr, we can try with rowwise() and do. Inside the do, we concatenate (c) the 'age1', 'age2' to create 'age', likewise, we can create 'h', apply lm, extract the coef to create the column 'modelh'.
library(dplyr)
exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )
gives the output
# exp re age1 age2 h h2 modelh
#1 A 1 10 30 19.23298 46.67906 68.85506
#2 A 2 11 31 17.73018 47.55402 66.17050
#3 A 3 12 32 26.56967 46.69174 84.98486
#4 A 4 13 33 11.69149 47.74486 61.98766
#5 A 5 14 34 24.05648 46.10051 82.90167
#6 A 6 15 35 24.51312 44.85710 89.21053
#7 A 7 16 36 34.37208 47.85151 113.37492
#8 A 8 17 37 21.10962 48.40977 74.79483
#9 A 9 18 38 26.39676 46.74548 90.34187
#10 A 10 19 39 15.10786 45.38862 75.07002
#11 B 1 20 40 28.74989 46.44153 100.54666
#12 B 2 21 41 36.46497 48.64253 125.34773
#13 B 3 22 42 18.41062 45.74346 81.70062
#14 B 4 23 43 21.95464 48.77079 81.20773
#15 B 5 24 44 32.87653 47.47637 115.95097
#16 B 6 25 45 30.07065 48.44727 101.10688
#17 B 7 26 46 16.13836 44.90204 84.31080
#18 B 8 27 47 20.72575 47.14695 87.00805
#19 B 9 28 48 20.78425 48.94782 84.25406
#20 B 10 29 49 30.70872 44.65144 128.39415
We could do this with the devel version of data.table i.e. v1.9.5. Instructions to install the devel version are here.
We convert the 'data.frame' to 'data.table' (setDT), create a column 'rn' with the option keep.rownames=TRUE. We melt the dataset by specifying the patterns in the measure to convert from 'wide' to 'long' format. Grouped by 'rn', we do the lm and get the coef. This can be assigned as a new column in the original dataset ('exp') while removing the unwanted 'rn' column by assigning (:=) it to NULL.
library(data.table)#v1.9.5+
modelh <- melt(setDT(exp, keep.rownames=TRUE), measure=patterns('^age', '^h'),
value.name=c('age', 'h'))[, {model <- lm(age ~h)
coef(model)[1] + 100 * coef(model)[2]},rn]$V1
exp[, modelh:= modelh][, rn := NULL]
exp
# exp re age1 age2 h h2 modelh
# 1: A 1 10 30 19.23298 46.67906 68.85506
# 2: A 2 11 31 17.73018 47.55402 66.17050
# 3: A 3 12 32 26.56967 46.69174 84.98486
# 4: A 4 13 33 11.69149 47.74486 61.98766
# 5: A 5 14 34 24.05648 46.10051 82.90167
# 6: A 6 15 35 24.51312 44.85710 89.21053
# 7: A 7 16 36 34.37208 47.85151 113.37492
# 8: A 8 17 37 21.10962 48.40977 74.79483
# 9: A 9 18 38 26.39676 46.74548 90.34187
#10: A 10 19 39 15.10786 45.38862 75.07002
#11: B 1 20 40 28.74989 46.44153 100.54666
#12: B 2 21 41 36.46497 48.64253 125.34773
#13: B 3 22 42 18.41062 45.74346 81.70062
#14: B 4 23 43 21.95464 48.77079 81.20773
#15: B 5 24 44 32.87653 47.47637 115.95097
#16: B 6 25 45 30.07065 48.44727 101.10688
#17: B 7 26 46 16.13836 44.90204 84.31080
#18: B 8 27 47 20.72575 47.14695 87.00805
#19: B 9 28 48 20.78425 48.94782 84.25406
#20: B 10 29 49 30.70872 44.65144 128.39415
Great (double) answer from #akrun.
Just a suggestion for your future analysis as you mentioned "it's an example of a bigger problem". Obviously, if you are really interested in building models rowwise then you'll create more and more columns as your age and h observations increase. If you get N observations you'll have to use 2xN columns for those 2 variables only.
I'd suggest to use a long data format in order to increase your rows instead of your columns.
Something like:
exp[1,] # how your first row (model building info) looks like
# exp re age1 age2 h h2
# 1 A 1 10 30 19.23298 46.67906
reshape(exp[1,], # how your model building info is transformed
varying = list(c("age1","age2"),
c("h","h2")),
v.names = c("age_value","h_value"),
direction = "long")
# exp re time age_value h_value id
# 1.1 A 1 1 10 19.23298 1
# 1.2 A 1 2 30 46.67906 1
Apologies if the "bigger problem" refers to something else and this answer is irrelevant.
With base R, the function sprintf can help us create formulas. And lapply carries out the calculation.
strings <- sprintf("c(%f,%f) ~ c(%f,%f)", exp$age1, exp$age2, exp$h, exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
exp$modelh <- unlist(lst)
exp
# exp re age1 age2 h h2 modelh
# 1 A 1 10 30 19.23298 46.67906 68.85506
# 2 A 2 11 31 17.73018 47.55402 66.17050
# 3 A 3 12 32 26.56967 46.69174 84.98486
# 4 A 4 13 33 11.69149 47.74486 61.98766
# 5 A 5 14 34 24.05648 46.10051 82.90167
# 6 A 6 15 35 24.51312 44.85710 89.21053
# 7 A 7 16 36 34.37208 47.85151 113.37493
# 8 A 8 17 37 21.10962 48.40977 74.79483
# 9 A 9 18 38 26.39676 46.74548 90.34187
# 10 A 10 19 39 15.10786 45.38862 75.07002
# 11 B 1 20 40 28.74989 46.44153 100.54666
# 12 B 2 21 41 36.46497 48.64253 125.34773
# 13 B 3 22 42 18.41062 45.74346 81.70062
# 14 B 4 23 43 21.95464 48.77079 81.20773
# 15 B 5 24 44 32.87653 47.47637 115.95097
# 16 B 6 25 45 30.07065 48.44727 101.10688
# 17 B 7 26 46 16.13836 44.90204 84.31080
# 18 B 8 27 47 20.72575 47.14695 87.00805
# 19 B 9 28 48 20.78425 48.94782 84.25406
# 20 B 10 29 49 30.70872 44.65144 128.39416
In the lapply function the expression as.formula(x) is what converts the formulas created in the first line into a format usable by the lm function.
Benchmark
library(dplyr)
library(microbenchmark)
set.seed(100)
big.exp <- data.frame(age1=sample(30, 1e4, T),
age2=sample(30:50, 1e4, T),
h=runif(1e4, 10, 40),
h2= 40 + runif(1e4,4,9))
microbenchmark(
plafort = {strings <- sprintf("c(%f,%f) ~ c(%f,%f)", big.exp$age1, big.exp$age2, big.exp$h, big.exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
big.exp$modelh <- unlist(lst)},
akdplyr = {big.exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )}
,times=5)
t: seconds
expr min lq mean median uq max neval cld
plafort 13.00605 13.41113 13.92165 13.56927 14.53814 15.08366 5 a
akdplyr 26.95064 27.64240 29.40892 27.86258 31.02955 33.55940 5 b
(Note: I downloaded the newest 1.9.5 devel version of data.table today, but continued to receive errors when trying to test it.
The results also differ fractionally (1.93 x 10^-8). Rounding likely accounts for the difference.)
all.equal(pl, ak)
[1] "Attributes: < Component “class”: Lengths (1, 3) differ (string compare on first 1) >"
[2] "Attributes: < Component “class”: 1 string mismatch >"
[3] "Component “modelh”: Mean relative difference: 1.933893e-08"
Conclusion
The lapply approach seems to perform well compared to dplyr with respect to speed, but it's 5 digit rounding may be an issue. Improvements may be possible. Perhaps using apply after converting to matrix to increase speed and efficiency.

how to fill stacked barplot with patterns or textures in R

I have used ggplot2 to draw a stacked barplot, and I want to fill the barplot with patterns. But it seems that the problem is very complicated to be solved by ggplot2.
So is there a way to fill stacked barplot with patterns or textures with base R or with another R package?
My plot is similar to this barplot:
and I want the barplot looks like this, fill with patterns or textures:
My data is from my previous post:
plant group n percentage
1 Cucumber-1 [3.19e-39,2] 14729 0.8667686695
2 Cucumber-1 (2,4] 1670 0.0982757606
3 Cucumber-1 (4,6] 447 0.0263049491
4 Cucumber-1 (6,8] 131 0.0077090567
5 Cucumber-1 (8,10] 16 0.0009415642
6 Cucumber-2 [3.19e-39,2] 20206 0.9410394933
7 Cucumber-2 (2,4] 1155 0.0537909836
8 Cucumber-2 (4,6] 90 0.0041915052
9 Cucumber-2 (6,8] 16 0.0007451565
10 Cucumber-2 (8,10] 5 0.0002328614
11 Eggplant-1 [3.19e-39,2] 11273 0.9012631916
12 Eggplant-1 (2,4] 960 0.0767508794
13 Eggplant-1 (4,6] 181 0.0144707387
14 Eggplant-1 (6,8] 31 0.0024784138
15 Eggplant-1 (8,10] 63 0.0050367765
16 Eggplant-2 [3.19e-39,2] 16483 0.9493721921
17 Eggplant-2 (2,4] 725 0.0417578620
18 Eggplant-2 (4,6] 140 0.0080635871
19 Eggplant-2 (6,8] 12 0.0006911646
20 Eggplant-2 (8,10] 2 0.0001151941
21 Pepper-1 [3.19e-39,2] 4452 0.9763157895
22 Pepper-1 (2,4] 97 0.0212719298
23 Pepper-1 (4,6] 11 0.0024122807
24 Pepper-2 [3.19e-39,2] 23704 0.9560763119
25 Pepper-2 (2,4] 905 0.0365022385
26 Pepper-2 (4,6] 184 0.0074214496
Most of the required work is to get your data in shape. The function ?barplot is simple to use, but you want to feed it a matrix. You can use vectors for the density= and angle= arguments to distinguish the elements of the stacked bar plot.
d = read.table(text="plant ...
... 184 0.0074214496", header=T)
d$group <- factor(d$group, levels=c(levels(d$group)[c(5,1:4)]),
labels=c("(0,2]", levels(d$group)[1:4]))
levels(d$group)
# [1] "(0,2]" "(2,4]" "(4,6]" "(6,8]" "(8,10]"
tab <- table(d$group, d$plant)
tab
# output omitted
d <- rbind(d,
c("Pepper-1", "(6,8]", 0, 0),
c("Pepper-1", "(8,10]", 0, 0),
c("Pepper-2", "(6,8]", 0, 0),
c("Pepper-2", "(8,10]", 0, 0) )
d <- d[order(d$plant, d$group),]
d
# output omitted
mat <- matrix(as.numeric(d$percentage), nrow=5, ncol=6)
rownames(mat) <- levels(d$group)
colnames(mat) <- levels(d$plant)
names(dimnames(mat)) <- c("group", "plant")
mat
# plant
# group Cucumber-1 Cucumber-2 Eggplant-1 Eggplant-2 Pepper-1 Pepper-2
# (0,2] 0.8667686695 0.9410394933 0.901263192 0.9493721921 0.976315789 0.95607631
# (2,4] 0.0982757606 0.0537909836 0.076750879 0.0417578620 0.021271930 0.03650224
# (4,6] 0.0263049491 0.0041915052 0.014470739 0.0080635871 0.002412281 0.00742145
# (6,8] 0.0077090567 0.0007451565 0.002478414 0.0006911646 0.000000000 0.00000000
# (8,10] 0.0009415642 0.0002328614 0.005036777 0.0001151941 0.000000000 0.00000000
barplot(mat, density=5:9, angle=seq(40, 90, 10), cex.names=.8)

Resources