This is my code. I need to apply a simple function on integer columns in a dataframe using purrr::map_df function, but I need to maintain the character column:
fun1 <- function(x){(x - mean(x))/sd(x)}
df <- mtcars %>% rownames_to_column()
df %>% map_df(~ fun1(.x))
What is your expected output exactly? Something like this?
library(tidyverse)
fun1 <- function(x) {
(x - mean(x)) / sd(x)
}
df <- mtcars %>%
rownames_to_column() %>%
as_tibble()
df %>%
mutate(across(where(is.integer), fun1))
# A tibble: 32 × 12
rowname mpg cyl disp hp drat wt qsec vs am gear carb
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Mazda RX4 0.151 -0.105 -0.571 -0.535 0.568 -0.610 -0.777 -0.868 1.19 0.424 0.735
2 Mazda RX4 Wag 0.151 -0.105 -0.571 -0.535 0.568 -0.350 -0.464 -0.868 1.19 0.424 0.735
3 Datsun 710 0.450 -1.22 -0.990 -0.783 0.474 -0.917 0.426 1.12 1.19 0.424 -1.12
4 Hornet 4 Drive 0.217 -0.105 0.220 -0.535 -0.966 -0.00230 0.890 1.12 -0.814 -0.932 -1.12
5 Hornet Sportabout -0.231 1.01 1.04 0.413 -0.835 0.228 -0.464 -0.868 -0.814 -0.932 -0.503
6 Valiant -0.330 -0.105 -0.0462 -0.608 -1.56 0.248 1.33 1.12 -0.814 -0.932 -1.12
7 Duster 360 -0.961 1.01 1.04 1.43 -0.723 0.361 -1.12 -0.868 -0.814 -0.932 0.735
8 Merc 240D 0.715 -1.22 -0.678 -1.24 0.175 -0.0278 1.20 1.12 -0.814 0.424 -0.503
9 Merc 230 0.450 -1.22 -0.726 -0.754 0.605 -0.0687 2.83 1.12 -0.814 0.424 -0.503
10 Merc 280 -0.148 -0.105 -0.509 -0.345 0.605 0.228 0.253 1.12 -0.814 0.424 0.735
# … with 22 more rows
# ℹ Use `print(n = ...)` to see more rows
Update(removed prior answer):
library(dplyr)
library(purrr)
mtcars %>%
select_if(is.numeric) %>%
map_df(~ fun1(.)) %>%
bind_cols(mtcars %>%
rownames_to_column() %>%
select(rowname))
With this output:
mpg cyl disp hp drat wt qsec vs am gear carb rowname
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 0.151 -0.105 -0.571 -0.535 0.568 -0.610 -0.777 -0.868 1.19 0.424 0.735 Mazda RX4
2 0.151 -0.105 -0.571 -0.535 0.568 -0.350 -0.464 -0.868 1.19 0.424 0.735 Mazda RX4 Wag
3 0.450 -1.22 -0.990 -0.783 0.474 -0.917 0.426 1.12 1.19 0.424 -1.12 Datsun 710
4 0.217 -0.105 0.220 -0.535 -0.966 -0.00230 0.890 1.12 -0.814 -0.932 -1.12 Hornet 4 Drive
5 -0.231 1.01 1.04 0.413 -0.835 0.228 -0.464 -0.868 -0.814 -0.932 -0.503 Hornet Sportabout
6 -0.330 -0.105 -0.0462 -0.608 -1.56 0.248 1.33 1.12 -0.814 -0.932 -1.12 Valiant
7 -0.961 1.01 1.04 1.43 -0.723 0.361 -1.12 -0.868 -0.814 -0.932 0.735 Duster 360
8 0.715 -1.22 -0.678 -1.24 0.175 -0.0278 1.20 1.12 -0.814 0.424 -0.503 Merc 240D
9 0.450 -1.22 -0.726 -0.754 0.605 -0.0687 2.83 1.12 -0.814 0.424 -0.503 Merc 230
10 -0.148 -0.105 -0.509 -0.345 0.605 0.228 0.253 1.12 -0.814 0.424 0.735 Merc 280
# … with 22 more rows
# ℹ Use `print(n = ...)` to see more rows
Using map_if
library(purrr)
library(dplyr)
mtcars %>%
type.convert(as.is = TRUE) %>%
rownames_to_column() %>%
map_if(.p = ~ inherits(.x, "integer"), .f = fun1) %>%
bind_cols()
you may alternatively use mutate_if as below
df %>% mutate_if(is.numeric, ~fun1(.x))
Created on 2023-02-19 with reprex v2.0.2
rowname mpg cyl disp hp drat wt qsec
1 Mazda RX4 0.15088482 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.610399567 -0.77716515
2 Mazda RX4 Wag 0.15088482 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.349785269 -0.46378082
3 Datsun 710 0.44954345 -1.2248578 -0.99018209 -0.78304046 0.47399959 -0.917004624 0.42600682
4 Hornet 4 Drive 0.21725341 -0.1049878 0.22009369 -0.53509284 -0.96611753 -0.002299538 0.89048716
5 Hornet Sportabout -0.23073453 1.0148821 1.04308123 0.41294217 -0.83519779 0.227654255 -0.46378082
6 Valiant -0.33028740 -0.1049878 -0.04616698 -0.60801861 -1.56460776 0.248094592 1.32698675
7 Duster 360 -0.96078893 1.0148821 1.04308123 1.43390296 -0.72298087 0.360516446 -1.12412636
8 Merc 240D 0.71501778 -1.2248578 -0.67793094 -1.23518023 0.17475447 -0.027849959 1.20387148
9 Merc 230 0.44954345 -1.2248578 -0.72553512 -0.75387015 0.60491932 -0.068730634 2.82675459
10 Merc 280 -0.14777380 -0.1049878 -0.50929918 -0.34548584 0.60491932 0.227654255 0.25252621
11 Merc 280C -0.38006384 -0.1049878 -0.50929918 -0.34548584 0.60491932 0.227654255 0.58829513
12 Merc 450SE -0.61235388 1.0148821 0.36371309 0.48586794 -0.98482035 0.871524874 -0.25112717
13 Merc 450SL -0.46302456 1.0148821 0.36371309 0.48586794 -0.98482035 0.524039143 -0.13920420
14 Merc 450SLC -0.81145962 1.0148821 0.36371309 0.48586794 -0.98482035 0.575139986 0.08464175
15 Cadillac Fleetwood -1.60788262 1.0148821 1.94675381 0.85049680 -1.24665983 2.077504765 0.07344945
16 Lincoln Continental -1.60788262 1.0148821 1.84993175 0.99634834 -1.11574009 2.255335698 -0.01608893
17 Chrysler Imperial -0.89442035 1.0148821 1.68856165 1.21512565 -0.68557523 2.174596366 -0.23993487
18 Fiat 128 2.04238943 -1.2248578 -1.22658929 -1.17683962 0.90416444 -1.039646647 0.90727560
19 Honda Civic 1.71054652 -1.2248578 -1.25079481 -1.38103178 2.49390411 -1.637526508 0.37564148
20 Toyota Corolla 2.29127162 -1.2248578 -1.28790993 -1.19142477 1.16600392 -1.412682800 1.14790999
21 Toyota Corona 0.23384555 -1.2248578 -0.89255318 -0.72469984 0.19345729 -0.768812180 1.20946763
22 Dodge Challenger -0.76168319 1.0148821 0.70420401 0.04831332 -1.56460776 0.309415603 -0.54772305
23 AMC Javelin -0.81145962 1.0148821 0.59124494 0.04831332 -0.83519779 0.222544170 -0.30708866
24 Camaro Z28 -1.12671039 1.0148821 0.96239618 1.43390296 0.24956575 0.636460997 -1.36476075
25 Pontiac Firebird -0.14777380 1.0148821 1.36582144 0.41294217 -0.96611753 0.641571082 -0.44699237
26 Fiat X1-9 1.19619000 -1.2248578 -1.22416874 -1.17683962 0.90416444 -1.310481114 0.58829513
27 Porsche 914-2 0.98049211 -1.2248578 -0.89093948 -0.81221077 1.55876313 -1.100967659 -0.64285758
28 Lotus Europa 1.71054652 -1.2248578 -1.09426581 -0.49133738 0.32437703 -1.741772228 -0.53093460
29 Ford Pantera L -0.71190675 1.0148821 0.97046468 1.71102089 1.16600392 -0.048290296 -1.87401028
30 Ferrari Dino -0.06481307 -0.1049878 -0.69164740 0.41294217 0.04383473 -0.457097039 -1.31439542
31 Maserati Bora -0.84464392 1.0148821 0.56703942 2.74656682 -0.10578782 0.360516446 -1.81804880
32 Volvo 142E 0.21725341 -1.2248578 -0.88529152 -0.54967799 0.96027290 -0.446876870 0.42041067
vs am gear carb
1 -0.8680278 1.1899014 0.4235542 0.7352031
2 -0.8680278 1.1899014 0.4235542 0.7352031
3 1.1160357 1.1899014 0.4235542 -1.1221521
4 1.1160357 -0.8141431 -0.9318192 -1.1221521
5 -0.8680278 -0.8141431 -0.9318192 -0.5030337
6 1.1160357 -0.8141431 -0.9318192 -1.1221521
7 -0.8680278 -0.8141431 -0.9318192 0.7352031
8 1.1160357 -0.8141431 0.4235542 -0.5030337
9 1.1160357 -0.8141431 0.4235542 -0.5030337
10 1.1160357 -0.8141431 0.4235542 0.7352031
11 1.1160357 -0.8141431 0.4235542 0.7352031
12 -0.8680278 -0.8141431 -0.9318192 0.1160847
13 -0.8680278 -0.8141431 -0.9318192 0.1160847
14 -0.8680278 -0.8141431 -0.9318192 0.1160847
15 -0.8680278 -0.8141431 -0.9318192 0.7352031
16 -0.8680278 -0.8141431 -0.9318192 0.7352031
17 -0.8680278 -0.8141431 -0.9318192 0.7352031
18 1.1160357 1.1899014 0.4235542 -1.1221521
19 1.1160357 1.1899014 0.4235542 -0.5030337
20 1.1160357 1.1899014 0.4235542 -1.1221521
21 1.1160357 -0.8141431 -0.9318192 -1.1221521
22 -0.8680278 -0.8141431 -0.9318192 -0.5030337
23 -0.8680278 -0.8141431 -0.9318192 -0.5030337
24 -0.8680278 -0.8141431 -0.9318192 0.7352031
25 -0.8680278 -0.8141431 -0.9318192 -0.5030337
26 1.1160357 1.1899014 0.4235542 -1.1221521
27 -0.8680278 1.1899014 1.7789276 -0.5030337
28 1.1160357 1.1899014 1.7789276 -0.5030337
29 -0.8680278 1.1899014 1.7789276 0.7352031
30 -0.8680278 1.1899014 1.7789276 1.9734398
31 -0.8680278 1.1899014 1.7789276 3.2116766
32 1.1160357 1.1899014 0.4235542 -0.5030337
Related
I have a data frame and I want to do rowise normalization for each row.
For example:
row1_new = (row1_old - mean_of_row1)/standard_dev_of_row1.
I wrote the following code for that:
normalize_df <- function(x){
mean1<- mean(unlist(as.list(x)))
std1<- sd(unlist(as.list(x)))
y = (x - mean1)/std1
return(y)
}
n_rows <- length(row.names(query_data))
for(i in seq(1, n_rows)){
query_data[i,]<- query_data[i,]
}
But it seems to be a lot slower and I didn't succeed in using apply function.
How can I use apply function to row-wise normalize dataframe?
As an alternative, using the built-in function scale t(scale(t(df))).
Here is another option using rowMeans and rowSds function.
library(matrixStats)
df <- mtcars
(df - rowMeans(df))/rowSds(as.matrix(df))
# mpg cyl disp hp drat wt qsec vs am gear carb
#Mazda RX4 -0.16637 -0.447 2.43 1.496 -0.486 -0.510 -0.25117 -0.559 -0.540 -0.484 -0.484
#Mazda RX4 Wag -0.16784 -0.448 2.43 1.495 -0.487 -0.507 -0.24221 -0.560 -0.542 -0.486 -0.486
#Datsun 710 -0.02053 -0.504 2.17 1.785 -0.508 -0.547 -0.12833 -0.581 -0.581 -0.504 -0.581
#Hornet 4 Drive -0.21836 -0.412 2.76 0.897 -0.449 -0.447 -0.24304 -0.475 -0.488 -0.450 -0.475
#Hornet Sportabout -0.30751 -0.402 2.69 1.067 -0.444 -0.442 -0.32228 -0.472 -0.472 -0.446 -0.454
#Valiant -0.24228 -0.415 2.72 1.000 -0.462 -0.452 -0.21197 -0.487 -0.501 -0.458 -0.487
#...
#...
It is simple to use an anonymous function:
t(apply(mtcars, 1, function(x) (x-mean(x))/sd(x)))
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout Valiant Duster 360 Merc 240D Merc 230 Merc 280 Merc 280C
mpg -0.1663702 -0.1678380 -0.02053465 -0.2183565 -0.3075069 -0.2422777 -0.3696702 -0.005278277 -0.09496285 -0.2208754 -0.2439523
cyl -0.4465404 -0.4481484 -0.50419828 -0.4122884 -0.4016114 -0.4152404 -0.4209455 -0.464365595 -0.49763495 -0.4511720 -0.4497564
disp 2.4298741 2.4297052 2.17138772 2.7611422 2.6941650 2.7152412 2.4439581 2.746995205 2.43244710 2.3682166 2.3687127
Using dapply
library(collapse)
dapply(df, MARGIN = 1, FUN = function(x) (x-mean(x))/sd(x))
Is there a simple way to combine tidyr::unite() and dplyr::merge(), when we have two data frames with the same structure?
Example
1 -- Very simple scenario
Say that we have two dataframes that we wish to join/merge/"superimpose":
df_1 <-
data.frame(
first_name = c("john", "rachel", "thomas", "rebecca")
)
df_2 <-
data.frame(
last_name = c("smith", "williams", "miller", "jones")
)
Then I would do:
cbind(df_1, df_2) %>%
tidyr::unite(full, first_name, last_name)
#> full
#> 1 john_smith
#> 2 rachel_williams
#> 3 thomas_miller
#> 4 rebecca_jones
2 -- More complex scenario
We might have a richer dataset than the ones shown above. Take for example my_mtcars (as seen below), that we wish to "merge-unite-superimpose" with another dataframe such as df_random_vals (seen below).
# my_mtcars
library(tibble)
my_mtcars <-
mtcars %>%
rownames_to_column("cars") %>%
as_tibble()
my_mtcars
#> # A tibble: 32 x 12
#> cars mpg cyl disp hp drat wt qsec vs am gear carb
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 Mazda RX4 ~ 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 Hornet 4 D~ 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 Hornet Spo~ 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 Duster 360 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 Merc 240D 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 Merc 230 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 Merc 280 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ... with 22 more rows
Created on 2021-07-27 by the reprex package (v2.0.0)
The df_random_vals table is going to be in the same dimensions as my_mtcars.
dim_my_mtcars <- dim(my_mtcars)
target_nrows <- dim_my_mtcars[1]
target_ncols <- dim_my_mtcars[2]
set.seed(2021)
my_mat <-
matrix(data = rnorm(target_nrows*target_ncols),
ncol = target_ncols,
nrow = target_nrows)
df_random_vals <-
my_mat %>%
as.data.frame() %>%
as_tibble()
df_random_vals
#> # A tibble: 32 x 12
#> V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.122 1.02 0.231 -0.647 1.02 0.828 1.22 1.92 0.850 -1.99
#> 2 0.552 -1.42 -0.984 -0.868 2.45 0.355 -0.561 -0.237 -1.53 -1.49
#> 3 0.349 -0.605 0.565 -0.509 -0.246 0.159 0.337 1.57 -0.0316 0.255
#> 4 0.360 -1.58 1.62 -2.08 0.542 0.955 -1.54 0.481 1.43 -0.814
#> 5 0.898 -1.29 -0.252 -0.260 0.197 -0.340 -0.240 0.0426 -0.927 0.834
#> 6 -1.92 -1.45 -1.06 0.450 -2.07 -0.727 0.515 0.440 1.01 -1.15
#> 7 0.262 -0.0871 -0.348 -0.143 0.513 -1.70 -0.239 -1.88 -0.0860 -0.140
#> 8 0.916 0.505 -0.0430 -0.487 -0.406 1.95 0.582 -1.72 0.938 1.18
#> 9 0.0138 0.116 -1.40 -1.20 0.356 2.67 0.270 1.88 -0.518 0.187
#> 10 1.73 1.76 1.49 0.0469 -0.332 2.06 -1.34 -0.0313 -0.935 -0.695
#> # ... with 22 more rows, and 2 more variables: V11 <dbl>, V12 <dbl>
identical(
dim(df_random_vals),
dim(my_mtcars)
)
#> [1] TRUE
My question.
Given that both my_mtcars and df_random_vals have the same dimensions, how can we "superimpose" df_random_vals over my_mtcars, to get the following desired output:
# A tibble: 32 x 12
cars mpg cyl disp hp drat wt qsec vs am gear carb
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Mazda RX4_-0.122 21_1.019 6_0.231 160_-0.647 110_1.015 3.9_0.828 2.62_1.219 16.46_1.917 0_0.85 1_-1.988 4_0.4 4_0.935
2 Mazda RX4 Wag_0.552 21_-1.421 6_-0.984 160_-0.868 110_2.454 3.9_0.355 2.875_-0.561 17.02_-0.237 0_-1.531 1_-1.492 4_0.748 4_-0.012
3 Datsun 710_0.349 22.8_-0.605 4_0.565 108_-0.509 93_-0.246 3.85_0.159 2.32_0.337 18.61_1.572 1_-0.032 1_0.255 4_-0.088 1_0.26
4 Hornet 4 Drive_0.36 21.4_-1.583 6_1.617 258_-2.078 110_0.542 3.08_0.955 3.215_-1.537 19.44_0.481 1_1.428 0_-0.814 3_-1.086 1_1.151
5 Hornet Sportabout_0.898 18.7_-1.286 8_-0.252 360_-0.26 175_0.197 3.15_-0.34 3.44_-0.24 17.02_0.043 0_-0.927 0_0.834 3_-0.085 2_1.168
6 Valiant_-1.923 18.1_-1.455 6_-1.056 225_0.45 105_-2.071 2.76_-0.727 3.46_0.515 20.22_0.44 1_1.006 0_-1.152 3_0.466 1_-1.188
7 Duster 360_0.262 14.3_-0.087 8_-0.348 360_-0.143 245_0.513 3.21_-1.698 3.57_-0.239 15.84_-1.881 0_-0.086 0_-0.14 3_0 4_0.221
8 Merc 240D_0.916 24.4_0.505 4_-0.043 146.7_-0.487 62_-0.406 3.69_1.954 3.19_0.582 20_-1.724 1_0.938 0_1.176 4_0.616 2_1.134
9 Merc 230_0.014 22.8_0.116 4_-1.398 140.8_-1.196 95_0.356 3.92_2.667 3.15_0.27 22.9_1.883 1_-0.518 0_0.187 4_-0.652 2_0.511
10 Merc 280_1.73 19.2_1.76 6_1.49 167.6_0.047 123_-0.332 3.92_2.063 3.44_-1.343 18.3_-0.031 1_-0.935 0_-0.695 4_0.447 4_-1.497
# ... with 22 more rows
Note how each value from df_random_vals is pasted onto the corresponding value from my_mtcars, as the separator is "_".
How can such "superimposing" be achieved, given two dataframes with the same dimensions?
library(purrr)
map2_dfc(my_mtcars, df_random_vals, paste, sep = "_")
If you are ok having the final result as dataframe (and not tibble) you can use paste directly converting the tibbles to matrix.
result <- data.frame(my_mtcars)
result[] <- paste(as.matrix(my_mtcars), as.matrix(df_random_vals), sep = '_')
So I have a function that I want to apply to a range of columns in my dataframe
My dataframe has 111 columns and I want to de-mean each column based on its ID number.
I have the following code for the column named esgscore
demeaned_df <- df %>%
group_by(ID) %>%
mutate(avg_esgscore = mean(esgscore)) %>%
mutate(demeaned_esgscore = esgscore - avg_esgscore)
This does exactly what I want so I am happy with it but I essentially want to apply this function to columns 3:111 in my dataframe (df) and put these into a new dataframe called demeaned_df
Any help would be greatly appreciated
Here is one idea. We can use across to apply the function to multiple columns. In this example, I applied the demean operation to columns from disp to carb.
library(dplyr)
mtcars2 <- mtcars %>%
group_by(cyl) %>%
mutate(across(disp:carb, .fns = function(x) x - mean(x)))
mtcars2
# # A tibble: 32 x 11
# # Groups: cyl [3]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 -23.3 -12.3 0.314 -0.497 -1.52 -0.571 0.571 0.143 0.571
# 2 21 6 -23.3 -12.3 0.314 -0.242 -0.957 -0.571 0.571 0.143 0.571
# 3 22.8 4 2.86 10.4 -0.221 0.0343 -0.527 0.0909 0.273 -0.0909 -0.545
# 4 21.4 6 74.7 -12.3 -0.506 0.0979 1.46 0.429 -0.429 -0.857 -2.43
# 5 18.7 8 6.90 -34.2 -0.0793 -0.559 0.248 0 -0.143 -0.286 -1.5
# 6 18.1 6 41.7 -17.3 -0.826 0.343 2.24 0.429 -0.429 -0.857 -2.43
# 7 14.3 8 6.90 35.8 -0.0193 -0.429 -0.932 0 -0.143 -0.286 0.5
# 8 24.4 4 41.6 -20.6 -0.381 0.904 0.863 0.0909 -0.727 -0.0909 0.455
# 9 22.8 4 35.7 12.4 -0.151 0.864 3.76 0.0909 -0.727 -0.0909 0.455
# 10 19.2 6 -15.7 0.714 0.334 0.323 0.323 0.429 -0.429 0.143 0.571
# # ... with 22 more rows
Let us say I want to run the linear regression model on the mtcars dataset several times on different samples.
The idea is, for each iteration in a for loop, to store the results of the predict() method every time the linear regression is run
for a different sample. The small example follows for one run:
## Perform model once on a Sample and use model on full dataset:
Sample_Size <- 10
Sample <- mtcars[sample(nrow(mtcars), Sample_Size), ]
Model <- lm(formula = mpg ~ wt, data = Sample)
Predictions <- predict(Model,newdata=mtcars)
## Gets us a list with predicted wt for each car:
Predictions <- t(Predictions)
This yields
> Predictions
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
[1,] 25.80494 23.89161 28.05592 21.34051 19.65228
Valiant Duster 360 Merc 240D Merc 230 Merc 280 Merc 280C Merc 450SE
[1,] 19.50221 18.67685 21.52809 21.82822 19.65228 19.65228 14.92523
Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
[1,] 17.47633 17.10117 6.071394 4.765828
.... and so on for other cars
I would like to perform this procedure several times inside a for loop, every time
choosing a different sample and getting a correspondent Predictions() list,
and store all the Predictions() results by line in a dataframe.
Let's say I run the model for two different samples. Each row of the resulting dataframe should be the outcome above for that sample, like:
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
[1,] 25.80494 23.89161 28.05592 21.34051 19.65228
[2,] 22.80492 22.89147 28.05532 21.34231 20.65290
Valiant Duster 360 Merc 240D Merc 230 Merc 280 Merc 280C Merc 450SE
[1,] 19.50221 18.67685 21.52809 21.82822 19.65228 19.65228 14.92523
[2,] 21.83492 23.84147 29.02532 21.34231 20.35290 18.45228 13.92523
... and so on for other cars.
Any idea on how to go about doing this? I have developed something but it either
throws an error or only stores the last result...What am I missing here?
Here is what I have so far:
### Inside a for loop, to get a dataframe of Predictions:
Bootstrap_times <- 2
Sample_Size <- 10
Predictions <- list()
Results <-vector ("list",Bootstrap_times)## Stores the Predictions for each run
for(i in 1:Bootstrap_times){
### Take a sample
Sample[[i]] <- mtcars[sample(nrow(mtcars), Sample_Size), ]
### Do the regression on the sample
Model[[i]] <- lm(formula = mpg ~ wt, data = Sample[[i]])
### Perform the predict() on the sample
Predictions[[i]] <- predict(Model[[i]],newdata=mtcars)
### put the result as a line on the dataframe Results
Predictions[[i]] <- t(Predictions[[i]])
return(Predictions)
}
Howeever, I keep getting:
Error in [[<-.data.frame(*tmp*, i, value = list(mpg = c(13.3,
10.4, : replacement has 10 rows, data has 0
I prefer to use magic_for() however you can also do this with base R pretty easily.
Here's an example:
Bootstrap_times <- 2
Sample_Size <- 10
Sample <- mtcars[sample(nrow(mtcars), Sample_Size), ]
Model <- lm(formula = mpg ~ wt, data = Sample)
Predictions <- predict(Model,newdata=mtcars)
## You like how I line up arrows, right?
Predictions <- t(Predictions)
Predictions <- list()
Results <-vector ("list",Bootstrap_times)## Stores the Predictions for each run
magicfor::magic_for()
for(i in 1:Bootstrap_times){
### Take a sample
Sample[[i]] <- mtcars[sample(nrow(mtcars), Sample_Size), ]
### Do the regression on the sample
Model[[i]] <- lm(formula = mpg ~ wt, data = Sample[[i]])
### Perform the predict() on the sample
put(predict(Model[[i]],newdata=mtcars))
}
tmp<-magicfor::magic_result_as_dataframe()
tmp
i predict(Model[[i]],newdata=mtcars)
1 1 22.858806
2 2 20.922763
3 1 25.136504
4 2 18.341372
5 1 16.633098
6 2 16.481252
7 1 15.646096
8 2 18.531180
9 1 18.834873
10 2 16.633098
11 1 16.633098
12 2 11.849933
13 1 14.431324
14 2 14.051708
15 1 2.890988
16 2 1.569924
17 1 2.169717
18 2 26.047583
19 1 30.489093
20 2 28.818782
21 1 24.035616
22 2 16.025712
23 1 16.671060
24 2 13.596168
25 1 13.558206
26 2 28.059549
27 1 26.503122
28 2 31.263511
29 1 18.683026
30 2 21.719957
31 1 15.646096
32 2 21.644034
33 1 22.978374
34 2 21.584264
35 1 24.618503
36 2 19.725450
37 1 18.495353
38 2 18.386011
39 1 17.784630
40 2 19.862128
41 1 20.080812
42 2 18.495353
43 1 18.495353
44 2 15.051081
45 1 16.909894
46 2 16.636540
47 1 8.599905
48 2 7.648629
49 1 8.080530
50 2 25.274555
51 1 28.472808
52 2 27.270046
53 1 23.825774
54 2 18.057985
55 1 18.522689
56 2 16.308514
57 1 16.281178
58 2 26.723336
59 1 25.602581
60 2 29.030452
61 1 19.971470
62 2 22.158309
63 1 17.784630
64 2 22.103638
My version:
# load data
data(mtcars)
N <- nrow(mtcars)
# bootstrap parameters
sample_size <- 10
bootstrap_times <- 20
# create empty storage matrix of results
# one row per bootstrap sample, one column per predicted weight
res_mat <- matrix(NA, nrow=bootstrap_times, ncol=N)
colnames(res_mat) <- rownames(mtcars)
# do bootstrap
for (i in seq(bootstrap_times)) {
this_sample <- sample(N, sample_size, replace=FALSE)
reg_result <- lm(mpg ~ wt, data=mtcars[this_sample,])
res_mat[i,] <- predict(reg_result, mtcars)
}
Here is a tidyverse approach using nested data.frames:
library(tidyverse)
Bootstrap_times <- 2
Sample_Size <- 10
Predictions <- data.frame(SampleID = 1:Bootstrap_times) %>%
group_by(SampleID) %>%
nest() %>%
mutate(data = data %>% map(~mtcars[sample(nrow(mtcars), Sample_Size), ]),
Model = data %>% map(~lm(formula = mpg ~ wt, data = .)),
Predictions = map2(Model, data, ~predict(.x, newdata = .y))) %>%
select(SampleID, Predictions) %>%
unnest()
Result:
# A tibble: 20 x 2
SampleID Predictions
<int> <dbl>
1 1 22.7
2 1 16.2
3 1 19.7
4 1 21.5
5 1 18.7
6 1 17.4
7 1 23.3
8 1 10.7
9 1 18.8
10 1 19.8
11 2 11.4
12 2 19.6
13 2 11.7
14 2 18.1
15 2 21.1
16 2 18.6
17 2 16.2
18 2 23.5
19 2 19.7
20 2 20.7
The advantage of this method is that it is very easy to extract other information from the model (using broom) and combine as one single data.frame output:
library(broom)
data.frame(SampleID = 1:Bootstrap_times) %>%
group_by(SampleID) %>%
nest() %>%
mutate(data = data %>% map(~mtcars[sample(nrow(mtcars), Sample_Size), ]),
Model = data %>% map(~lm(formula = mpg ~ wt, data = .) %>% augment())) %>%
select(-data) %>%
unnest()
Result:
# A tibble: 20 x 11
SampleID .rownames mpg wt .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 Dodge Challenger 15.5 3.52 17.2 0.689 -1.72 0.106 2.15 0.0442 -0.862
2 1 Datsun 710 22.8 2.32 23.5 0.940 -0.655 0.198 2.24 0.0148 -0.346
3 1 Cadillac Fleetwood 10.4 5.25 8.24 1.52 2.16 0.515 1.93 1.15 1.47
4 1 Merc 450SE 16.4 4.07 14.4 0.863 2.04 0.167 2.10 0.112 1.06
5 1 Ford Pantera L 15.8 3.17 19.0 0.672 -3.24 0.101 1.85 0.147 -1.62
6 1 Lotus Europa 30.4 1.51 27.6 1.39 2.75 0.432 1.79 1.14 1.73
7 1 Volvo 142E 21.4 2.78 21.1 0.751 0.334 0.126 2.26 0.00207 0.169
8 1 Merc 280C 17.8 3.44 17.6 0.678 0.163 0.103 2.26 0.000378 0.0812
9 1 Mazda RX4 Wag 21 2.88 20.6 0.724 0.428 0.117 2.25 0.00308 0.215
10 1 Camaro Z28 13.3 3.84 15.6 0.773 -2.26 0.134 2.06 0.102 -1.15
11 2 Merc 280 19.2 3.44 19.7 1.09 -0.470 0.108 3.53 0.00138 -0.151
12 2 Toyota Corolla 33.9 1.84 28.2 1.65 5.66 0.251 2.52 0.658 1.98
13 2 Hornet Sportabout 18.7 3.44 19.7 1.09 -0.970 0.108 3.51 0.00588 -0.311
14 2 Mazda RX4 Wag 21 2.88 22.7 1.07 -1.69 0.106 3.47 0.0173 -0.540
15 2 Chrysler Imperial 14.7 5.34 9.50 2.42 5.20 0.539 2.02 3.15 2.32
16 2 Camaro Z28 13.3 3.84 17.5 1.26 -4.23 0.145 3.08 0.163 -1.39
17 2 Valiant 18.1 3.46 19.6 1.09 -1.46 0.110 3.48 0.0136 -0.469
18 2 Porsche 914-2 26 2.14 26.6 1.43 -0.611 0.188 3.52 0.00490 -0.205
19 2 Merc 280C 17.8 3.44 19.7 1.09 -1.87 0.108 3.45 0.0219 -0.600
20 2 Lotus Europa 30.4 1.51 30.0 1.91 0.441 0.335 3.52 0.00677 0.164
Note:
Using this method, you don't even need the prediction step (unless you are using new data), since you have the .fitted values from augment.
The predictions are different between the first and second output because no seed was set.
Is there a better way to select rownames beginning with something?
Ex.
df
k1 k2 p1 p2 l perda lP lucroVar
C16-C12 6.02 12.12 5.35 0.48 4.87 -1.23 3.96 79.84
C47-C12 6.62 12.12 4.63 0.48 4.15 -1.35 3.07 75.45
C7-C12 7.02 12.12 4.30 0.48 3.82 -1.28 2.98 74.90
C21-C12 7.12 12.12 4.19 0.48 3.71 -1.29 2.88 74.20
C12-C13 12.12 13.12 0.48 0.24 0.24 -0.76 0.32 24.00
C12-C43 12.12 13.62 0.48 0.16 0.32 -1.18 0.27 21.33
* The real data frame has 8000 rows.
The 2 following options work:
df[substr(rownames(df),1,3)=='C12',]
or
df[grep('^C12',rownames(df)),]
I would like
df['C12*',]
k1 k2 p1 p2 l perda lP lucroVar
C12-C13 12.12 13.12 0.48 0.24 0.24 -0.76 0.32 24.00
C12-C43 12.12 13.62 0.48 0.16 0.32 -1.18 0.27 21.33
In SQL there is "like 'C12%'".
not that I would recommend doing this but..
`[.data.frame` <- function(x, i, ...) {
base::`[.data.frame`(x, if (is.character(i)) grepl(i, rownames(x)) else i, ...)
}
letters[1]
# [1] "a"
mtcars[1, ]
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
mtcars['M', ]
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
# Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
And to return to normal
rm('[', '[.data.frame')
Why you dislike your approach and what do you mean by "better"? More coincise syntax or faster?
dplyr can do the same, but is actually more convoluted since I think you need to transform rownames to an explicit variable
library(dplyr)
a <- data.frame(a=(1:4), row.names=c("C12","CC12", "C1","12"))
tbl_df(cbind(a=a, b=rownames(a)))%>%
filter(grepl("^C12", b))