I am looking for a nice tidy/dplyr approach to compute the difference between all possible pair of columns (including repeats e.g A-B & B-A) in a dataframe.
I start with df and would like to end with end_df:
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.1
#> Warning: package 'tibble' was built under R version 4.2.1
df <- tibble(A = rnorm(1:10),
B = rnorm(1:10),
C = rnorm(1:10))
print(df)
#> # A tibble: 10 × 3
#> A B C
#> <dbl> <dbl> <dbl>
#> 1 -0.292 1.27 0.783
#> 2 -1.11 0.254 -0.410
#> 3 2.05 1.67 1.35
#> 4 1.31 0.0329 -1.29
#> 5 -1.67 -0.379 -0.696
#> 6 -1.02 -0.686 1.43
#> 7 -0.291 -0.0728 0.336
#> 8 -0.507 0.350 1.70
#> 9 -0.707 0.961 -0.493
#> 10 0.0459 -0.299 -0.0113
end_df <- df %>%
mutate( "A-B" = A-B,
"A-C" = A-C,
"B-A" = B-A,
"B-C" = B-C,
"C-A" = C-A,
"C-B" = C-B)
print(end_df)
#> # A tibble: 10 × 9
#> A B C `A-B` `A-C` `B-A` `B-C` `C-A` `C-B`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.292 1.27 0.783 -1.56 -1.08 1.56 0.482 1.08 -0.482
#> 2 -1.11 0.254 -0.410 -1.37 -0.703 1.37 0.664 0.703 -0.664
#> 3 2.05 1.67 1.35 0.380 0.702 -0.380 0.321 -0.702 -0.321
#> 4 1.31 0.0329 -1.29 1.28 2.60 -1.28 1.33 -2.60 -1.33
#> 5 -1.67 -0.379 -0.696 -1.29 -0.975 1.29 0.317 0.975 -0.317
#> 6 -1.02 -0.686 1.43 -0.334 -2.44 0.334 -2.11 2.44 2.11
#> 7 -0.291 -0.0728 0.336 -0.218 -0.627 0.218 -0.409 0.627 0.409
#> 8 -0.507 0.350 1.70 -0.857 -2.20 0.857 -1.35 2.20 1.35
#> 9 -0.707 0.961 -0.493 -1.67 -0.215 1.67 1.45 0.215 -1.45
#> 10 0.0459 -0.299 -0.0113 0.345 0.0572 -0.345 -0.288 -0.0572 0.288
Created on 2022-09-05 by the reprex package (v2.0.1)
You can get a list of all of the pairs of names, and then create a list of columns of the original dataframe mutated, the bind them:
pairs <- expand.grid(names(df), names(df)) %>%
filter(Var1 != Var2)
map2(pairs$Var1, pairs$Var2, function(x, y) as_tibble_col(df[[x]] - df[[y]], str_c(x, "-", y))) %>%
bind_cols(df, .)
# # A tibble: 10 × 9
# A B C `B-A` `C-A` `A-B` `C-B` `A-C` `B-C`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.199 0.110 0.0148 -0.0895 -0.184 0.0895 -0.0948 0.184 0.0948
# 2 -0.851 -0.413 0.338 0.438 1.19 -0.438 0.751 -1.19 -0.751
# 3 -1.13 0.112 -1.97 1.24 -0.835 -1.24 -2.08 0.835 2.08
# 4 0.597 -2.89 -2.32 -3.49 -2.92 3.49 0.572 2.92 -0.572
# 5 -1.10 0.0953 0.996 1.19 2.09 -1.19 0.900 -2.09 -0.900
# 6 0.0191 0.500 1.17 0.481 1.15 -0.481 0.667 -1.15 -0.667
# 7 0.416 0.949 -0.865 0.533 -1.28 -0.533 -1.81 1.28 1.81
# 8 1.84 -1.66 -1.39 -3.50 -3.23 3.50 0.267 3.23 -0.267
# 9 0.406 -1.48 -1.33 -1.89 -1.74 1.89 0.149 1.74 -0.149
# 10 0.393 -0.491 -0.139 -0.884 -0.532 0.884 0.352 0.532 -0.352
Related
I have the following data
sim_model = "
x =~ 1.3*x1 + 1.2*x2 + 1.1*x3 + 1.2*x4
y =~ 1*y1 + 1.2*y2 + 1.3*y3 + 0.9*y4
y ~ 0.6*x
"
sim_data = lavaan::simulateData(sim_model)
model = "
x =~ x1 + x2 + x3 + x4
y =~ y1 + y2 + y3 + y4
y ~ x
"
sd_d = data.frame(sd_d = apply(sim_data, 2, sd))
I need to multiply each column of the sim_data with its corresponding standard deviation which is in sd_d.
Any help?
Perhaps this does work? This adds columns to the end of the data frame containing your requested values.
sim_data |>
mutate(SDxX1 = sd(x1)*x1,
SDxX2 = sd(x2)*x2,
SDxX3 = sd(x3)*x3,
SDxX4 = sd(x4)*x4,
SDxY1 = sd(y1)*y1,
SDxY2 = sd(y2)*y2,
SDxY3 = sd(y3)*y3,
SDxY4 = sd(y4)*y4
)
How about this:
library(dplyr)
sim_model = "
x =~ 1.3*x1 + 1.2*x2 + 1.1*x3 + 1.2*x4
y =~ 1*y1 + 1.2*y2 + 1.3*y3 + 0.9*y4
y ~ 0.6*x
"
sim_data = lavaan::simulateData(sim_model)
model = "
x =~ x1 + x2 + x3 + x4
y =~ y1 + y2 + y3 + y4
y ~ x
"
sd_d = data.frame(sd_d = apply(sim_data, 2, sd))
new_sim_data <- sapply(1:ncol(sim_data), function(i)
sim_data[[i]]*sd_d$sd_d[i])
head(new_sim_data)
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 3.9817268 4.4529682 1.7882380 2.621278 -0.3147092 0.7048940
#> [2,] -0.8972617 -0.4562149 -0.1165654 1.318948 0.4359371 0.4220787
#> [3,] 1.9188604 0.9183960 5.3265835 4.025215 1.6147254 1.8146241
#> [4,] -4.2811180 -0.4473838 -1.4982330 -1.325111 -2.5972828 -0.7700888
#> [5,] -2.8633480 2.4930664 1.9927546 -1.186898 3.8177569 4.4855348
#> [6,] 1.0197316 0.7887374 2.2055450 2.039363 2.8806220 9.2947559
#> [,7] [,8]
#> [1,] 2.2355215 -1.3586282
#> [2,] -1.0632624 -0.6658058
#> [3,] 0.1758628 0.1879555
#> [4,] -3.0958775 2.8376086
#> [5,] 4.5647521 3.5110156
#> [6,] 7.0123519 1.3295521
Created on 2022-11-26 by the reprex package (v2.0.1)
Here are a couple of tidyverse solutions. It would be easiest to just calculate everything you need from within the dataframe. If you really need to use sd_d then it is possible, but a little more convoluted:
library(tidyverse)
#easiest solution
sim_data |>
mutate(across(x1:y4, list(sd = \(x) sd(x)*x)))
#> # A tibble: 500 x 16
#> x1 x2 x3 x4 y1 y2 y3 y4 x1_sd x2_sd x3_sd
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.926 0.468 0.476 -0.252 -2.53 -0.374 0.690 0.354 -1.47 0.728 0.697
#> 2 -0.813 -0.782 -0.820 0.300 0.978 1.48 -0.566 1.45 -1.29 -1.22 -1.20
#> 3 -3.28 -2.18 -2.48 -1.12 -0.235 -0.562 0.919 1.78 -5.20 -3.39 -3.63
#> 4 1.18 2.49 2.52 2.46 2.97 3.31 0.881 -1.10 1.87 3.87 3.69
#> 5 1.33 0.328 1.83 -2.04 0.204 1.40 1.94 1.77 2.10 0.511 2.68
#> 6 -0.388 -3.12 -1.03 0.137 -0.333 -0.167 0.482 -0.417 -0.615 -4.85 -1.50
#> 7 -1.12 -1.83 0.143 -1.75 0.169 -1.51 -2.40 -0.793 -1.77 -2.84 0.209
#> 8 -3.04 -0.899 -2.77 -0.586 -1.98 -1.13 -2.97 -1.64 -4.81 -1.40 -4.06
#> 9 -0.856 1.46 1.37 -0.617 1.45 -0.149 -0.169 0.842 -1.36 2.26 2.00
#> 10 0.492 0.506 0.616 -1.23 -0.841 0.132 -0.528 -1.51 0.779 0.786 0.902
#> # ... with 490 more rows, and 5 more variables: x4_sd <dbl>, y1_sd <dbl>,
#> # y2_sd <dbl>, y3_sd <dbl>, y4_sd <dbl>
#less easy solution
map_dfc(colnames(sim_data), \(col){
tibble(!!sym(col) := pull(sim_data, {{col}}),
!!sym(paste0(col, "_sd")) := pull(sim_data, {{col}}) * sd_d[rownames(sd_d) == col,])
}) |>
select(colnames(sim_data), contains("_sd"))
#> # A tibble: 500 x 16
#> x1 x2 x3 x4 y1 y2 y3 y4 x1_sd x2_sd x3_sd
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.926 0.468 0.476 -0.252 -2.53 -0.374 0.690 0.354 -1.47 0.728 0.697
#> 2 -0.813 -0.782 -0.820 0.300 0.978 1.48 -0.566 1.45 -1.29 -1.22 -1.20
#> 3 -3.28 -2.18 -2.48 -1.12 -0.235 -0.562 0.919 1.78 -5.20 -3.39 -3.63
#> 4 1.18 2.49 2.52 2.46 2.97 3.31 0.881 -1.10 1.87 3.87 3.69
#> 5 1.33 0.328 1.83 -2.04 0.204 1.40 1.94 1.77 2.10 0.511 2.68
#> 6 -0.388 -3.12 -1.03 0.137 -0.333 -0.167 0.482 -0.417 -0.615 -4.85 -1.50
#> 7 -1.12 -1.83 0.143 -1.75 0.169 -1.51 -2.40 -0.793 -1.77 -2.84 0.209
#> 8 -3.04 -0.899 -2.77 -0.586 -1.98 -1.13 -2.97 -1.64 -4.81 -1.40 -4.06
#> 9 -0.856 1.46 1.37 -0.617 1.45 -0.149 -0.169 0.842 -1.36 2.26 2.00
#> 10 0.492 0.506 0.616 -1.23 -0.841 0.132 -0.528 -1.51 0.779 0.786 0.902
#> # ... with 490 more rows, and 5 more variables: x4_sd <dbl>, y1_sd <dbl>,
#> # y2_sd <dbl>, y3_sd <dbl>, y4_sd <dbl>
I am trying to read formula from textfile and execute. This will work.
writeLines(con = "/tmp/test.txt",
text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a")
set.seed(1)
df<-letters[1:4] %>% set_names() %>% map_df(~rnorm(10))
read formula from text file, mutate
readLines(con = '/tmp/test.txt') %>%
set_names(.,str_trim(sub("(.*)=.*","\\1",.),"both")) %>%
map(~eval(parse(text=.x),df)) %>%
bind_cols(df,.)
# A tibble: 10 x 8
a b c d new_cols_e new_cols_f new_cols_g new_cols_h
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.951 -0.259 0.398 -0.390 0.139 7.24 NaN -0.246
2 -0.389 0.394 -0.408 0.376 -0.0131 1.66 -0.930 -0.154
3 -0.284 -0.852 1.32 0.244 0.472 -1.56 NaN 0.242
4 0.857 2.65 -0.701 -1.43 1.95 -6.19 0.974 2.27
5 1.72 0.156 -0.581 1.78 -0.425 -11.4 -1.86 0.268
6 0.270 1.13 -1.00 0.134 0.129 -2.11 0.122 0.305
7 -0.422 -2.29 -0.668 0.766 -2.96 4.65 NaN 0.966
8 -1.19 0.741 0.945 0.955 1.69 -4.12 -0.300 -0.881
9 -0.331 -1.32 0.434 -0.0506 -0.883 -5.38 NaN 0.436
10 -0.940 0.920 1.01 -0.306 1.92 -1.72 -0.0836 -0.864
but this will not work, because new_cols_g is not recognized
writeLines(con = "/tmp/test.txt",
text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a
new_cols_i = new_cols_g - b")
What I want to do is...
a b c d new_cols_e new_cols_f new_cols_g new_cols_h new_cols_i
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.951 -0.259 0.398 -0.390 0.139 7.24 NaN -0.246 NaN
2 -0.389 0.394 -0.408 0.376 -0.0131 1.66 -0.930 -0.154 -1.32
3 -0.284 -0.852 1.32 0.244 0.472 -1.56 NaN 0.242 NaN
4 0.857 2.65 -0.701 -1.43 1.95 -6.19 0.974 2.27 -1.67
5 1.72 0.156 -0.581 1.78 -0.425 -11.4 -1.86 0.268 -2.01
6 0.270 1.13 -1.00 0.134 0.129 -2.11 0.122 0.305 -1.01
7 -0.422 -2.29 -0.668 0.766 -2.96 4.65 NaN 0.966 NaN
8 -1.19 0.741 0.945 0.955 1.69 -4.12 -0.300 -0.881 -1.04
9 -0.331 -1.32 0.434 -0.0506 -0.883 -5.38 NaN 0.436 NaN
10 -0.940 0.920 1.01 -0.306 1.92 -1.72 -0.0836 -0.864 -1.00
I hope my question is clear and feasable. Thank you a lot for your help !
It is usually not advised to evaluate code as string. For your case here is a way you could do it.
library(dplyr)
readLines(con = '/tmp/test.txt') %>%
paste0(collapse = ',') %>%
sprintf('df %%>%% mutate(%s)', .) -> string
eval(parse(text=string))
# a b c d new_cols_e new_cols_f new_cols_g new_cols_h new_cols_i
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 -0.626 1.51 0.919 1.36 2.43 -1.86 0.413 -0.947 -1.10
# 2 0.184 0.390 0.782 -0.103 1.17 1.34 -0.942 0.0716 -1.33
# 3 -0.836 -0.621 0.0746 0.388 -0.547 -43.9 NaN 0.519 NaN
# 4 1.60 -2.21 -1.99 -0.0538 -4.20 -1.35 NaN -3.53 NaN
# 5 0.330 1.12 0.620 -1.38 1.74 4.86 0.118 0.371 -1.01
# 6 -0.820 -0.0449 -0.0561 -0.415 -0.101 47.1 NaN 0.0369 NaN
# 7 0.487 -0.0162 -0.156 -0.394 -0.172 -9.33 NaN -0.00789 NaN
# 8 0.738 0.944 -1.47 -0.0593 -0.527 -2.16 -0.0578 0.697 -1.00
# 9 0.576 0.821 -0.478 1.10 0.343 -6.60 -0.197 0.473 -1.02
#10 -0.305 0.594 0.418 0.763 1.01 -1.64 -0.521 -0.181 -1.11
data
writeLines(con = "/tmp/test.txt",
text = "new_cols_e = b + c
new_cols_f = (a*pi +b)/c - d
new_cols_g = log(b)
new_cols_h = b * a
new_cols_i = new_cols_g - b")
I am working on a project which is updated every few years. We then get new data for every quarter. Usually a table, in which columns 1:5 are IDs and discriptive information necessary for further analysis. And then there are consecutive pairs of columns (e.g. 6:7, 8:9, ...) which contain the updated data (variables) for the specific quarter. So far so good.
The problem is that given the unregular update interval, the input data is quite different. Sometimes I get CSVs for one quarter, sometimes for a number of quarters, and sometimes the date is even in SPSS format (sav). I managed to automatically join all past and current files, when I have the input data in the following format: columns 1:5 descriptive data, columns 6:7 updated data.
So what's left to do is to import data from CSVs or SAVs that cointain more than one quarter, split them into seperate dataframes in the requested format (columns 1:5 descriptive data, columns 6:7 new variables), and export the new dataframe as CSV to a specific location - if possible even automatically naming them.
While this is fairly easy to do manually, I could not come up with an automated solution, as there are so many variable factors. Manually I would use the dplyr package:
tops_2007_1 <- tops_2007_1_2015_2 %>% select(1:5, 6:7) #and so forth
tops_2007_2 <- tops_2007_1_2015_2 %>% select(1:5, 8:9)
...
write_excel_csv2(tops_2007_1, "Data/Tops/tops_2007_1.csv") # and so forth
write_excel_csv2(tops_2007_2, "Data/Tops/tops_2007_2.csv")
...
Is there a way to automate this? Information on the new dataframe names would be in the column headers (e.g. 20071_X, 20071_Y). I got plenty of hints towards for loops and apply functions, but failed to put it together. I would appreciate any hints that could get me started on this!
Thanks!
Using the same random data generated by #Limey, you can do:
library(tidyverse)
library(purrr)
Generate some random data
df <- tibble(
id1=rnorm(10),id2=rnorm(10),id3=rnorm(10),id4=rnorm(10),id5=rnorm(10),
var1x=rnorm(10),var1y=rnorm(10),var2x=rnorm(10),var2y=rnorm(10),var3x=rnorm(10),
var3y=rnorm(10),var4x=rnorm(10),var4y=rnorm(10),var5x=rnorm(10),var5y=rnorm(10)
)
spits <- split(names(df)[6:length(names(df))], ceiling(seq_along(names(df)[6:length(names(df))])/2))
pmap(list(spits,1:length(spits)), ~df%>%select(1:5,all_of(..1))%>%write_excel_csv2(.,paste0('tops_2007_',as.character(..2),'.csv')))
Which outputs:
$`1`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var1x var1y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.618 0.642
2 -1.07 -1.29 1.03 0.222 -0.266 -0.824 0.862
3 0.872 -1.62 0.225 1.31 -0.737 0.885 -0.569
4 -1.85 0.653 0.279 -1.28 -1.04 0.0762 0.769
5 0.913 -2.27 0.0821 1.10 1.04 1.50 0.791
6 1.36 0.203 -0.310 1.43 1.43 -0.489 0.259
7 -0.961 1.10 0.642 -1.42 2.01 -1.79 -0.732
8 0.861 -0.831 1.29 0.684 -0.124 0.0549 0.755
9 1.20 1.43 1.20 -0.287 -1.35 -1.48 0.237
10 -0.645 1.01 0.411 0.383 -0.413 1.75 0.386
$`2`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var2x var2y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 -0.602 -0.938
2 -1.07 -1.29 1.03 0.222 -0.266 1.47 0.122
3 0.872 -1.62 0.225 1.31 -0.737 -0.600 0.584
4 -1.85 0.653 0.279 -1.28 -1.04 -0.835 -0.918
5 0.913 -2.27 0.0821 1.10 1.04 -0.322 0.623
6 1.36 0.203 -0.310 1.43 1.43 0.333 1.22
7 -0.961 1.10 0.642 -1.42 2.01 -0.287 -0.471
8 0.861 -0.831 1.29 0.684 -0.124 0.0853 -1.34
9 1.20 1.43 1.20 -0.287 -1.35 0.800 -0.528
10 -0.645 1.01 0.411 0.383 -0.413 0.982 1.76
$`3`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var3x var3y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 -0.00896 0.320
2 -1.07 -1.29 1.03 0.222 -0.266 -0.321 0.213
3 0.872 -1.62 0.225 1.31 -0.737 0.785 1.81
4 -1.85 0.653 0.279 -1.28 -1.04 0.965 0.695
5 0.913 -2.27 0.0821 1.10 1.04 1.18 1.22
6 1.36 0.203 -0.310 1.43 1.43 -1.64 0.804
7 -0.961 1.10 0.642 -1.42 2.01 0.648 1.25
8 0.861 -0.831 1.29 0.684 -0.124 -0.933 -0.483
9 1.20 1.43 1.20 -0.287 -1.35 1.56 0.710
10 -0.645 1.01 0.411 0.383 -0.413 -2.24 0.964
$`4`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var4x var4y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.172 -0.901
2 -1.07 -1.29 1.03 0.222 -0.266 -0.336 -0.720
3 0.872 -1.62 0.225 1.31 -0.737 2.18 -0.557
4 -1.85 0.653 0.279 -1.28 -1.04 1.18 1.30
5 0.913 -2.27 0.0821 1.10 1.04 0.0288 0.369
6 1.36 0.203 -0.310 1.43 1.43 -1.34 0.506
7 -0.961 1.10 0.642 -1.42 2.01 0.193 0.0254
8 0.861 -0.831 1.29 0.684 -0.124 0.209 0.168
9 1.20 1.43 1.20 -0.287 -1.35 0.181 0.658
10 -0.645 1.01 0.411 0.383 -0.413 1.40 -0.360
$`5`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var5x var5y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.0203 1.14
2 -1.07 -1.29 1.03 0.222 -0.266 -1.22 0.339
3 0.872 -1.62 0.225 1.31 -0.737 -0.423 -0.291
4 -1.85 0.653 0.279 -1.28 -1.04 0.815 -0.428
5 0.913 -2.27 0.0821 1.10 1.04 -0.00963 -0.690
6 1.36 0.203 -0.310 1.43 1.43 0.141 0.451
7 -0.961 1.10 0.642 -1.42 2.01 -1.77 1.60
8 0.861 -0.831 1.29 0.684 -0.124 0.370 0.438
9 1.20 1.43 1.20 -0.287 -1.35 -0.582 -2.92
10 -0.645 1.01 0.411 0.383 -0.413 1.33 -0.221
and creates the following files in your directory:
Something like:
library(tidyverse)
Generate some random data
df <- tibble(
id1=rnorm(10),id2=rnorm(10),id3=rnorm(10),id4=rnorm(10),id5=rnorm(10),
var1x=rnorm(10),var1y=rnorm(10),var2x=rnorm(10),var2y=rnorm(10),var3x=rnorm(10),
var3y=rnorm(10),var4x=rnorm(10),var4y=rnorm(10),var5x=rnorm(10),var5y=rnorm(10)
)
ans <- bind_rows(
lapply(
seq(6, ncol(df), 2),
function(x) {
df %>%
select(1:5, c(x, x+1)) %>%
rename(x=6, y=7) %>%
add_column(var=floor((x-5)/2)+1)
}
)
)
ans
Giving
# A tibble: 50 x 8
id1 id2 id3 id4 id5 x y var
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.816 0.315 -0.0932 0.232 -0.950 -1.17 -0.157 1
2 -0.0704 1.15 1.67 2.85 -2.61 -0.190 -1.60 1
3 -1.94 -0.343 0.873 -1.13 -0.638 0.802 -0.254 1
4 -1.19 0.247 0.0939 0.337 1.99 -0.806 1.43 1
5 0.100 -1.61 -1.39 -0.253 -1.53 -1.99 -1.04 1
6 -0.106 -0.244 -0.233 2.49 1.60 0.229 0.919 1
7 -0.979 -0.648 0.432 1.22 -1.16 -0.754 0.903 1
8 0.284 -0.220 -0.735 -0.804 0.0707 -0.714 -1.87 1
9 -0.0801 -1.53 -0.819 0.952 0.852 1.27 -0.702 1
10 -0.579 -0.0120 0.611 -0.621 -2.13 -0.812 1.43 1
# … with 40 more rows
How does it work?
seq(6, ncol(df), 2)
Defines the first of each pair of columns you wish to extract, starting with the 6th (the first after your first 5 id variables) and stepping through the columns in steps of two.
df %>%
select(1:5, c(x, x+1))
Selects the id columns and the current pair of data columns
rename(x=6, y=7) %>%
add_column(var=floor((x-5)/2)+1)
Renames the two data columns to x and y and adds a new id column identifying which pair of data columns are being handled. You will need to adapt this step to reflect your actual data.
lapply(...)
Performs the above steps on all pairs of data columns in the current data frame and returns the results as a list
bind_rows(...)
Converts the list of data frames to a single one.
Some comments:
Using column indexes to identify columns is fragile. I wouldn't do it. I'd use the column names themselves.
If your data column names are, for example, 20071_X then your data are not tidy because the column names contain information (here, the year and the quarter). This will likely cause you pain downstream. I'd recommend putting this right as soon as you can. By modifying my derivation of var, you can tidy your data as you extract it.
Please see this post for advice on how to write a good minimal working example.
A data frame have contains three variables:
from - character - the name of a measure
to - character - the name of another measure
covariance - numeric - the covariance between the two measures
Here's a link to the data. Below is the result of head(have):
from to covariance
a_airportscreener a_airportscreener 4.419285714
a_airportscreener e_airportscreener -1.328928571
a_airportscreener g_airportscreener -3.038928571
a_airportscreener p_airportscreener 0.3292857143
a_airportscreener pres_airportscreener 0.6452857143
a_automechanic a_automechanic 2.635535714
a_automechanic e_automechanic -0.3439285714
I want to create a data frame called need that records the covariances between prefixed versions of the same job title in separate columns. For example, the first row would look like:
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g p_p p_pres pres_a pres_e pres_g pres_p pres_pres
airportscreener 4.419 -1.329 -3.039 0.329 0.645 -1.329 2.333 2.441 -1.015 0.659 -3.039 2.441 14.253 3.070 0.977 0.329 -1.015 3.070 6.505 0.366 0.645 0.659 0.977 0.366 0.697
(I rounded the values in have to keep the example of need on the page, but this is not part of the question.)
Try this approach on your complete data
library(tidyverse)
cov_mat %>%
rownames_to_column() %>%
pivot_longer(cols =-rowname) %>%
mutate(key = paste0(sub("_.*", "\\1", name), "_", sub("_.*", "\\1", rowname)),
rowname = sub(".*_(.*)_.*", "\\1", rowname),
name = sub(".*_(.*)_.*", "\\1", name)) %>%
filter(rowname == name) %>%
select(-rowname) %>%
pivot_wider(names_from = key, values_from = value)
# A tibble: 58 x 26
# name a_a e_a g_a p_a pres_a a_e e_e g_e .....
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44
# 2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113
# 3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95
# 4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58
# 5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69
# 6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09
# 7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54
# 8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15
# 9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00
#10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11
This is not as elegant as #Ronak Shah's answer, but I had been working on something similar, and thought it might be worth sharing for someone out there. It also uses pivot_longer and pivot_wider in latest tidyr.
library(readxl)
library(tidyr)
library(dplyr)
df <- read_excel("cov_data.xlsx")
need <- df %>%
separate(from, into = c('from1', 'job'), sep = '_') %>%
separate(to, into = 'to1', extra = 'drop', sep = '_') %>%
unite(comb1, from1, to1, remove = F) %>%
unite(comb2, to1, from1, remove = T) %>%
pivot_longer(c(comb1, comb2)) %>%
dplyr::select(-name) %>%
distinct() %>%
pivot_wider(names_from = value, values_from = covariance) %>%
dplyr::select(job, order(colnames(.)))
# A tibble: 58 x 26
job a_a a_e a_g a_p a_pres e_a e_e e_g e_p e_pres g_a g_e g_g g_p g_pres p_a p_e p_g
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 airp… 4.42 -1.33 -3.04 0.329 0.645 -1.33 2.33 2.44 -1.02 0.659 -3.04 2.44 14.3 3.07 0.977 0.329 -1.02 3.07
2 auto… 2.64 -0.344 6.26 -0.712 -0.595 -0.344 0.499 0.113 0.891 0.321 6.26 0.113 203. 5.16 0.645 -0.712 0.891 5.16
3 auto… 2.67 -0.466 2.36 -0.106 -0.878 -0.466 0.72 -5.95 0.431 0.194 2.36 -5.95 252. 4.65 -4.64 -0.106 0.431 4.65
4 blkj… 2.50 0.529 -6.79 0.0129 -0.0666 0.529 1.56 -8.58 -0.703 0.384 -6.79 -8.58 247. 2.11 1.68 0.0129 -0.703 2.11
5 blkt… 1.04 -0.00143 4.86 0.993 -0.194 -0.00143 0.229 -1.69 0.276 -0.0351 4.86 -1.69 260. 14.3 2.44 0.993 0.276 14.3
6 brid… 4.15 2.05 -11.5 -1.21 0.453 2.05 2.05 -9.09 -0.342 0.576 -11.5 -9.09 326. -2.07 0.992 -1.21 -0.342 -2.07
7 cart… 1.79 0.458 -4.22 0.451 -0.410 0.458 1.23 3.54 0.43 -0.0674 -4.22 3.54 478. 10.5 -1.21 0.451 0.43 10.5
8 chem… 2.29 0.479 12.4 -0.0384 -0.164 0.479 0.811 2.15 0.784 0.0469 12.4 2.15 238. 2.58 -2.05 -0.0384 0.784 2.58
9 clth… 4.10 1.15 -18.9 1.77 0.728 1.15 1.7 -4.00 1.65 0.133 -18.9 -4.00 193. -17.1 -6.81 1.77 1.65 -17.1
10 coag… 2.23 -0.382 -7.79 -0.0190 0.460 -0.382 0.342 4.11 0.161 0.0398 -7.79 4.11 444. 1.96 -7.55 -0.0190 0.161 1.96
Let's say I have a dataset with 10 numerical values and I want to normalize (apply any function in fact) each of these columns, so for column A, I´ll want to normalize and get "A normalized" in column ANorm. Same for the rest.
How can I program that in R?
I tried a for loop and lapply, which I think was the right approach but I couldn´t reach a final result.
Any help or direction is much appreciated!
This is perfect for a tidyverse solution
library(tidyverse)
set.seed(123)
matrix(runif(50),ncol=5) %>%
as_tibble() %>%
mutate_at(vars(V1:V5), funs(norm = ./mean(.)))
# A tibble: 10 x 10
V1 V2 V3 V4 V5 V1_norm V2_norm V3_norm V4_norm V5_norm
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.288 0.957 0.890 0.963 0.143 0.497 1.83 1.45 1.79 0.413
2 0.788 0.453 0.693 0.902 0.415 1.36 0.866 1.13 1.68 1.20
3 0.409 0.678 0.641 0.691 0.414 0.707 1.29 1.04 1.28 1.20
4 0.883 0.573 0.994 0.795 0.369 1.53 1.09 1.62 1.48 1.07
5 0.940 0.103 0.656 0.0246 0.152 1.63 0.197 1.07 0.0458 0.441
6 0.0456 0.900 0.709 0.478 0.139 0.0788 1.72 1.15 0.888 0.402
7 0.528 0.246 0.544 0.758 0.233 0.913 0.470 0.884 1.41 0.675
8 0.892 0.0421 0.594 0.216 0.466 1.54 0.0804 0.965 0.402 1.35
9 0.551 0.328 0.289 0.318 0.266 0.954 0.627 0.470 0.592 0.770
10 0.457 0.955 0.147 0.232 0.858 0.790 1.82 0.239 0.431 2.48