Get element number of list while iterating through it - r

I have a list of the following structure,
myList <- replicate(5, data.frame(id = 1:10, mean = runif(10)), simplify =F)
and I want to reduce it with a merge
myList %>% reduce(function(x, y) merge(x, y, by = 'id'))
That, however, leads to the following colnames:
id mean.x mean.y mean.x mean.y mean
While I would like something like
id mean1 mean2 mean3 mean4 mean5
Where the numbers are based on the order of myList.
Obviously I could iterate over 1:length(myList), but I find this solution unelegant. Other option would be to introduce a check in the reducing function, but that would indue a new linear time search for each element of the list, so I don't believe it to be very efficient.
Is there another way to achieve this?

New answer:
Using rbindlist and dcast from the data.table-package:
library(data.table)
mydata <- rbindlist(myList, idcol = 'df')
dcast(mydata, id ~ paste0('mean',df), value.var = 'mean')
Or with the tidyverse packages:
library(dplyr)
library(tidyr)
myList %>%
bind_rows(., .id = 'df') %>%
spread(df, mean) %>%
rename_at(-1, funs(paste0('mean',.)))
which both give (data.table-output is shown):
id mean1 mean2 mean3 mean4 mean5
1: 1 0.6937674 0.005642891 0.4155868 0.74184186 0.54513885
2: 2 0.3602352 0.569412043 0.8018570 0.29177043 0.34521060
3: 3 0.6353133 0.512876032 0.8711914 0.44660086 0.16338451
4: 4 0.2106574 0.555638598 0.8240744 0.37495213 0.57443740
5: 5 0.9530160 0.059930577 0.0930678 0.39862717 0.91568414
6: 6 0.3723244 0.598526326 0.4970844 0.01978011 0.07832631
7: 7 0.2923137 0.712971846 0.3805590 0.25676592 0.11682605
8: 8 0.6208868 0.426853621 0.5533876 0.64054247 0.78949419
9: 9 0.9032609 0.274705843 0.3525957 0.46994429 0.32883110
10: 10 0.9707088 0.351394642 0.1089803 0.97969335 0.77791085
When there are duplicates in id in one or more of the dataframes in myList, you have to adapt the dcast-step to dcast(mydata, id + rowid(id,df) ~ paste0('mean',df), value.var = 'mean') to get the correct outcome. Check the following example to see the result:
myList <- replicate(5, data.frame(id = sample(1:10, 10, TRUE), mean = runif(10)), simplify = FALSE)
mydata <- rbindlist(myList, idcol = 'df')
dcast(mydata, id + rowid(id,df) ~ paste0('mean',df), value.var = 'mean')
This also works when there are no duplicates in id.
The tidyverse-code has then to be adapted to:
myList %>%
bind_rows(., .id = 'df') %>%
group_by(df, id) %>%
mutate(ri = row_number()) %>%
ungroup() %>%
spread(df, mean) %>%
rename_at(3:7, funs(paste0('mean',.)))
Old answer (still valid):
A possible solution:
# option 1
myList <- mapply(function(x,y) {names(x)[2] = paste0('mean',y); x}, myList, 1:length(myList), SIMPLIFY = FALSE)
Reduce(function(x, y) merge(x, y, by = 'id'), myList)
# option 2 (quite similar to #zx8754's solution)
mydata <- Reduce(function(x, y) merge(x, y, by = 'id'), myList)
setNames(mydata, c('id', paste0('mean', seq_along(myList))))
which gives:
id mean1 mean2 mean3 mean4 mean5
1 1 0.1119114 0.4193226 0.86619590 0.52543072 0.52879193
2 2 0.4630863 0.8786721 0.02012432 0.77274088 0.09227344
3 3 0.9832522 0.4687838 0.49074271 0.01611625 0.69919423
4 4 0.7017467 0.7845002 0.44692958 0.64485570 0.40808345
5 5 0.6204856 0.1687563 0.54407165 0.54236973 0.09947167
6 6 0.1480965 0.7654041 0.43591864 0.22468554 0.84557988
7 7 0.0179509 0.3610114 0.45420122 0.20612154 0.76899342
8 8 0.9862083 0.5579173 0.13540519 0.97311401 0.13947602
9 9 0.3140737 0.2213044 0.05187671 0.07870425 0.23880332
10 10 0.4515313 0.2367271 0.65728768 0.22149073 0.90578043

You can also try to modify the function in the Reduce (or reduce) call to make the adding of indices automatic :
Reduce(function(x, y){
# get indices of columns that are not the common one, in x and y
col_noby_x <- which(colnames(x) != "id")
col_noby_y <- which(colnames(y) != "id")
# maximum of indices in x (at the end of the column names)
ind_x <- max(as.numeric(sub(".+(\\d+)$", "\\1", colnames(x)[col_noby_x])))
# if there is no indice yet, put 1 and 2, else modify names only in y, taking the max value of indices in x plus one.
if(!is.na(ind_x)) colnames(y)[col_noby_y] <- paste0(colnames(y)[col_noby_y], ind_x +1) else {colnames(x)[col_noby_x] <- paste0(colnames(x)[col_noby_x], 1); colnames(y)[col_noby_y] <- paste0(colnames(y)[col_noby_y], 2)}
# finally merge
merge(x, y, by="id")}, myList)
# id mean1 mean2 mean3 mean4 mean5
#1 1 0.10698388 0.0277198 0.5109345 0.8885772 0.79983437
#2 2 0.29750846 0.7951743 0.9558739 0.9691619 0.31805857
#3 3 0.07115142 0.2401011 0.8106464 0.5101563 0.78697618
#4 4 0.39564336 0.7225532 0.7583893 0.4275574 0.77151883
#5 5 0.55860511 0.4111913 0.8403031 0.4284490 0.51489116
#6 6 0.92191777 0.9142926 0.4708712 0.2451099 0.84142501
#7 7 0.08218166 0.2741819 0.6772842 0.7939364 0.86930336
#8 8 0.35392512 0.2088531 0.0801731 0.2734870 0.62963218
#9 9 0.64068537 0.8427225 0.1904426 0.2389339 0.73145206
#10 10 0.31304719 0.9898133 0.8173664 0.2013031 0.04658273

Merge with Reduce, then update column names:
res <- Reduce(function(...) merge(..., all = TRUE, by = "id"), myList)
colnames(res)[2:ncol(res)] <- paste0("mean", 1:length(myList))

We can use set_names
library(tidyverse)
myList %>%
reduce(merge, by = 'id') %>%
set_names(c("id", paste0("mean", 1:5)))
# id mean1 mean2 mean3 mean4 mean5
#1 1 0.07122593 0.480300675 0.34944190 0.48718226 0.9118796
#2 2 0.18375430 0.850652470 0.24780063 0.45148232 0.2587470
#3 3 0.18617054 0.526188340 0.48716956 0.53354343 0.9057241
#4 4 0.87838756 0.811985522 0.49024819 0.10412944 0.7830501
#5 5 0.29287646 0.974811919 0.31413846 0.01508965 0.4587954
#6 6 0.62304018 0.004421152 0.81053625 0.80032467 0.7630185
#7 7 0.78445890 0.006362844 0.73643248 0.15952795 0.4386658
#8 8 0.71568076 0.081139996 0.36933728 0.31771823 0.2794372
#9 9 0.25523328 0.081603285 0.00298272 0.33698950 0.2413859
#10 10 0.86274552 0.432177738 0.26064580 0.75639537 0.3125151

Here are two one liners
Using purrr:reduce2 and dplyr::inner_join in place of merge:
library(dplyr)
library(purrr)
myList %>% reduce2(map(2:length(.),~c("",.x)), inner_join, by = 'id',copy=F)
# id mean mean2 mean3 mean4 mean5
# 1 1 0.44560715 0.4575765 0.6075921 0.06504922 0.90410342
# 2 2 0.60606716 0.5004711 0.7866959 0.89632285 0.09890028
# 3 3 0.59928281 0.4894146 0.4495071 0.66090212 0.56046997
# 4 4 0.55630819 0.4166869 0.1984523 0.08040737 0.18375885
# 5 5 0.97714203 0.1223497 0.7923596 0.53054508 0.93747149
# 6 6 0.07751312 0.6217220 0.3861749 0.30062805 0.03177210
# 7 7 0.22839323 0.3994350 0.6382234 0.98578452 0.27032222
# 8 8 0.73628572 0.8804618 0.8240999 0.44205508 0.73901477
# 9 9 0.81894510 0.2186181 0.9317510 0.60035660 0.65002083
# 10 10 0.26197059 0.5569660 0.9167330 0.58912675 0.81367176
Or using plyr::join_all and tibble::repair_names(same output):
myList %>% join_all('id','inner') %>% repair_names

Related

Split 600 columns into 2 new columns for each one, with old and new column names in vectors

I want to split 600 columns (listed in a vector) at a delimiter (in this case a /) into 2 new columns for each one (also listed as vectors). I've worked out basic logic for the split as shown below, but is there a better way than having hundreds of lines of code to get this done? Any ideas?
df1 <- data.frame(codes1 = c('H5394/6N938', '49J62/P82', '142X394/652876','1057C83/25394', '45N564/3558', '49405/A8174'),
codes2 = c('W34142/83X652', '5L622/482S3', '44N574/358866','1P47484/724A94', '454N64/3458', '49A05/28774'),
codes3 = c('19K4/139D54', '4T3962/78D43', '18V94/682876','P083/28394', '434/8558', '43405/2387N'))
originalvar <- c('codes1', 'codes2', 'codes3')
newleftvar <- c('leftcode_0hr', 'leftcode_2hr', 'leftcode_4hr')
newrightvar <- c('rightcode_0hr', 'rightcode_2hr', 'rightcode_4hr')
df1
codes1 codes2 codes3
1 H5394/6N938 W34142/83X652 19K4/139D54
2 49J62/P82 5L622/482S3 4T3962/78D43
3 142X394/652876 44N574/358866 18V94/682876
4 1057C83/25394 1P47484/724A94 P083/28394
5 45N564/3558 454N64/3458 434/8558
6 49405/A8174 49A05/28774 43405/2387N
#my lame approach - lol
df1$leftcode_0hr <- substr(df1$codes1, 1, stringr::str_locate(df1$codes1, "/")-1)
df1$rightcode_0hr <- substr(df1$codes1, stringr::str_locate(df1$codes1, "/")+1, nchar(df1$codes1))
df1$leftcode_2hr <- substr(df1$codes2, 1, stringr::str_locate(df1$codes2, "/")-1)
df1$rightcode_2hr <- substr(df1$codes2, stringr::str_locate(df1$codes2, "/")+1, nchar(df1$codes2))
df1$leftcode_4hr <- substr(df1$codes3, 1, stringr::str_locate(df1$codes3, "/")-1)
df1$rightcode_4hr <- substr(df1$codes3, stringr::str_locate(df1$codes3, "/")+1, nchar(df1$codes3))
df1
codes1 codes2 codes3 leftcode_0hr rightcode_0hr leftcode_2hr rightcode_2hr leftcode_4hr rightcode_4hr
1 H5394/6N938 W34142/83X652 19K4/139D54 H5394 6N938 W34142 83X652 19K4 139D54
2 49J62/P82 5L622/482S3 4T3962/78D43 49J62 P82 5L622 482S3 4T3962 78D43
3 142X394/652876 44N574/358866 18V94/682876 142X394 652876 44N574 358866 18V94 682876
4 1057C83/25394 1P47484/724A94 P083/28394 1057C83 25394 1P47484 724A94 P083 28394
5 45N564/3558 454N64/3458 434/8558 45N564 3558 454N64 3458 434 8558
6 49405/A8174 49A05/28774 43405/2387N 49405 A8174 49A05 28774 43405 2387N
You can do this:
separate(
df1 %>% pivot_longer(everything()),
value, into=c("left", "right"), sep = "/"
) %>%
pivot_wider(names_from="name", values_from = left:right, values_fn = list,names_sep = "") %>%
unnest(everything()) %>%
rename_with(~paste0(
str_extract(.x,"^(left|right)code"),
paste0("_",as.numeric(str_extract(.x,"\\d+"))*2-2,"hr")
)) %>%
relocate(unlist(lapply(1:ncol(df1),\(i) c(i,i+ncol(df1)))))
Output:
leftcode_0hr rightcode_0hr leftcode_2hr rightcode_2hr leftcode_4hr rightcode_4hr
<chr> <chr> <chr> <chr> <chr> <chr>
1 H5394 6N938 W34142 83X652 19K4 139D54
2 49J62 P82 5L622 482S3 4T3962 78D43
3 142X394 652876 44N574 358866 18V94 682876
4 1057C83 25394 1P47484 724A94 P083 28394
5 45N564 3558 454N64 3458 434 8558
6 49405 A8174 49A05 28774 43405 2387N
Here is another approach, which simply takes your initial idea, and wraps it in a function, which can then be applied to each column name of df1:
f <- function(code) {
suffix = paste0("_", as.numeric(str_extract(code,"\\d+"))*2-2,"hr")
setNames(
list(
substr(df1[[code]], 1, stringr::str_locate(df1[[code]], "/")-1),
substr(df1[[code]], stringr::str_locate(df1[[code]], "/")+1, nchar(df1[[code]]))
),paste0(c("leftcode", "rightcode"),suffix)
)
}
as.data.frame(lapply(colnames(df1), f))
Output:
leftcode_0hr rightcode_0hr leftcode_2hr rightcode_2hr leftcode_4hr rightcode_4hr
1 H5394 6N938 W34142 83X652 19K4 139D54
2 49J62 P82 5L622 482S3 4T3962 78D43
3 142X394 652876 44N574 358866 18V94 682876
4 1057C83 25394 1P47484 724A94 P083 28394
5 45N564 3558 454N64 3458 434 8558
6 49405 A8174 49A05 28774 43405 2387N
Finally, here is a simpler functional version
f <- function(d,n) {
setNames(
as.list(as.data.frame(str_split(d,"/",simplify = T))),
paste0(c("left","right"),"code_",2*(n-1),"hr")
)
}
as.data.frame(lapply(seq_along(df1),\(i) f(df1[[i]],i)))
And in base:
how_many <- seq(1, length(unlist(strsplit(df1$codes1, '/'))), 1)
df1$left_0hr <- unlist(strsplit(df1$codes1, '/'))[which((how_many %% 2 == 0) == FALSE)]
df1
codes1 codes2 codes3 left_0hr
1 H5394/6N938 W34142/83X652 19K4/139D54 H5394
2 49J62/P82 5L622/482S3 4T3962/78D43 49J62
3 142X394/652876 44N574/358866 18V94/682876 142X394
4 1057C83/25394 1P47484/724A94 P083/28394 1057C83
5 45N564/3558 454N64/3458 434/8558 45N564
6 49405/A8174 49A05/28774 43405/2387N 49405
and so on.

Make bootstrap function more efficient with lapply

I have a data frame with numeric columns and a character column with labels. See example:
library(tidyverse)
a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10", "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" , "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
transform(a = as.numeric(a)) %>%
transform(b = as.numeric(b)) %>%
transform(c = as.numeric(c))
I have written a function that takes a sample of rows for each label (number of rows in sample = number of rows for the specific label) and as output gives the average of the samples. Example: in the source data (dataframe) there are 3 rows of the label "fa05". Lets call them fa05_1, fa05_2, fa05_3 (just for explaining it). The function takes a sample of these three rows that each consist of 3 columns (a,b and c). The number of fa05 in the sample equals the number fa05 in the source data, so 3 in this case. The function takes a sample with replacement so it could fx be fa05_3, fa05_1, fa05_1. Then it takes the average of those three samples for each of the three columns a,b and c and gives the output. It looks like this:
samp <- function(df, col1, var){
df %>%
group_by(!!col1) %>%
nest() %>%
ungroup() %>%
mutate(n = !!var) %>%
mutate(samp = map2(data, n, sample_n, replace=T)) %>%
select(-data) %>%
unnest(samp) %>%
group_by(!!col1) %>%
dplyr::summarise(across("a":"c", mean))
}
list <- c(3,3,2,2,2,6,4,3) # The number of times each label occur in the data
samp(dataframe, quo(label), quo(list))
label a b c
<chr> <dbl> <dbl> <dbl>
1 fa05 0.00439 0.00416 0.00395
2 fa10 0.00894 0.00820 0.00752
3 fa20 0.00672 0.00634 0.00597
4 faflat 0.00908 0.00859 0.00811
5 sa05 0.0552 0.0395 0.0281
6 sa10 0.00715 0.00657 0.00603
7 sa20 0.0101 0.00956 0.00903
8 saflat 0.0250 0.0211 0.0177
I would like to use this function on some data and repeat it 1000 times efficiently. At first it was not a function and I used rerun() but that was very inefficient. I read that I could write it as a function and the use lapply which should be more efficient, but it does not work when I do like this:
lapply(dataframe, samp, col1=quo(Pattern), var=quo(list))
Error in UseMethod("group_by_") :
no applicable method for 'group_by_' applied to an object of class "c('double', 'numeric')"
How do I make this work with lapply? And how to I tell lapply to rerun the function 1000 times? I hope you can help.
You can just do this
replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)
However, this is really slow.
> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
user system elapsed
33.83 0.03 33.87
To make it faster, we need to rewrite your samp function. Here is a tidyverse approach
group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)
prep <- function(df, grp_var, sample_size) {
df %>%
mutate(size = sample_size[.data[[grp_var]]]) %>%
group_by(across(!!grp_var))
}
rep_sample <- function(df, n) {
replicate(
n,
df %>%
slice(sample.int(n(), size[[1L]], replace = TRUE)) %>%
summarise(across(a:c, mean), .groups = "drop"),
simplify = FALSE
)
}
dataframe %>%
prep("label", group_sample_size) %>%
rep_sample(1000)
Performance has improved significantly but is still suboptimal IMO. It takes about 5-6 seconds to finish the simulation.
> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
user system elapsed
5.80 0.01 5.81
For efficiency, I think the following data.table approach would be better.
library(data.table)
fsamp <- function(df, grp_var, size, nsim) {
df <- as.data.table(df)
group_info <- table(df[[grp_var]], dnn = list(grp_var))
simu_pool <- df[, -grp_var, with = FALSE]
simu_vars <- names(simu_pool)
simu_pool <- split(simu_pool, df[[grp_var]])
out <- data.table(
simu = rep(seq_len(nsim), each = length(group_info)),
group_info
)
out[
, size := size[out[[grp_var]]]
][
, (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
by = c("simu", grp_var)
][]
}
This one is about four times faster than the optimised tidyverse approach.
> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
user system elapsed
1.47 0.04 1.50
All three approaches produce the same set of results
> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
simu label N size a b c
1: 1 fa05 3 3 0.015383909 0.013350778 0.011561460
2: 1 fa10 3 3 0.763161377 0.371405971 0.160972865
3: 1 fa20 2 2 0.006717308 0.006340109 0.005970452
4: 1 faflat 2 2 0.009076331 0.008589699 0.008110854
5: 1 sa05 2 2 0.055184818 0.039548290 0.028102024
---
7996: 1000 faflat 2 2 0.007826754 0.007378527 0.006940039
7997: 1000 sa05 2 2 0.099406840 0.068956516 0.046847336
7998: 1000 sa10 6 6 0.006648513 0.006118159 0.005626362
7999: 1000 sa20 4 4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3 3 0.008411304 0.007960328 0.007516569
> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

How can i add more columns in dataframe by for loop

I am beginner of R. I need to transfer some Eviews code to R. There are some loop code to add 10 or more columns\variables with some function in data in Eviews.
Here are eviews example code to estimate deflator:
for %x exp con gov inv cap ex im
frml def_{%x} = gdp_{%x}/gdp_{%x}_r*100
next
I used dplyr package and use mutate function. But it is very hard to add many variables.
library(dplyr)
nominal_gdp<-rnorm(4)
nominal_inv<-rnorm(4)
nominal_gov<-rnorm(4)
nominal_exp<-rnorm(4)
real_gdp<-rnorm(4)
real_inv<-rnorm(4)
real_gov<-rnorm(4)
real_exp<-rnorm(4)
df<-data.frame(nominal_gdp,nominal_inv,
nominal_gov,nominal_exp,real_gdp,real_inv,real_gov,real_exp)
df<-df %>% mutate(deflator_gdp=nominal_gdp/real_gdp*100,
deflator_inv=nominal_inv/real_inv,
deflator_gov=nominal_gov/real_gov,
deflator_exp=nominal_exp/real_exp)
print(df)
Please help me to this in R by loop.
The answer is that your data is not as "tidy" as it could be.
This is what you have (with an added observation ID for clarity):
library(dplyr)
df <- data.frame(nominal_gdp = rnorm(4),
nominal_inv = rnorm(4),
nominal_gov = rnorm(4),
real_gdp = rnorm(4),
real_inv = rnorm(4),
real_gov = rnorm(4))
df <- df %>%
mutate(obs_id = 1:n()) %>%
select(obs_id, everything())
which gives:
obs_id nominal_gdp nominal_inv nominal_gov real_gdp real_inv real_gov
1 1 -0.9692060 -1.5223055 -0.26966202 0.49057546 2.3253066 0.8761837
2 2 1.2696927 1.2591910 0.04238958 -1.51398652 -0.7209661 0.3021453
3 3 0.8415725 -0.1728212 0.98846942 -0.58743294 -0.7256786 0.5649908
4 4 -0.8235101 1.0500614 -0.49308092 0.04820723 -2.0697008 1.2478635
Consider if you had instead, in df2:
obs_id variable real nominal
1 1 gdp 0.49057546 -0.96920602
2 2 gdp -1.51398652 1.26969267
3 3 gdp -0.58743294 0.84157254
4 4 gdp 0.04820723 -0.82351006
5 1 inv 2.32530662 -1.52230550
6 2 inv -0.72096614 1.25919100
7 3 inv -0.72567857 -0.17282123
8 4 inv -2.06970078 1.05006136
9 1 gov 0.87618366 -0.26966202
10 2 gov 0.30214534 0.04238958
11 3 gov 0.56499079 0.98846942
12 4 gov 1.24786355 -0.49308092
Then what you want to do is trivial:
df2 %>% mutate(deflator = real / nominal)
obs_id variable real nominal deflator
1 1 gdp 0.49057546 -0.96920602 -0.50616221
2 2 gdp -1.51398652 1.26969267 -1.19240392
3 3 gdp -0.58743294 0.84157254 -0.69801819
4 4 gdp 0.04820723 -0.82351006 -0.05853872
5 1 inv 2.32530662 -1.52230550 -1.52749012
6 2 inv -0.72096614 1.25919100 -0.57256297
7 3 inv -0.72567857 -0.17282123 4.19901294
8 4 inv -2.06970078 1.05006136 -1.97102841
9 1 gov 0.87618366 -0.26966202 -3.24919196
10 2 gov 0.30214534 0.04238958 7.12782060
11 3 gov 0.56499079 0.98846942 0.57158146
12 4 gov 1.24786355 -0.49308092 -2.53074800
So the question becomes: how do we get to the nice dplyr-compatible data.frame.
You need to gather your data using tidyr::gather. However, because you have 2 sets of variables to gather (the real and nominal values), it is not straightforward. I have done it in two steps, there may be a better way though.
real_vals <- df %>%
select(obs_id, starts_with("real")) %>%
# the line below is where the magic happens
tidyr::gather(variable, real, starts_with("real")) %>%
# extracting the variable name (by erasing up to the underscore)
mutate(variable = gsub(variable, pattern = ".*_", replacement = ""))
# Same thing for nominal values
nominal_vals <- df %>%
select(obs_id, starts_with("nominal")) %>%
tidyr::gather(variable, nominal, starts_with("nominal")) %>%
mutate(variable = gsub(variable, pattern = ".*_", replacement = ""))
# Merging them... Now we have something we can work with!
df2 <-
full_join(real_vals, nominal_vals, by = c("obs_id", "variable"))
Note the importance of the observation id when merging.
We can grep the matching names, and sort:
x <- colnames(df)
df[ sort(x[ (grepl("^nominal", x)) ]) ] /
df[ sort(x[ (grepl("^real", x)) ]) ] * 100
Similarly, if the columns were sorted, then we could just:
df[ 1:4 ] / df[ 5:8 ] * 100
We can loop over column names using purrr::map_dfc then apply a custom function over the selected columns (i.e. the columns that matched the current name from nms)
library(dplyr)
library(purrr)
#Replace anything before _ with empty string
nms <- unique(sub('.*_','',names(df)))
#Use map if you need the ouptut as a list not a dataframe
map_dfc(nms, ~deflator_fun(df, .x))
Custom function
deflator_fun <- function(df, x){
#browser()
nx <- paste0('nominal_',x)
rx <- paste0('real_',x)
select(df, matches(x)) %>%
mutate(!!paste0('deflator_',quo_name(x)) := !!ensym(nx) / !!ensym(rx)*100)
}
#Test
deflator_fun(df, 'gdp')
nominal_gdp real_gdp deflator_gdp
1 -0.3332074 0.181303480 -183.78433
2 -1.0185754 -0.138891362 733.36121
3 -1.0717912 0.005764186 -18593.97398
4 0.3035286 0.385280401 78.78123
Note: Learn more about quo_name, !!, and ensym which they are tools for programming with dplyr here

How to get summary statistics for multiple variables by multiple groups?

I know that there are many answers provided in this forum on how to get summary statistics (e.g. mean, se, N) for multiple groups using options like aggregate , ddply or data.table. I'm not sure, however, how to apply these functions over multiple columns at once.
More specifically, I would like to know how to extend the following ddply command over multiple columns (dv1, dv2, dv3) without re-typing the code with different variable name each time.
library(reshape2)
library(plyr)
group1 <- c(rep(LETTERS[1:4], c(4,6,6,8)))
group2 <- c(rep(LETTERS[5:8], c(6,4,8,6)))
group3 <- c(rep(LETTERS[9:10], c(12,12)))
my.dat <- data.frame(group1, group2, group3, dv1=rnorm(24),dv2=rnorm(24),dv3=rnorm(24))
my.dat
data1 <- ddply(my.dat, c("group1", "group2","group3"), summarise,
N = length(dv1),
mean = mean(dv1,na.rm=T),
sd = sd(dv1,na.rm=T),
se = sd / sqrt(N)
)
data1
How can I apply this ddply function over multiple columns such that the outcome will be data1, data2, data3... for each outcome variable? I thought this could be the solution:
dfm <- melt(my.dat, id.vars = c("group1", "group2","group3"))
lapply(list(.(group1, variable), .(group2, variable),.(group3, variable)),
ddply, .data = dfm, .fun = summarize,
mean = mean(value),
sd = sd(value),
N=length(value),
se=sd/sqrt(N))
Looks like it's in the right direction but not exactly what I need. This solution provides the statistics by each group separately. What I need an outcome as in data1 (e.g. first aggregated group is people who are at A, E and I; the second is those who are at group B, E and I etc...)
Here's an illustration of reshaping your data first. I've written a custom function to improve readability:
mysummary <- function(x,na.rm=F){
res <- list(mean=mean(x, na.rm=na.rm),
sd=sd(x,na.rm=na.rm),
N=length(x))
res$se <- res$sd/sqrt(res$N)
res
}
library(data.table)
res <- melt(setDT(my.dat),id.vars=c("group1","group2","group3"))[,mysummary(value),
by=.(group1,group2,group3,variable)]
> head(res)
group1 group2 group3 variable mean sd N se
1: A E I dv1 9.75 6.994045 4 3.497023
2: B E I dv1 9.50 7.778175 2 5.500000
3: B F I dv1 16.00 4.082483 4 2.041241
4: C G I dv1 14.50 10.606602 2 7.500000
5: C G J dv1 10.75 10.372239 4 5.186119
6: D G J dv1 13.00 4.242641 2 3.000000
Or without the custom function, thanks to #Jaap
melt(setDT(my.dat),
id=c("group1","group2","group3"))[, .(mean = mean(value),
sd = sd(value),
n = .N,
se = sd(value)/sqrt(.N)),
.(group1, group2, group3, variable)]
If you don't want to melt into long format, you can also do:
library(data.table)
setDT(my.dat)[, as.list(unlist(lapply(.SD, function(x) list(mean = mean(x),
sd = sd(x),
n = .N,
se = sd(x)/sqrt(.N))))),
by = .(group1, group2, group3), .SDcols=c("dv1","dv2","dv3")]
which gives:
group1 group2 group3 dv1.mean dv1.sd dv1.n dv1.se dv2.mean dv2.sd dv2.n dv2.se dv3.mean dv3.sd dv3.n dv3.se
1: A E I 0.09959774 0.4704498 4 0.23522491 0.05020096 0.8098882 4 0.40494412 -0.134137210 0.7832841 4 0.3916420
2: B E I 0.72726477 0.3651544 2 0.25820315 0.73743314 1.4260172 2 1.00834641 -0.120188202 0.5532434 2 0.3912022
3: B F I -0.68661572 0.7212631 4 0.36063157 0.06670216 0.7678781 4 0.38393905 0.096275469 0.8993015 4 0.4496508
4: C G I -0.54577363 0.0798962 2 0.05649515 0.18293371 0.1022325 2 0.07228926 -0.947603264 2.3118016 2 1.6346906
5: C G J 0.17434075 0.8503874 4 0.42519369 -0.11485558 1.4184031 4 0.70920154 -0.005140781 0.6871591 4 0.3435796
6: D G J 0.17943465 0.4943486 2 0.34955725 -0.22223273 0.3679613 2 0.26018796 -0.373289114 1.0737512 2 0.7592568
7: D H J 0.38090937 0.7904832 6 0.32271340 0.02107597 1.0094695 6 0.41211422 0.118277330 0.9024006 6 0.3684035
Here is a solution using dplyr. This gives the result in a "wide" format (i.e. the stats for dv1, dv2, dv3 are on the same line).
se <- function(x) { sd(x)/sqrt(length(x)) }
my.dat %>%
group_by(group1, group2, group3) %>%
summarise_each(funs(mean, sd, length, se), dv1, dv2, dv3) %>%
ungroup
If having the stats for dv1, dv2, and dv3 on separate lines is desired, this can be modified using melt or gather (from tidyr).

dplyr summarize: create variables from named vector

Here's my problem:
I am using a function that returns a named vector. Here's a toy example:
toy_fn <- function(x) {
y <- c(mean(x), sum(x), median(x), sd(x))
names(y) <- c("Right", "Wrong", "Unanswered", "Invalid")
y
}
I am using group_by in dplyr to apply this function for each group (typical split-apply-combine). So, here's my toy data.frame:
set.seed(1234567)
toy_df <- data.frame(id = 1:1000,
group = sample(letters, 1000, replace = TRUE),
value = runif(1000))
And here's the result I am aiming for:
toy_summary <-
toy_df %>%
group_by(group) %>%
summarize(Right = toy_fn(value)["Right"],
Wrong = toy_fn(value)["Wrong"],
Unanswered = toy_fn(value)["Unanswered"],
Invalid = toy_fn(value)["Invalid"])
> toy_summary
Source: local data frame [26 x 5]
group Right Wrong Unanswered Invalid
1 a 0.5038394 20.15358 0.5905526 0.2846468
2 b 0.5048040 15.64892 0.5163702 0.2994544
3 c 0.5029442 21.62660 0.5072733 0.2465612
4 d 0.5124601 14.86134 0.5382463 0.2681955
5 e 0.4649483 17.66804 0.4426197 0.3075080
6 f 0.5622644 12.36982 0.6330269 0.2850609
7 g 0.4675324 14.96104 0.4692404 0.2746589
It works! But it is just not cool to call four times the same function. I would rather like dplyr to get the named vector and create a new variable for each element in the vector. Something like this:
toy_summary <-
toy_df %>%
group_by(group) %>%
summarize(toy_fn(value))
This, unfortunately, does not work because "Error: expecting a single value".
I thought, ok, let's just convert the vector to a data.frame using data.frame(as.list(x)). But this does not work either. I tried many things but I couldn't trick dplyr into think it's actually receiving one single value (observation) for 4 different variables. Is there any way to help dplyr realize that?.
One possible solution is to use dplyr SE capabilities. For example, set you function as follows
dots <- setNames(list( ~ mean(value),
~ sum(value),
~ median(value),
~ sd(value)),
c("Right", "Wrong", "Unanswered", "Invalid"))
Then, you can use summarize_ (with a _) as follows
toy_df %>%
group_by(group) %>%
summarize_(.dots = dots)
# Source: local data table [26 x 5]
#
# group Right Wrong Unanswered Invalid
# 1 o 0.4490776 17.51403 0.4012057 0.2749956
# 2 s 0.5079569 15.23871 0.4663852 0.2555774
# 3 x 0.4620649 14.78608 0.4475117 0.2894502
# 4 a 0.5038394 20.15358 0.5905526 0.2846468
# 5 t 0.5041168 24.19761 0.5330790 0.3171022
# 6 m 0.4806628 21.14917 0.4805273 0.2825026
# 7 c 0.5029442 21.62660 0.5072733 0.2465612
# 8 w 0.4932484 17.75694 0.4891746 0.3309680
# 9 q 0.5350707 22.47297 0.5608505 0.2749941
# 10 g 0.4675324 14.96104 0.4692404 0.2746589
# .. ... ... ... ... ...
Though it looks nice, there is a big catch here. You have to know the column you are going to operate on a priori (value) when setting up the function, so it won't work on some other column name, if you won't set up dots properly.
As a bonus here's a simple solution using data.table using your original function
library(data.table)
setDT(toy_df)[, as.list(toy_fn(value)), by = group]
# group Right Wrong Unanswered Invalid
# 1: o 0.4490776 17.51403 0.4012057 0.2749956
# 2: s 0.5079569 15.23871 0.4663852 0.2555774
# 3: x 0.4620649 14.78608 0.4475117 0.2894502
# 4: a 0.5038394 20.15358 0.5905526 0.2846468
# 5: t 0.5041168 24.19761 0.5330790 0.3171022
# 6: m 0.4806628 21.14917 0.4805273 0.2825026
# 7: c 0.5029442 21.62660 0.5072733 0.2465612
# 8: w 0.4932484 17.75694 0.4891746 0.3309680
# 9: q 0.5350707 22.47297 0.5608505 0.2749941
# 10: g 0.4675324 14.96104 0.4692404 0.2746589
#...
You can also try this with do():
toy_df %>%
group_by(group) %>%
do(res = toy_fn(.$value))
This is not a dplyr solution, but if you like pipes:
library(magrittr)
toy_summary <-
toy_df %>%
split(.$group) %>%
lapply( function(x) toy_fn(x$value) ) %>%
do.call(rbind, .)
# > head(toy_summary)
# Right Wrong Unanswered Invalid
# a 0.5038394 20.15358 0.5905526 0.2846468
# b 0.5048040 15.64892 0.5163702 0.2994544
# c 0.5029442 21.62660 0.5072733 0.2465612
# d 0.5124601 14.86134 0.5382463 0.2681955
# e 0.4649483 17.66804 0.4426197 0.3075080
# f 0.5622644 12.36982 0.6330269 0.2850609
Apparently there's a problem when using median (not sure what's going on there) but apart from that you can normally use an approach like the following with summarise_each to apply multiple functions. Note that you can specify the names of resulting columns by using a named vector as input to funs_():
x <- c(Right = "mean", Wrong = "sd", Unanswered = "sum")
toy_df %>%
group_by(group) %>%
summarise_each(funs_(x), value)
#Source: local data frame [26 x 4]
#
# group Right Wrong Unanswered
#1 a 0.5038394 0.2846468 20.15358
#2 b 0.5048040 0.2994544 15.64892
#3 c 0.5029442 0.2465612 21.62660
#4 d 0.5124601 0.2681955 14.86134
#5 e 0.4649483 0.3075080 17.66804
#6 f 0.5622644 0.2850609 12.36982
#7 g 0.4675324 0.2746589 14.96104
#8 h 0.4921506 0.2879830 21.16248
#9 i 0.5443600 0.2945428 22.31876
#10 j 0.5276048 0.3236814 20.57659
#.. ... ... ... ...
using the sequence of list(as_tibble(as.list(...)) followed by an unnest from tidyr does the trick
toy_summary2 <- toy_df %>% group_by(group) %>%
summarize(Col = list(as_tibble(as.list(toy_fn(value))))) %>% unnest()

Resources