R: implement combn function with several m and define output variables - r
I am using the following code to obtain the mean of all possible combination (m=2) of the variables whose name starts with "form".
k=which(grepl("^form",colnames(data)))
combined <- combn(data[,k], 2, FUN = rowMeans)
colnames(combined) <- combn(names(data[,k]), 2, paste0, collapse="")
data <- cbind(data, combined)
The dataset "data" is the following:
structure(list(id = c(5309039, 5284969, 5300279, 5270289, 5259957,
5267086, 5173196, 5057536, 5246135, 5255558, 5241070, 5280194,
5112387, 444459, 5054590, 5048412, 5296390, 5093742, 5293520),
form13 = c(1300.81321145176, 1130.23869905075, 1292.03253463863,
1358.23586808642, 1250.66417156907, 1388.37813595599, 1277.89625553694,
1242.17552321015, 1275.95068420011, 1449.97932094858, 1494.93158409261,
1183.72005024492, 1319.72081010904, 1153.43556746197, 1451.47500658524,
1502.05308533551, 1641.66472289938, 1407.07852441646, 1444.3815517771
), form12 = c(1329.6, 1104.4, 1272, 1322.8, 1195.5, 1487.4,
1195.6, 1258, 1256.4, 1455, 1524, 1170, 1291.4, 1224.6, 1414,
1606, 1765.2, 1441, 1406.8), form11 = c(1325.578, 1201.752,
1346.42, 1424.884, 1328.03, 1367.262, 1294.928, 1278.99,
1330.482, 1493.54, 1524.19, 1242.21, 1379.522, 1178.458,
1438.37, 1475.15, 1611.236, 1426.11, 1431.014), form10 = c(1056.7264,
940.4956, 1076.29, 1149.9412, 1059.028, 1095.8536, 1027.9564,
1012.996, 1061.3296, 1214.386, 1243.156, 978.472, 1107.3616,
918.6304, 1162.6, 1197.124, 1324.8628, 1151.092, 1155.6952
), form9 = c(1265.95883621535, 1104.13796282321, 1292.61038190038,
1391.60226122629, 1269.10247448997, 1319.10781736395, 1226.47462059388,
1205.80097696249, 1272.24391797013, 1476.61400008329, 1514.11964245256,
1157.70450530205, 1334.62450699242, 1072.96302932, 1408.41424685422,
1453.98138963552, 1619.24856353662, 1393.1329826012, 1399.25113387699
), form8 = c(1482.14960970768, 1302.96011430734, 1455.11530997823,
1507.60187999797, 1403.62372119021, 1590.3115445541, 1392.70107590683,
1422.72772811208, 1440.68241714823, 1606.14610155669, 1656.53381495283,
1357.47229571355, 1476.63693689195, 1356.28387443873, 1567.80354390345,
1697.01564123702, 1829.93948069795, 1581.30521692185, 1561.45650301116
), form7 = c(1444.56088362196, 1256.09569669502, 1416.12716131828,
1471.33068319787, 1361.97012558123, 1558.32178921338, 1350.4820727773,
1382.06304580259, 1400.94715403591, 1574.97601740197, 1627.97203596215,
1313.42968513872, 1438.7628489193, 1312.17974558614, 1534.64866852904,
1670.54939207752, 1810.35399499291, 1548.84925168016, 1527.97307493173
), form6 = c(1199.39256844313, 1030.51525282711, 1173.91406615889,
1223.38008553142, 1125.38576782367, 1301.32988998026, 1115.09171006788,
1143.39035787661, 1160.31177216137, 1316.25318375141, 1363.74113364133,
1081.8903116367, 1194.19714454337, 1080.77028284113, 1280.11720270038,
1401.89327051093, 1527.16747332837, 1292.84186767351, 1274.13542778885
), form5 = c(1297.78687926793, 1159.12885718351, 1290.6491699916,
1344.46508388198, 1257.02131246849, 1368.96738018114, 1239.89545043121,
1250.12098970015, 1277.57642224122, 1419.04226152712, 1455.58342941928,
1202.60322079507, 1313.15664462902, 1177.98531965952, 1380.99558290387,
1461.37241431927, 1574.8610783177, 1384.16870680163, 1375.22939662201
), form4 = c(1335.97776730397, 1108.36308048125, 1324.2608292059,
1412.60257966574, 1269.05887158687, 1452.82443206729, 1240.94583733479,
1257.73161635649, 1302.80120256198, 1535.02507407783, 1595.00938916382,
1179.7286135352, 1361.20807332313, 1139.31698950533, 1472.56938122075,
1604.51232282192, 1790.81013902909, 1477.77823673001, 1463.10387273464
), form3 = c(1354.228, 1167.277, 1385.695, 1504.159, 1357.93,
1417.162, 1307.953, 1283.89, 1361.632, 1607.815, 1654.09,
1228.36, 1435.672, 1132.108, 1524.52, 1580.05, 1785.511,
1506.01, 1513.414), form2 = c(2275.7324829005, 1960.23260237236,
2259.163108513, 2384.94888103794, 2181.57337654262, 2442.86896126772,
2142.36120747078, 2165.7494001933, 2228.9072421228, 2562.48497832825,
2650.8148703194, 2057.68931533889, 2311.5302827576, 2002.33637794664,
2471.44922673607, 2664.88828208925, 2945.12448823488, 2479.00498842122,
2457.73611045874), form1 = c(1180.88828860349, 1056.82591443514,
1162.17101167316, 1198.5102427986, 1126.52065872992, 1255.77452231775,
1118.95833314255, 1139.74737411054, 1152.17835587263, 1266.73762443072,
1301.62370599969, 1094.56758356167, 1177.07157336578, 1093.7447765967,
1240.19104186727, 1329.65141749175, 1421.68162869499, 1249.53896489237,
1235.79664943772)), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"))
>
The code works well and I am trying to implement it in order to take all possible combination with m from 2 to 8. I've tried the following code, but it doesn't work.
x<-2:8
k=which(grepl("^form",colnames(data)))
combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
data <- cbind(data, combined)
as I get the following error:
> x<-2:8
> k=which(grepl("^form",colnames(data)))
> combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
**Error in combn(data[, k], seq_along(x), FUN = rowMeans) :
length(m) == 1L is not TRUE**
> colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
**Error in combn(names(data[, k]), seq_along(x), paste0, collapse = "") :
length(m) == 1L is not TRUE**
> data <- cbind(data, combined)
Where am I wrong?
Also, I would like to add the following prephix "comb_" to the name of all generated variables. How should I modify the above code?
Thank you!
The reason is simply that combn only takes one m at a time. Just use sapply to iterate over the ms. In order to get the column names in one step we can use 'colnames<-()'. 'colnames<-'(x, names) is actually the same as colnames(x) <- names but with the advantage that everything is on the RHS. "form" suffixes can be deleted with gsub.
k <- 2:14
combined.2.lst <- sapply(2:8, function(m)
`colnames<-`(combn(data[,k], m, rowMeans),
combn(names(data[,k]), m, function(x)
paste0("comb.", paste0(gsub("form", "", x), collapse=".")))))
This gives you a list which then can be cbinded.
combined.2 <- do.call(cbind, combined.2.lst)
dim(combined.2)
# [1] 19 7085
Result
combined.2[1:5, c(1, 50, 100, 500, 1000, 5000)] # example columns
# comb.13.12 comb.9.1 comb.13.10.9 comb.13.10.2.1 comb.9.5.4.3 comb.13.7.6.5.4.3.2
# [1,] 1315.207 1223.424 1207.833 1453.540 1313.488 1458.356
# [2,] 1117.319 1080.482 1058.291 1271.948 1134.727 1258.836
# [3,] 1282.016 1227.391 1220.311 1447.414 1323.304 1448.835
# [4,] 1340.518 1295.056 1299.926 1522.909 1413.207 1528.446
# [5,] 1223.082 1197.812 1192.932 1404.447 1288.278 1400.515
Finally just use cbind(data, combined.2).
The function combn, can only take 1 element for the number of combinations, so you, need to use lapply and finally combine them with do.call(cbind..):
First we define the function for combination x:
func = function(x,DATA){
mat = combn(DATA,x,FUN=rowMeans)
colnames(mat) = combn(names(DATA),x, paste0, collapse="")
mat
}
Then we iterate:
k=which(grepl("^form",colnames(data)))
combined = lapply(2:8,func,DATA=data[,k])
combined <- do.call(cbind, combined)
If you are familiar with purrr, you can also do:
library(purrr)
library(dplyr)
combined = 2:8 %>% map(~as.tibble(func(.x,DATA=data[,k]))) %>% bind_cols()
You need do iteration over m<-2:8, using lapply() or sapply(). I tried to keep your main structure of your code and make minimal changes to let it work:
m <- 2:8
k=which(grepl("^form",colnames(data)))
combined <- Reduce(cbind,lapply(m, function(m) combn(data[,k], m, FUN = rowMeans)))
colnames(combined) <-unlist(sapply(m, function(m) combn(names(data[,k]), m, paste0, collapse="")))
data <- cbind(data, combined)
Related
Condition to check values ​between lists and add to new list in R
If the last value of each sublist in the list ListResiduals (e.g: OptionAOptionD) is > than the value with the corresponding name in the ListSigma (e.g: OptionAOptionD), it adds the name (e.g: OptionAOptionD) to the Watchlist list. In the last line of the code I put "> 5" just for the example work, it's the "> 5" that I want to replace in the condition that I mentioned in the previous paragraph. DF <- data.frame("OptionA" = sample(1:100, 50), "OptionB" = sample(1:100, 50), "OptionC" = sample(1:100, 50), "OptionD" = sample(1:100, 50)) #Unfolding options and creating DF UnFolding <- data.frame( First = as.vector(sapply(names(DF[]), function(x) sapply(names(DF[]), function(y) paste0(x)))), Second = as.vector(sapply(names(DF[]), function(x) sapply(names(DF[]), function(y) paste0(y))))) #Deleting lines with the same names UnFolding <- UnFolding[UnFolding$First != UnFolding$Second, ] #Creating list with dependent and independent variables LMList <- apply(UnFolding, 1, function(x) as.formula(paste(x[1], "~", x[2]))) #Change list data to variable names names(LMList) <- substring(lapply(LMList, paste, collapse = ""), 2) #Linear regression - lm() LMListRegression <- lapply(LMList, function(x) { eval(call("lm", formula = x, data = DF)) }) #Residuals ListResiduals <- lapply(LMListRegression, residuals) #Sigma ListSigma <- lapply(LMListRegression, function(x) { sigma(x)*2 }) #Watchlist Watchlist <- as.list(unlist(lapply(ListResiduals, function(x) names(x)[1][tail(x, 1) > 5])))
I would gravitate towards converting your Simga and Residual values to a vector and compare the vectors. You could also use a data.frame approach to be sure the order of your lists/vectors doesn't change. # create a vector with the last value from the Residuals list. last_residual <- sapply(ListResiduals, `[`, 50) names(last_residual) <- substr(names(last_residual), 1, stop = -4) # Using sapply() rather than lapply, will return a named vector sigma_vector <- sapply(LMListRegression, function(x) { sigma(x)*2 }) Watchlist <- sigma_vector[last_residual > sigma_vector] Watchlist # named numeric(0) In your example, it returns an empty named vector because no values meet your condition max(last_residual) # [1] 31.70949 min(sigma_vector) # [1] 52.93234 # To demonstrate that it works, let's devide sigma by 2 so that at least some values will pass half_sigma <- sigma_vector/2 Watchlist2 <- sigma_vector[last_residual > half_sigma] Watchlist2 # OptionDOptionA OptionDOptionB OptionDOptionC # 54.52411 57.09503 56.79341
pairwise substring a data frame and save them in a list
# Example data dat <- matrix(runif(2*300), ncol = 2, nrow = 20) group <- rep_len(LETTERS[1:3], 20) df <- cbind.data.frame(dat, Group = group) # Greate subset groups n <- levels(as.factor(group)) mylist <- combn(n, 2, simplify = FALSE) I would like to subset my data according to pairwise combinations of the group attribute, and then save the result in mylist. How can I do it? Thanks a lot.
We can use subset with %in% after looping over the mylist mylist2 <- lapply(mylist, function(x) subset(df, Group %in% x)) It can be also be done within combn by making use of FUN argument combn(n, 2, FUN = function(x) subset(df, Group %in% x), simplify = FALSE)
apply a function to some elements of a nested lists in R
I wish to apply a function only to some elements of a nested list l <- list() l$a$forecast <- rnorm(3) l$a$model <- "arima" l$b$forecast <- rnorm(3) l$b$model <- "prophet" The desired output would be like this: applying the sum function to the $forecast element of the list fcst <- c(sum(l$a$forecast),sum(l$b$forecast)) mdl <- c(l$a$model,l$b$model) df <- data.frame(fcst,mdl) I tried something like this: df <- lapply(l$forecast, function(x) sum(x)) df <- do.call(rbind, Map(cbind, sku = names(df)))
Another approach using rrapply() in the rrapply-package combined with dplyr's bind_rows(). This also extends to lists containing deeper nested levels. rrapply::rrapply(l, condition = function(x, .xname) .xname == "forecast", f = sum) %>% dplyr::bind_rows() #> # A tibble: 2 x 2 #> forecast model #> <dbl> <chr> #> 1 -1.28 arima #> 2 1.10 prophet Data set.seed(1) l <- list( a = list(forecast = rnorm(3), model = "arima"), b = list(forecast = rnorm(3), model = "prophet") )
do.call( rbind, lapply( l, function(x) list(fcst = sum(x$forecast), model = x$model) ) ) Since you know the exact dimensions of your returned object you can use vapply in cases like this for a minor performance improvement: vapply( l, FUN = function(x) list(fcst = sum(x$forecast), model = x$model), FUN.VALUE = list(fcst = numeric(1), model = character(1)) ) However, the resulting object can be hard to work with.
You can get the letters with the object letters, then using its output in a loop: n = 2 #number of lists you have sumfore = model = vector() for(i in letters[seq(1,n,1)]){ sumfore[i] = sum(l[[i]]$forecast) model[i] =l[[i]]$model} newdf = data.frame(sumfore, model)
Using an apply function to subset, match, and correlate multiple sets of variables in R
I'd like to use one of the apply functions to compactly subset, match, and correlate only the set of variables with the following strings: "hpi", "cpi", "eh". Specifically, I'd like to apply all of the lines below the third comment (which only applies to "hip) below to each of the other strings. Can you please advise? MWE: #Dataset 1 alias <- paste("v", seq( from = 1, to = 25 ), sep="" ) df1 = data.frame(replicate(25,sample(0:100,1000,rep=TRUE))) colnames(df1) = alias #Dataset 2 df2 = data.frame(replicate(3,sample(0:1,25,rep=TRUE))) colnames(df2) = c("hpi","cpi","eh") df2$alias = alias df2$name = rep ( c("hpi housig", "cpi inflation", "eh econhealth", "unem unemployment", "inc personal income"), 5) #I would like to use an apply function to do this to each of "hpi", "cpi", "eh" df2$hpi = grepl("hpi", df2$name) hpisub = df2[df2$hpi == 1, ] hpisubvar = hpisub$alias hpidf = df1[, hpisubvar] corrhpi = cor(hpidf)
The *apply functions apply function FUN to their first argument, so you need to write a function to be applied. fun <- function(x){ dfsub = df2[grepl(x, df2$name), ] cor(df1[, dfsub$alias]) } Now we test it against your result, corrhpi. identical(corrhpi, fun("hpi")) [1] TRUE And, finally, apply it to the vector you need. vec <- c("hpi", "cpi", "eh") result <- lapply(vec, fun) names(result) <- vec
Apply function or Loop in R: Not numerical, returning NA
I am working with a resampling procedure in R (just like a bootstrap). I have a matrix of response/explanatory variables and would like to make 999 samples of this matrix to calculate for each statistic I am working their mean, sd and confidence interval. So, I wrote a function to calculate and to return a list: mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1)) myfun <- function(data, n){ sample <- data[sample(n, replace = T),] model1 <- lm(sample[,1]~sample[,2]) return(list(model1[[1]][[1]], model1[[1]][[2]])) } result <- as.numeric() result <- replicate(99, myfun(mydata, 10)) Then, I have a matrix as my output in which the rows are the statistics and the columns are the samplings (nrow = 2 and ncol = 99). I need the mean and sd for each row, but when I try to use the apply function or even a loop the following message shows up: In mean.default(newX[, i], ...) : argument is not numeric or logical: returning NA Moreover: is.numeric(result) [1] FALSE I found it strange, because I never had such problem with similar procedures. Any thoughts?
Use the following: myfun <- function(dat, n){ dat1 <- dat[sample(n, replace = T),] model1 <- lm(dat1[,1] ~ dat1[,2]) return(coef(model1)) } replicate(99, myfun(mydata, 10))
The reason is the 'result' is a list of 198 elements with dimension attributes. We need to unlist the 'result' and provide the dimension attributes result1 <- `dim<-`(unlist(result), dim(result)) and then use the apply
Just replace list() by c() in your myfun() function mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1)) myfun <- function(data, n){ sample <- data[sample(n, replace = T),] model1 <- lm(sample[,1]~sample[,2]) return(c(model1[[1]][[1]], model1[[1]][[2]])) } result <- as.numeric() result <- replicate(99, myfun(mydata, 10)) apply(result, FUN=mean, 1) apply(result, FUN=sd, 1)
This worked for me: mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1)) myfun <- function(data, n){ sample <- data[sample(n, replace = T),] model1 <- lm(sample[,1]~sample[,2]) return(data.frame(v1 = model1[[1]][[1]], v2 = model1[[1]][[2]])) } result <- do.call("rbind",(replicate(99, myfun(mydata, 10), simplify = FALSE)))