I need to execute this code many times in order to get 45 different matrices at the end: mat[j], j=1:45.
Not sure how to use "for-loop" to achieve that, will be grateful for any tips.
Data files are stored here, year-by-year https://intl-atlas-downloads.s3.amazonaws.com/index.html
library(readstata13)
library(diverse)
library(plyr)
for (j in 1:45) {
dat <- read.dta13(file.choose())
data = aggregate(dat$export_value, by = list(dat$exporter,dat$commoditycode), FUN = sum)
colnames(data) = c("land","product","value")
dt = split(data, f = data$product)
land = as.data.frame(sort(unique(data[, 1])))
nds = seq(1, nrow(land), by = 1)
texmat = cbind(nds, land)
colnames(texmat) = c("num", "land")
for (i in 1:length(unique(data[, 2]))) {
(join(texmat, dt[[i]], by = "land", type = "left")$value)
}
mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
colnames(mt) = unique(data[, 2])
rownames(mt) = sort(unique(data[, 1]))
mt[is.na(mt)] = 0
rcamat=values(mt, category_row = FALSE, norm = "rca",filter = 1, binary = TRUE)
rcamat[is.na(rcamat)] = 0
tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
mat = t(tmat)
}
It looks like you're almost there with the for loop. You just need to add 2 concepts:
1) Creating a list of matrices to read at the start. A construction like:
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
that creates a vector of the files you want to read will allow you to replace file.choose with something like the following (inside the loop):
dat <- read.dta13(paste0('/path/to/directory/with/files/',filenames[i]))
This way you can grab a new file with each loop iteration.
2) Storing the output matrices at the end of the loop. You can do this either by putting them all in a list, or by using assign to create a collection of objects. I prefer the list approach:
#before the for loop initialize a NULL list:
mats <- NULL
#at the end of the loop, (after mat = t(tmat) but before the close bracket) add this line to add it to the list
mats[[i]] <- mat
This will create a list mats with mats[[1]] holding the first matrix, mats[[2]] holding the second, and so on.
You could alternatively create a bunch of objects like so:
#at the end of the for loop add
assign(paste0('mat_',i),mat)
Which will create mat_1, mat_2, and so on as separate objects. A full implementation would look something like this:
library(readstata13)
library(diverse)
library(plyr)
setwd('/path/to/files/')
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
#you'll have to prune this to the files you actually want, as this list is more than 45
finished_matrices <- NULL
for (j in 1:45) {
dat <- read.dta13(filenames[i]) #pickup
data = aggregate(dat$export_value, by = list(dat$exporter,dat$commoditycode), FUN = sum)
colnames(data) = c("land","product","value")
dt = split(data, f = data$product)
land = as.data.frame(sort(unique(data[, 1])))
nds = seq(1, nrow(land), by = 1)
texmat = cbind(nds, land)
colnames(texmat) = c("num", "land")
for (i in 1:length(unique(data[, 2]))) {
(join(texmat, dt[[i]], by = "land", type = "left")$value)
}
mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
colnames(mt) = unique(data[, 2])
rownames(mt) = sort(unique(data[, 1]))
mt[is.na(mt)] = 0
rcamat=values(mt, category_row = FALSE, norm = "rca",filter = 1, binary = TRUE)
rcamat[is.na(rcamat)] = 0
tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
mat = t(tmat)
finished_matrices[[i]] <- mat
}
Related
My task is to recreate the result of this for loop by using the map() function. Unfortunately, I can't get my head around this.
play_roulette <- function(bet, number) {
draw <- sample(0:36, 1)
tibble(
winning_number = draw,
your_number = number,
your_bet = bet,
your_return = if (number == draw) {
bet * 36
} else {
0
}
)
}
desired_length <- 10
list_w_for <- vector(mode = "list", length = desired_length)
for(i in seq_along(list_w_for)){
list_w_for[[i]] <- play_roulette(bet = 1, number = 5)
}
for_tibble <- bind_rows(list_w_for)
for_tibble
My current map code:
num_vec <- 1:10
bet_vec <- 1
tibble_2c <- tibble(x= bet_vec, y= num_vec)
map_dfc( tibble_2c,
play_roulette(bet = x, number = y))
You just have to call the function 10 times, since the iterator i is not used inside of the function.
# use *_dfr to row_bind the result
map_dfr(
# call the function ten times
1:10,
# note that .x, the default iterator in map, is not needed inside of the function
~play_roulette(bet = 1, number = 5))
Basically I'm trying to automate a scoring modeling workflow, and encountered a problem with inputting the results from smbinning() that are generated by a loop and hence recorded in a list. The result itself is a list, so I have a bunch of lists in a list. Problems arise when I'm trying to add the results (buckets for continuous variables) into a data frame. I just find it impossible to feed the syntax required to dive into the levels of the list. I tried work my way around this by referencing the column numbers and just trying to pass on the respective list names from the loop. Error I'm getting is:
Error in [.data.frame (df, , col_id) : undefined columns selected.
My code is as follows:
colcnt <- ncol(e_mod)
bucket_resultlist <- list()
for (i in 2:colcnt) {
#curvar = paste0('z', i)
curresult = smbinning(df = e_mod, y = "Bankrupt", x = colnames(e_mod)[i], p = 0.05)
bucket_resultlist[[paste0('Bin_Result_', colnames(e_mod)[i])]] = curresult #paste0('binresult', colnames(e)[i]) = curresult
}
e_mod2 = e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod, bucket_resultlist_trunc[[i]] , chrname = i)
}
I've even tried to define a customer version of the smbinning.gen() function to allow for this, as in the standard form it just tries to concatenate $ivtable to the list reference, but I need to be able to skip one level from this generated list and then run the smbinning.gen() for each respective list in that list. Here's the custom code and the original definitions commented out:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
df[, ncol][is.na(df[, col_id])] = 0 # Missing
df[, ncol][df[, col_id] <= b[2]] = 1 # First valid
# Loop goes from 2 to length(b)-2 if more than 1 cutpoint
if (length(b) > 3) {
for (i in 2:(length(b) - 2)) {
df[, ncol][df[, col_id] > b[i] & df[, col_id] <= b[i + 1]] = i
}
}
df[, ncol][df[, col_id] > b[length(b) - 1]] = length(b) - 1 # Last
df[, ncol] = as.factor(df[, ncol]) # Convert to factor for modeling
blab = c(paste("01 <=", b[2]))
if (length(b) > 3) {
for (i in 3:(length(b) - 1)) {
blab = c(blab, paste(sprintf("%02d", i - 1), "<=", b[i]))
}
} else { i = 2 }
blab = c(blab, paste(sprintf("%02d", i), ">", b[length(b) - 1]))
# Are there ANY missing values
# any(is.na(df[,col_id]))
if (any(is.na(df[, col_id]))) {
blab = c("00 Miss", blab)
}
df[, ncol] = factor(df[, ncol], labels = blab)
names(df)[names(df) == "tmpname"] = chrname
return(df)
}
All help is much appreciated!
Here's the list structure
http://i.stack.imgur.com/iYau2.png
This is also posted in the Data Science section, but this only had 5 views during the whole of today
Thanks Stackoverflow for being my yellow rubber duck. The fix was to change the way of passing in the arguments:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = ivout[[6]] # paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = ivout[[4]] # paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
And to refer the new df e_mod2 instead of e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod2, bucket_resultlist_trunc[[i]] , chrname = i)
}
I am trying to vectorize my nested for loop code using apply/mapply/lapply/sapply or any other way to reduce the running time. My code is as follows:
for (i in 1:dim){
for (j in i:dim){
if(mydist.fake[i,j] != d.hat.fake[i,j]){
if((mydist.fake[i,j]/d.hat.fake[i,j] > 1.5)|(d.hat.fake[i,j]/mydist.fake[i,j]>1.5)){
data1 = cbind(rowNames[i],rowNames[j], mydist.fake[i,j], d.hat.fake[i,j], 1)
colnames(data1) = NULL
row.names(data1) = NULL
data = rbind(data, data1)
}else{
data1 = cbind(rowNames[i],rowNames[j], mydist.fake[i,j], d.hat.fake[i,j], 0)
colnames(data1) = NULL
row.names(data1) = NULL
data = rbind(data, data1)
}
}
}
}
write.table(data, file = "fakeTest.txt", sep ="\t", col.names = FALSE, row.names = FALSE)
rowNames is the vector of rownames of all data points
data is a dataframe
mydist.fake and d.hat.fake are distance matrices (where the diagonal is zero and values of upper and lower triangle is same) and therefore, interested in the transversal of lower triangle (leaving values of diagonals too).
The dimensions of the both the matrices are the same.
The major problem I am facing is the vectorization of the j loop where j is initialized as i.
A vectorized version of your code is:
dist1 <- mydist.fake
dist2 <- d.hat.fake
data <- data.frame(i = rowNames[row(dist1)[lower.tri(dist1)]],
j = rowNames[col(dist1)[lower.tri(dist1)]],
d1 = dist1[lower.tri(dist1)],
d2 = dist2[lower.tri(dist2)])
data <- transform(data, outcome = d1/d2 > 1.5 | d2/d1 > 1.5)
I tested it successfully using the following sample data:
X <- matrix(runif(200), 20, 10)
Y <- matrix(runif(200), 20, 10)
rowNames <- paste0("var", seq_len(nrow(X)))
mydist.fake <- as.matrix(dist(X))
d.hat.fake <- as.matrix(dist(Y))
I just discovered the power of plyr frequency table with several variables in R
and I am still struggling to understand how it works and I hope some here can help me.
I would like to create a table (data frame) in which I can combine frequencies and summary stats but without hard-coding the values.
Here an example dataset
require(datasets)
d1 <- sleep
# I classify the variable extra to calculate the frequencies
extraClassified <- cut(d1$extra, breaks = 3, labels = c('low', 'medium', 'high') )
d1 <- data.frame(d1, extraClassified)
The results I am looking for should look like that :
require(plyr)
ddply(d1, "group", summarise,
All = length(ID),
nLow = sum(extraClassified == "low"),
nMedium = sum(extraClassified == "medium"),
nHigh = sum(extraClassified == "high"),
PctLow = round(sum(extraClassified == "low")/ length(ID), digits = 1),
PctMedium = round(sum(extraClassified == "medium")/ length(ID), digits = 1),
PctHigh = round(sum(extraClassified == "high")/ length(ID), digits = 1),
xmean = round(mean(extra), digits = 1),
xsd = round(sd(extra), digits = 1))
My question: how can I do this without hard-coding the values?
For the records:
I tried this code, but it does not work
ddply (d1, "group",
function(i) c(table(i$extraClassified),
prop.table(as.character(i$extraClassified))),
)
Thanks in advance
Here's an example to get you started:
foo <- function(x,colfac,colval){
tbl <- table(x[,colfac])
res <- cbind(n = nrow(x),t(tbl),t(prop.table(tbl)))
colnames(res)[5:7] <- paste(colnames(res)[5:7],"Pct",sep = "")
res <- as.data.frame(res)
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
Don't take anything in that function foo as gospel. I just wrote that off the top of my head. Surely improvements/modifications are possible, but at least it's something to start with.
Thanks to Joran.
I slighlty modified your function to make it more generic (without reference to the position of the variables) .
require(plyr)
foo <- function(x,colfac,colval)
{
# table with frequencies
tbl <- table(x[,colfac])
# table with percentages
tblpct <- t(prop.table(tbl))
colnames( tblpct) <- paste(colnames(t(tbl)), 'Pct', sep = '')
# put the first part together
res <- cbind(n = nrow(x), t(tbl), tblpct)
res <- as.data.frame(res)
# add summary statistics
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
and it works !!!
P.S : I still do not understand what (group) stands for but
I am trying to use for to create multiple objects from for, just example (not exact):
l_gr <- list (1:10, 11:20, 21:30)
for (i in 1:length(l_gr)){
grp <- NULL
grp[[i]] <- mean(l_gr[[i]])
}
This is not what I am expecting, rather I need to output multiple objects (of different class) however the name is different with i level for example: here grp1, grp2, grp3.
Each of these object has output of the function for particular i list. Sorry for simple question.
Edits: response to provide specific example:
install.packages("onemap")
require(onemap)
data(example.out)
twopts <- rf.2pts(example.out)
all.data <- make.seq(twopts,"all")
link_gr <- group(all.data)
link_gr$n.groups
starts the loop
# without loop:
# for 1
grp1 <- make.seq(link_gr, 1)
grp1.od <- order.seq(input.seq=grp1, n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
# for 2
grp2 <- make.seq(link_gr, 2)
grp2.od <- order.seq(input.seq=grp2, n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
same process report for 1:1:link_gr$n.groups
So I want create a for loop and output objects:
for (i in 1:link_gr$n.groups){
grp <- NULL
grp[i] <- make.seq(link_gr, i)
grp[i].od <- order.seq(input.seq=grp[i], n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
}
Note that your for loops are wrong. If you set grp <- NULL within the loop, you'll just wipe your results variable with each iteration - probably not what you want. You need to put the variable initialisation outside the loop.
Note, too, that I'd suggest that you are still better off using a single variable instead of multiple ones. list objects are very flexible in R and can accomodate objects of different classes. You can do
require(onemap)
data(example.out)
twopts <- rf.2pts(example.out)
all.data <- make.seq(twopts,"all")
link_gr <- group(all.data)
link_gr$n.groups
# initialise list outputs
grp = list()
grp.od = list()
for (i in 1:2){
grp[[i]] <- make.seq(link_gr, i)
grp.od[[i]] <- order.seq(input.seq=grp[[i]], n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
}
#check out output
str(grp)
str(grp.od)
grp[[1]]
grp[[2]
If you must insist on using different variables, consider ?assign and ?get. Something like this will work:
i = 1
assign(paste("grp", i, sep = ""), grp[[1]])
exists("grp1")
str(get(paste("grp", i, sep = "")))