I'm attempting to use formula to generate a model.matrix object to be used in a custom optimizer function.
It works great for the most part, but when it comes to factor-factor interactions, I'd like to specify the interaction as dummy coded rather than effects coded.
Take for example the following data set:
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
head(myDF)
Both the : and / operators create an effects coded model matrix (the latter being an additive effects structure, I think).
head(model.matrix(formula(Y ~ X1 : X2), data = myDF))
head(model.matrix(formula(Y ~ X1 / X2), data = myDF))
But I am looking to generate a dummy coded model matrix, which would have the first level of X1 omitted for each level of X2. Resulting in these terms (columns):
X1B:X2A
X1C:X2A
X1B:X2B
X1C:X2B
X1B:X2C
X1C:X2C
Is there a way to achieve this?
Is ~X1:X2-1 what you're looking for?
Make test data (as above):
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
Generate model matrix:
mm1 <- model.matrix(formula(Y ~ X1 : X2 - 1), data = myDF)
head(mm1)
## X1A:X2A X1B:X2A X1C:X2A X1A:X2B X1B:X2B X1C:X2B X1A:X2C X1B:X2C X1C:X2C
## 1 0 0 0 0 1 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 1 0 0 0
## 5 0 0 0 1 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0
Or perhaps you really do just want some columns excluded:
mm0 <- model.matrix(formula(Y ~ X1 : X2), data = myDF)
mm0B <- mm0[,!grepl("(Intercept|^X1A:)",colnames(mm0))]
## X1B:X2A X1C:X2A X1B:X2B X1C:X2B X1B:X2C X1C:X2C
## 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 1 0
## 4 0 0 0 1 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
I thought you also might be interested in sum-to-zero contrasts:
mm2 <- model.matrix(formula(Y ~ X1 : X2 - 1), data = myDF,
contrasts.arg=list(X1=contr.sum,X2=contr.sum))
Below is another trial.
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
# row subsetting to exclude A
modelMat <- model.matrix(formula(Y ~ X1 : X2), data = myDF[myDF$X1 != 'A',])
# column subsetting to eliminate all columns including X1A
modelMat <- modelMat[,substring(colnames(modelMat), 1, 3) != "X1A"]
head(modelMat)
(Intercept) X1B:X2A X1C:X2A X1B:X2B X1C:X2B X1B:X2C X1C:X2C
1 1 0 0 1 0 0 0
3 1 0 0 0 0 1 0
4 1 0 0 0 1 0 0
8 1 0 0 0 0 1 0
10 1 0 0 0 0 0 1
11 1 0 0 0 0 0 1
Related
I'm trying to run this script in R2jags following the instructions provided in "Lahoz-Monfort JJ, Guillera-Arroita G, Tingley R (2015) Statistical approaches to account for false positive errors in environmental DNA
samples. Molecular Ecology Resources, 16, 673–685." and it seems that it worked ok, but I can't figure out the command to see the results... could anyone please help?
cat("model {
# Priors
psi ~ dunif(0,1)
p11 ~ dunif(0,1)
p10 ~ dunif(0,p10_max)
# Likelihood
for (i in 1:S){
z[i] ~ dbern(psi)
p[i] <- z[i]*p11 + (1-z[i])*p10
for (j in 1:K){
Y[i,j] ~ dbern(p[i])
}
}
} ",fill=TRUE)
sink()
Bayesian <- function(psi,p11,p10,S,K,nsims=100,doprint=TRUE,p10_max=0.05,
ni=100000,nt=2,nc=1,nb=50000,myparallel=TRUE) {
psihat<-p11hat<-p10hat<-rep(nsims)
modelSummaries<-list()
for(ii in 1:nsims){
if (doprint) cat("\r", ii, "of", nsims," ")
hh<-genSimData(psi,r11=0,p11,p10,S,K1=0,K2=K)
# fit the model
jags.inits <-function()(list(psi=runif(1,0.05,0.95),p11=runif(1,p10_max,1),p10=runif(1,0,p10_max)))
jags.data <-list(Y=hh,S=S,K=K,p10_max=p10_max)
jags.params<-c("psi","p11","p10")
Thoropa_model<-jags(data = jags.data, inits = jags.inits, parameters.to.save= jags.params,
model.file= "Thoropa.txt", n.thin= nt, n.chains= nc,
n.iter= ni, n.burnin = nb, parallel=myparallel) #, working.directory= getwd()
# extract results (medians of the marginal posteriors)
psihat[ii] <- model$summary["psi","50%"]
p11hat[ii] <- model$summary["p11","50%"]
p10hat[ii] <- model$summary["p10","50%"]
modelSummaries[[ii]]<-model$summary
}
if (doprint){
printsummres(psihat,thename="estimated psi")
printsummres(p11hat,thename="estimated p11")
printsummres(p10hat,thename="estimated p10")
}
return(list(psihat=psihat,p11hat=p11hat,p10hat=p10hat,modelSummaries=modelSummaries))
}
The file "Thoropa.txt" is a presence/absence matrix as follows:
PCR1 PCR2 PCR3 PCR4 PCR5 PCR6 PCR7 PCR8 PCR9 PCR10 PCR11 PCR12
1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 1 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0
1 0 1 0 1 1 1 1 1 1 1 1
0 0 1 0 0 0 1 0 0 0 0 0
1 0 1 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 1 0 0 0 1 0 0
1 1 0 1 0 1 0 1 0 0 1 0
1 1 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 0 1 1 1 1 0 1 1
1 1 1 1 1 1 1 1 1 1 1 1
0 0 1 0 1 0 0 0 0 0 1 1
1 1 1 1 1 1 1 1 1 1 1 1
Following the comment from Limey (thank you!) I changed the script to:
sink("Thoropa2.txt")
cat("model {
# Priors
psi ~ dunif(0,1)
p11 ~ dunif(0,1)
p10 ~ dunif(0,p10_max)
# Likelihood
for (i in 1:S){
z[i] ~ dbern(psi)
p[i] <- z[i]*p11 + (1-z[i])*p10
for (j in 1:K){
Y[i,j] ~ dbern(p[i])
}
}
} ",fill=TRUE)
sink()
y=Thoropa# the detection/non detection table
S=nrow(y)
K=ncol(y)
psi ~ dunif(0, 1)
p11 ~ dunif(0, 1)
p10 ~ dunif(0, p10_max)
p10_max=0.05
jags.data<-list(y=y, S=S, K=K, p10_max=p10_max)
jags.inits <-function()(list(psi=runif(0,1),p11=runif(0,1),p10=runif(0,p10_max)))
jags.params<-c("psi","p11","p10")
Thoropa_model<-jags.parallel(data = jags.data, inits = jags.inits, parameters.to.save= jags.params, model.file= "Thoropa2.txt", n.chains= 4, n.thin= 10, n.iter = 100000, n.burnin=50000, jags.seed = 333)
and the data file is as before.
Now I am getting the error message:
"Error in checkForRemoteErrors(val) :
4 nodes produced errors; first error: Indexing outside the bounds"
Could anyone help identify the error in my script? I'm no expert and I'm learning by myself, so sorry if it is a stupid question... (maybe there is something wrong with the format of the data...?)
Thank you all!
Your model is not working because some syntax errors in your R script. Note that the R syntax is different of the jags syntax, even if you are running the jags inside de R.
These are the errors:
The symbol "~" is not used for sampling in R. Delete the lines:
psi ~ dunif(0, 1)
p11 ~ dunif(0, 1)
p10 ~ dunif(0, p10_max)
The Y variable in the jags model is capitalized, so you must correct the syntax in jags.data.
jags.data<-list(Y=y, S=S, K=K, p10_max=p10_max)
In jags.inits, a) the body of the function must be inside of curly braces, and not parentheses, and b) the function runif takes 3 arguments: n (number of values you want to sample), min and max. The correct syntax is the following:
jags.inits <-function(){list(psi=runif(1,0,1),p11=runif(1,0,1),p10=runif(1,0,p10_max))}
Fixing those errors, your model should run without errors. After run the model, you can extract the median of the parameters "psi" using one of these two options:
Thoropa_model$BUGSoutput$median$psi
Thoropa_model$BUGSoutput$summary["psi","50%"]
Updated with dummy data and dummycode - apologies, I assumed my question was simple and you could advice the best way without a reproducible example.
dummy<-data.frame(prodA=c(0,0,0,1,1,0,0,1),
prodB=c(0,0,1,1,0,1,1,0),
prodC=c(1,1,1,0,0,0,0,1))
dummy[,4:6]<-dummy[,1:3]
for (j in (1:nrow(dummy))){
for (i in 4:6){
dummy[j,i]<-ifelse(dummy[j,i]==1,colnames(dummy[i]),"")}
}
dummy2<-dummy[,4:6]
dummy$NewProds<-apply(dummy2,1,paste,collapse="")
dummy$NewProds<-gsub(".1","//",dummy$NewProds)
My second attempt is as:
prods<-dummy[,1:3]
prods[,4:6]<-dummy[,1:3]
for (i in 4:6){
prods[,i]<-colnames(prods[i-3])
}
prods[,7:9]<-prods[,4:6]
#works, but I will need multiple ifs for this to work, suggesting this
#won't be very efficient
prods[,10]<-ifelse(prods[,1]==1,prods[,4],"")
Original Post Follows:
I am playing with the Santander Product recommendation dataset from Kaggle. I have identified which products have been purchased from one month to another. This means I have 23 columns of 1's ( when a new product is added) and 0's (when not).
I created the following code to return the column name when a product has been purchased. It works great on a sample of 6 lines, but it runs forever when I try this on the 48k customers who changed, let alone the million in the dataset.
Is there another way to do this?
df2[,99:122]<-df2[,72:95]
for (j in (1:nrow(df2))){
for (i in 99:122){
df2[j,i]<-ifelse(df2[j,i]==1,colnames(df2[i]),"")}
}
df22<-df2[,99:122]
df2$NewProds<-apply(df22,1,paste,collapse="")
df2$NewProds<-gsub("change.1","//",df2$NewProds)
I figured the challenge was that I am looking at every variable and so started with another approach whereby I would take a couple of versions of the data, and then do an if variable is 1 then take the name. However I couldn't get this to work, and I think I come to the same issue.
#copy a bunch of 1's and 0's
prods<-df2[,72:95]
#repeat and overwrite with colnames
prods[,25:48]<-df2[,72:95]
for (i in 25:48){
prods[,i]<-colnames(prods[i-24])
}
prods[,49:72]<-prods[,25:48]
#attempt to only populate colnames if it was originally a 1 - doesn't work
prod[,49]<-ifelse(prod[,1]==1,prod[,25],"")
I haven't provided any data but I hope you can see what I am tring to do and can advise on efficient ways of doing this.
Thanks in advance,
J
Using apply as #AndersEllernBilgrau illustrated is one obvious way to do it, but it will be slow for data sets with many rows.
dummy[["NewProds"]] <- do.call(
paste,
c(mapply(ifelse,
dummy,
names(dummy),
MoreArgs = list(no = ""),
SIMPLIFY = FALSE),
sep = "//"))
is a bit harder to follow, but it will be much faster:
library(microbenchmark)
n <- 10000
dummy <- data.frame(prodA = rep(c(0,0,0,1,1,0,0,1), n),
prodB = rep(c(0,0,1,1,0,1,1,0), n),
prodC = rep(c(1,1,1,0,0,0,0,1), n))
microbenchmark(
do.call = do.call(
paste,
c(mapply(ifelse,
dummy,
names(dummy),
MoreArgs = list(no = ""),
SIMPLIFY = FALSE),
sep = "//")),
apply = apply(
dummy == 1,
1,
function(x) paste0(names(which(x)), collapse = "//")
))
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## do.call 63.92695 65.44777 72.07261 67.8667 73.3850 184.5151 100 a
## apply 296.81323 364.31947 404.71894 397.0927 443.7223 683.3892 100 b
Without data, I have a hard time understanding precisely what you want to do.
A couple of things are (almost) certain however:
You probably do not need for loops.
You should used R's vectorized functions, the dataset is not that big
Using some toy data, does the following do what you want?
d <- 23
n <- 46e3
# Simulate some toy data
df <- data.frame(matrix(rbinom(d*n, 1, 0.1), n, d),
row.names = paste0("row", 1:n))
head(df)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23
row1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
row2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
row3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
row4 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
row5 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
row6 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
# Paste together the colnames of all non-zero rows
res <- apply(df == 1, 1, function(x) paste0(names(which(x)), collapse = "-"))
head(res)
# row1 row2 row3 row4 row5 row6
#"X8-X16" "X1" "X8-X20" "X4-X11-X20" "X7-X15" "X4-X18-X21"
I.e. res is here a character vector of length n with the colnames of each row the corresponding to 1 entries pasted together (with separator -). This it at least what it appears to me what your code is doing conceptually.
I am using 'neuralnet' package in R to train a model for 'wine' dataset.
Below is the code that I have come up with so far-
library(neuralnet)
library(rattle)
library(rattle.data)
# load 'wine' dataset-
data(wine)
D <- as.data.frame(wine, stringsAsFactors=FALSE)
# replace 'Type' response variable (with values- 1, 2, 3) by 3 dummy variables-
D$wine1 <- 0
D$wine1[D$Type == 1] <- 1
D$wine2 <- 0
D$wine2[D$Type == 2] <- 1
D$wine3 <- 0
D$wine3[D$Type == 3] <- 1
# create formula to be used-
wine_formula <- as.formula(wine1 + wine2 + wine3 ~ Alcohol + Malic + Ash + Alcalinity + Magnesium + Phenols + Flavanoids + Nonflavanoids + Proanthocyanins + Color + Hue + Dilution + Proline)
# split dataset into training and testing datasets-
train_indices <- sample(1:nrow(wine), floor(0.7 * nrow(wine)), replace = F)
training <- D[train_indices, ]
testing <- D[-train_indices, ]
# train neural network model-
wine_nn <- neuralnet(wine_formula, data = training, hidden = c(5, 3), linear.output = FALSE, stepmax = 1e+07)
# make predictions using 'compute()'-
preds <- compute(wine_nn, testing[, 2:14])
# create a final data frame 'results' containing predicted & actual values-
results <- as.data.frame(preds$net.result)
results <- cbind(results, testing$wine1, testing$wine2, testing$wine3)
# rename the data frame-
names(results) <- c("Pred_Wine1", "Pred_Wine2", "Pred_Wine3", "Actual_Wine1", "Actual_Wine2", "Actual_Wine3")
The task that I have now is to convert the values in attributes "Pred_Wine1", "Pred_Wine2" and "Pred_Wine3" to 1s and 0s so that I can create a confusion matrix and test for model accuracy.
How should I go about it because "Pred_Wine1", "Pred_Wine2", "Pred_Wine3" contain calculated values which are in between 0 and 1.
Any suggestions?
Thanks!
I think you need label encoding here.
Let's say your data frame is called df. This will convert the values in your features into numeric. So, if Pred_Wine1 contains a,b it will convert it to 0,1 or vice-versa.
Try this:
features <- c("Pred_Wine1", "Pred_Wine2","Pred_Wine3")
for(f in features)
{
levels <- unique(df[[f]])
df[[i]] <- as.integer(factor(df[[i]], levels=levels))
}
Something like:
> head(results)
Pred_Wine1
1 1.00000000000000000
14 1.00000000000000000
17 1.00000000000000000
21 0.00000001901851182
26 0.21287781596598065
27 1.00000000000000000
Pred_Wine2
1 0.00000000000000000000000000000000000000000000000000015327712484
14 0.00000000000000000000000000000000000000000000000000009310376079
17 0.00000000000000000000000000000000000000000000000000009435487922
21 0.99999999363562386278658777882810682058334350585937500000000000
26 0.78964805454441211463034733242238871753215789794921875000000000
27 0.00000000000000000000000000000000000000000000000000009310386461
Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 5.291055036e-10 1 0 0
14 1.336129635e-09 1 0 0
17 1.303396935e-09 1 0 0
21 8.968513318e-122 1 0 0
26 1.623066411e-111 1 0 0
27 1.336126866e-09 1 0 0
> class <- apply(results[1:3], 1, which.max)
> results[1:3] <- 0
> head(results)
Pred_Wine1 Pred_Wine2 Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 0 0 0 1 0 0
14 0 0 0 1 0 0
17 0 0 0 1 0 0
21 0 0 0 1 0 0
26 0 0 0 1 0 0
27 0 0 0 1 0 0
> for (r in names(class)) {results[r,class[r]] <- 1}
> head(results)
Pred_Wine1 Pred_Wine2 Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 1 0 0 1 0 0
14 1 0 0 1 0 0
17 1 0 0 1 0 0
21 0 1 0 1 0 0
26 0 1 0 1 0 0
27 1 0 0 1 0 0
In the package rnn there is an example of how to carry out the training of the network, which is described in this link (example 1). In the approach of this package the inputs are given in the format of a 3D array, where the dim 1: samples; Dim 2: time; Dim 3: variables, however not making explicit the division of inputs and targets (inputs and targets, which is a common approach in RNA packages). Moreover, in the package description both the entries and the targets must have the same dimension. So, how can I define my dataset for the recurrent neural network in the rnn package? Data to a reproductive example.
My train data (inputs) on DF, first five rows:
> data[1:5,2:14]
Ibiara.P_t Ibiara.P_t_1 Ibiara.P_t_2 Nova.Olinda.P_t_1 Princesa.Isabel.P_t_1 Boa.ventura.P_t_1 Boa.Ventura.P_t_2
1966-01-01 0 0 0 0 0 0 0
1966-01-02 0 0 0 0 0 0 0
1966-01-03 0 0 0 0 0 0 0
1966-01-04 0 0 0 0 0 0 0
1966-01-05 0 0 0 0 0 0 0
Piancó.P_t Piancó.P_t_1 Piancó.P_t_2 Q_t_1 Q_t_2 Q_t_3
1966-01-01 0 0 0 0 0 0
1966-01-02 0 0 0 0 0 0
1966-01-03 0 0 0 0 0 0
1966-01-04 0 0 0 0 0 0
1966-01-05 0 0 0 0 0 0
My target data, first five rows:
> data[1:5,1]
Q_t
1966-01-01 0
1966-01-02 0
1966-01-03 0
1966-01-04 0
1966-01-05 0
That was my code to define data:
# Scaling data for the NN
maxs <- apply(data, 2, max)
mins <- apply(data, 2, min)
scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))
# Train-test split
train_ <- scaled[1:train_days,]
test_ <- scaled[(train_days+1):nrow(data),]
X <- t(as.matrix(train_[,1]))
Y <- t(as.matrix(test_[,2:14]))
# Train model
model <- trainr(Y = Y,
X = X,
learningrate = 0.01,
hidden_dim = 10,
numepochs = 10)
I am attempting to reformat the data set my.data to obtain the output shown below the my.data2 statement. Specifically, I want to put the last 4 columns of my.data on one line per record.id, where the last four
columns of my.data will occupy columns 2-5 of the new data matrix if group=1 and columns 6-9 if group=2.
I wrote the cumbersome code below, but the double for-loop is causing an error that I simply cannot locate.
Even if the double for-loop worked, I suspect there is a much more efficient way of accomplishing the
same thing - (maybe reshape?)
Thank you for any help correcting the double for-loop or with more efficient code.
my.data <- "record.id group s1 s2 s3 s4
1 1 2 0 1 3
1 2 0 0 0 12
2 1 0 0 0 0
3 1 10 0 0 0
4 1 1 0 0 0
4 2 0 0 0 0
8 2 0 2 2 0
9 1 0 0 0 0
9 2 0 0 0 0"
my.data2 <- read.table(textConnection(my.data), header=T)
# desired output
#
# 1 2 0 1 3 0 0 0 12
# 2 0 0 0 0 0 0 0 0
# 3 10 0 0 0 0 0 0 0
# 4 1 0 0 0 0 0 0 0
# 8 0 0 0 0 0 2 2 0
# 9 0 0 0 0 0 0 0 0
Code:
dat_sorted <- sort(unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_sorted)
my.data3 <- cbind(my.seq, my.data2)
group.min <- tapply(my.data3$group, my.data3$my.seq, min)
group.max <- tapply(my.data3$group, my.data3$my.seq, max)
# my.min <- group.min[my.data3[,1]]
# my.max <- group.max[my.data3[,1]]
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
x <- 1
for(i in 1:max(my.data3$my.seq)) {
for(j in group.min[i]:group.max[i]) {
if(my.data3[x,1] == i) my.records[i,1] = i
# the two lines below seem to be causing an error
if((my.data3[x,1] == i) & (my.data3[x,3] == 1)) (my.records[i,2:5] = my.data3[x,4:7])
if((my.data3[x,1] == i) & (my.data3[x,3] == 2)) (my.records[i,6:9] = my.data3[x,4:7])
x <- x + 1
}
}
You are right, reshape helps here.
library(reshape2)
m <- melt(my.data2, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)
record.id 1_s1 1_s2 1_s3 1_s4 2_s1 2_s2 2_s3 2_s4
1 1 2 0 1 3 0 0 0 12
2 2 0 0 0 0 0 0 0 0
3 3 10 0 0 0 0 0 0 0
4 4 1 0 0 0 0 0 0 0
5 8 0 0 0 0 0 2 2 0
6 9 0 0 0 0 0 0 0 0
Comparison:
dfTest <- data.frame(record.id = rep(1:10e5, each = 2), group = 1:2,
s1 = sample(1:10, 10e5 * 2, replace = TRUE),
s2 = sample(1:10, 10e5 * 2, replace = TRUE),
s3 = sample(1:10, 10e5 * 2, replace = TRUE),
s4 = sample(1:10, 10e5 * 2, replace = TRUE))
system.time({
...# Your code
})
Error in my.records[i, 1] = i : incorrect number of subscripts on matrix
Timing stopped at: 41.61 0.36 42.56
system.time({m <- melt(dfTest, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)})
user system elapsed
25.04 2.78 28.72
Julius' answer is better, but for completeness, I think I managed to get the following for-loop to work:
dat_x <- (unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_x)
my.data3 <- as.data.frame(cbind(my.seq, my.data2))
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
my.records <- as.data.frame(my.records)
my.records[,1] = unique(my.data3[,2])
for(i in 1:9) {
if(my.data3[i,3] == 1) (my.records[my.data3[i,1],c(2:5)] = my.data3[i,c(4:7)])
if(my.data3[i,3] == 2) (my.records[my.data3[i,1],c(6:9)] = my.data3[i,c(4:7)])
}