I am using 'neuralnet' package in R to train a model for 'wine' dataset.
Below is the code that I have come up with so far-
library(neuralnet)
library(rattle)
library(rattle.data)
# load 'wine' dataset-
data(wine)
D <- as.data.frame(wine, stringsAsFactors=FALSE)
# replace 'Type' response variable (with values- 1, 2, 3) by 3 dummy variables-
D$wine1 <- 0
D$wine1[D$Type == 1] <- 1
D$wine2 <- 0
D$wine2[D$Type == 2] <- 1
D$wine3 <- 0
D$wine3[D$Type == 3] <- 1
# create formula to be used-
wine_formula <- as.formula(wine1 + wine2 + wine3 ~ Alcohol + Malic + Ash + Alcalinity + Magnesium + Phenols + Flavanoids + Nonflavanoids + Proanthocyanins + Color + Hue + Dilution + Proline)
# split dataset into training and testing datasets-
train_indices <- sample(1:nrow(wine), floor(0.7 * nrow(wine)), replace = F)
training <- D[train_indices, ]
testing <- D[-train_indices, ]
# train neural network model-
wine_nn <- neuralnet(wine_formula, data = training, hidden = c(5, 3), linear.output = FALSE, stepmax = 1e+07)
# make predictions using 'compute()'-
preds <- compute(wine_nn, testing[, 2:14])
# create a final data frame 'results' containing predicted & actual values-
results <- as.data.frame(preds$net.result)
results <- cbind(results, testing$wine1, testing$wine2, testing$wine3)
# rename the data frame-
names(results) <- c("Pred_Wine1", "Pred_Wine2", "Pred_Wine3", "Actual_Wine1", "Actual_Wine2", "Actual_Wine3")
The task that I have now is to convert the values in attributes "Pred_Wine1", "Pred_Wine2" and "Pred_Wine3" to 1s and 0s so that I can create a confusion matrix and test for model accuracy.
How should I go about it because "Pred_Wine1", "Pred_Wine2", "Pred_Wine3" contain calculated values which are in between 0 and 1.
Any suggestions?
Thanks!
I think you need label encoding here.
Let's say your data frame is called df. This will convert the values in your features into numeric. So, if Pred_Wine1 contains a,b it will convert it to 0,1 or vice-versa.
Try this:
features <- c("Pred_Wine1", "Pred_Wine2","Pred_Wine3")
for(f in features)
{
levels <- unique(df[[f]])
df[[i]] <- as.integer(factor(df[[i]], levels=levels))
}
Something like:
> head(results)
Pred_Wine1
1 1.00000000000000000
14 1.00000000000000000
17 1.00000000000000000
21 0.00000001901851182
26 0.21287781596598065
27 1.00000000000000000
Pred_Wine2
1 0.00000000000000000000000000000000000000000000000000015327712484
14 0.00000000000000000000000000000000000000000000000000009310376079
17 0.00000000000000000000000000000000000000000000000000009435487922
21 0.99999999363562386278658777882810682058334350585937500000000000
26 0.78964805454441211463034733242238871753215789794921875000000000
27 0.00000000000000000000000000000000000000000000000000009310386461
Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 5.291055036e-10 1 0 0
14 1.336129635e-09 1 0 0
17 1.303396935e-09 1 0 0
21 8.968513318e-122 1 0 0
26 1.623066411e-111 1 0 0
27 1.336126866e-09 1 0 0
> class <- apply(results[1:3], 1, which.max)
> results[1:3] <- 0
> head(results)
Pred_Wine1 Pred_Wine2 Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 0 0 0 1 0 0
14 0 0 0 1 0 0
17 0 0 0 1 0 0
21 0 0 0 1 0 0
26 0 0 0 1 0 0
27 0 0 0 1 0 0
> for (r in names(class)) {results[r,class[r]] <- 1}
> head(results)
Pred_Wine1 Pred_Wine2 Pred_Wine3 Actual_Wine1 Actual_Wine2 Actual_Wine3
1 1 0 0 1 0 0
14 1 0 0 1 0 0
17 1 0 0 1 0 0
21 0 1 0 1 0 0
26 0 1 0 1 0 0
27 1 0 0 1 0 0
Related
I have a huge dataset and I want to compute the correlation of each item with the total score of the scale, but without containing the item. Now I could do it separately for each item, but I am trying to do a loop, so that it is a bit easier.
Example dataset:
dat <- read.table(header=TRUE, text="
ItemX1 ItemX2 ItemX3 ItemX4 ItemX5 ItemX6 ItemY1 ItemY2 ItemY3 ItemY4 ItemY5 ItemY6
1 1 0 1 0 1 1 1 0 1 0 1
0 1 0 0 0 1 0 1 0 0 0 1
1 1 0 1 0 1 1 1 0 1 0 0
1 0 1 0 0 1 1 0 1 0 0 1
1 1 0 1 1 1 1 1 0 1 1 0
0 0 1 1 0 0 0 0 1 1 0 0
")
xscore <- rowSums(select(dat, starts_with("ItemX")))
Now I could do it like the following, but as I have 107 Items it is a bit much.
cor(dat$ItemX1,rowSums(select(dat, starts_with("ItemX") & -"ItemX1")),use="pairwise.complete.obs")
cor(dat$ItemX2,rowSums(select(dat, starts_with("ItemX") & -"ItemX2")),use="pairwise.complete.obs")
cor(dat$ItemX3,rowSums(select(dat, starts_with("ItemX") & -"ItemX3")),use="pairwise.complete.obs")
cor(dat$ItemX4,rowSums(select(dat, starts_with("ItemX") & -"ItemX4")),use="pairwise.complete.obs")
cor(dat$ItemX5,rowSums(select(dat, starts_with("ItemX") & -"ItemX5")),use="pairwise.complete.obs")
cor(dat$ItemX6,rowSums(select(dat, starts_with("ItemX") & -"ItemX6")),use="pairwise.complete.obs")
That's why I'm trying out the following loop, but now I don't know how to specify that the rowSums is calculated without the item which is in use for the correlation.
variables <- names(dat)
names.item <- c(grep("ItemX", variables, value = TRUE))
item.diff.p <- data.frame(matrix(NA, ncol=2, nrow=(length(names.item)-1)))
names(item.diff.p) <- c("Item", "cor")
length(names.item)
for(i in 1:(length(names.item))-1){
item <- names.item[i]
par <- cor(dat[,names(dat)[grepl("ItemX",names(dat))]],
rowSums(select(dat, starts_with("ItemX"))),use="pairwise.complete.obs")
item.diff.p[i, c("cor")]
}
par
Thank you all!
You can iterate through the columns of a subsetted dataframe, and calculate:
X_dat = dat[,grep("^ItemX",colnames(dat))]
res = sapply(1:ncol(X_dat),function(i){
cor(X_dat[,i],rowSums(X_dat[,-i]),use="p")
})
names(res) = colnames(X_dat)
res
ItemX1 ItemX2 ItemX3 ItemX4 ItemX5 ItemX6
0.6324555 0.1250000 -0.7500000 0.1250000 0.4152274 0.2335497
A sample of data set:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0), risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0))
And I need output data set which would contain new column where only maximum values of cumulative sum will be maintained:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0),
risk_11111_max = c(0,0,0,0,3,0,0,0,0,4,0),
risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0),
risk_11112_max = c(0,0,0,0,3,0,0,2,0,1,0))
I am guessing some logical subseting of vectors colwise with apply and extracting max value with position index, and mutate into new variables.
I dont know how to extract values for new variable.
Thanks
Something like this with base R:
lapply(testdf, function(x) {
x[diff(x) > 0] <- 0
x
})
And to have all in one data.frame:
dfout <- cbind(testdf, lapply(testdf, function(x) {
x[diff(x) > 0] <- 0
x
}))
names(dfout) <- c(names(testdf), 'risk_1111_max', 'risk_1112_max')
Output:
risk_11111 risk_11112 risk_1111_max risk_1112_max
1 0 0 0 0
2 0 0 0 0
3 1 1 0 0
4 2 2 0 0
5 3 3 3 3
6 0 0 0 0
7 1 1 0 0
8 2 2 0 2
9 3 0 0 0
10 4 1 4 1
11 0 0 0 0
I'm attempting to use formula to generate a model.matrix object to be used in a custom optimizer function.
It works great for the most part, but when it comes to factor-factor interactions, I'd like to specify the interaction as dummy coded rather than effects coded.
Take for example the following data set:
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
head(myDF)
Both the : and / operators create an effects coded model matrix (the latter being an additive effects structure, I think).
head(model.matrix(formula(Y ~ X1 : X2), data = myDF))
head(model.matrix(formula(Y ~ X1 / X2), data = myDF))
But I am looking to generate a dummy coded model matrix, which would have the first level of X1 omitted for each level of X2. Resulting in these terms (columns):
X1B:X2A
X1C:X2A
X1B:X2B
X1C:X2B
X1B:X2C
X1C:X2C
Is there a way to achieve this?
Is ~X1:X2-1 what you're looking for?
Make test data (as above):
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
Generate model matrix:
mm1 <- model.matrix(formula(Y ~ X1 : X2 - 1), data = myDF)
head(mm1)
## X1A:X2A X1B:X2A X1C:X2A X1A:X2B X1B:X2B X1C:X2B X1A:X2C X1B:X2C X1C:X2C
## 1 0 0 0 0 1 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 1 0 0 0
## 5 0 0 0 1 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0
Or perhaps you really do just want some columns excluded:
mm0 <- model.matrix(formula(Y ~ X1 : X2), data = myDF)
mm0B <- mm0[,!grepl("(Intercept|^X1A:)",colnames(mm0))]
## X1B:X2A X1C:X2A X1B:X2B X1C:X2B X1B:X2C X1C:X2C
## 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 1 0
## 4 0 0 0 1 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
I thought you also might be interested in sum-to-zero contrasts:
mm2 <- model.matrix(formula(Y ~ X1 : X2 - 1), data = myDF,
contrasts.arg=list(X1=contr.sum,X2=contr.sum))
Below is another trial.
set.seed(1987)
myDF <- data.frame(Y = rnorm(100),
X1 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]),
X2 = factor(LETTERS[sample(1:3, 100, replace = TRUE)]))
# row subsetting to exclude A
modelMat <- model.matrix(formula(Y ~ X1 : X2), data = myDF[myDF$X1 != 'A',])
# column subsetting to eliminate all columns including X1A
modelMat <- modelMat[,substring(colnames(modelMat), 1, 3) != "X1A"]
head(modelMat)
(Intercept) X1B:X2A X1C:X2A X1B:X2B X1C:X2B X1B:X2C X1C:X2C
1 1 0 0 1 0 0 0
3 1 0 0 0 0 1 0
4 1 0 0 0 1 0 0
8 1 0 0 0 0 1 0
10 1 0 0 0 0 0 1
11 1 0 0 0 0 0 1
I have a dataframe (.txt) which looks like this [where "dayX" = the day of death in a survival assay in fruitflies, the numbers beneath are the number of flies to die in that treatment combination on that day, X or A are treaments, m & f are also treatments, the first number is the line, the second number is the block]
line day1 day2 day3 day4 day5
1 Xm1.1 0 0 0 2 0
2 Xm1.2 0 0 1 0 0
3 Xm2.1 1 1 0 0 0
4 Xm2.2 0 0 0 3 1
5 Xf1.1 0 3 0 0 1
6 Xf1.2 0 0 1 0 0
7 Xf2.1 2 0 2 0 0
8 Xf2.2 1 0 1 0 0
9 Am1.1 0 0 0 0 2
10 Am1.2 0 0 1 0 0
11 Am2.1 0 2 0 0 1
12 Am2.2 0 2 0 0 0
13 Af1.1 3 0 0 1 0
14 Af1.2 0 1 3 0 0
15 Af1.1 0 0 0 1 0
16 Af2.2 1 0 0 0 0
and want it to become this using R->
XA mf line block individual age
1 X m 1 1 1 4
2 X m 1 1 2 4
3 X m 1 2 1 3
and so on...
the resulting dataframe collects the "age" value from the day the individual died, as scored in the upper dataframe, for example there were two flies that died on the 4th day (day4) in treatment Xm1.1 therefore R creates two rows, one containing information extracted regarding the first individual and thus being labelled as individual "1", then another row with the same information except labelled as individual "2".. if a 3rd individual died in the same treatment on day 5, there would be a third row which is the same as the above two rows except the "age" would be "5" and individual would be "3". When it moves on to the next treatment row, in this case Xm1.2, the first individual to die within that treatment set would be labelled as individual "1" (which in this case dies on day 3). In my example there is a total of 38 deaths, therefore I am trying to get R to build a df which is 38*6 (excl. headers).
is there a way to take my dataframe [the real version is approx 50*640 with approx 50 individuals per unique combination of X/A, m/f, line (1:40), block (1-4) so ~32000 individual deaths] to an end dataframe of 6*~32000 in an automated way?
both of these example dataframes can be built using this code if it helps you to try out solutions:
test<-data.frame(1:16);colnames(test)=("line")
test$line=c("Xm1.1","Xm1.2","Xm2.1","Xm2.2","Xf1.1","Xf1.2","Xf2.1","Xf2.2","Am1.1","Am1.2","Am2.1","Am2.2","Af1.1","Af1.2","Af2.1","Af2.2")
test$day1=rep(0,16);test$day2=rep(0,16);test$day3=rep(0,16);test$day4=rep(0,16);test$day5=rep(0,16)
test$day4[1]=2;test$day3[2]=1;test$day2[3]=1;test$day4[4]=3;test$day5[5]=1;
test$day3[6]=1;test$day1[7]=2;test$day1[8]=1;test$day5[9]=3;test$day3[10]=1;
test$day2[11]=2;test$day2[12]=2;test$day4[13]=1;test$day3[14]=3;test$day4[15]=1;
test$day1[16]=1;test$day3[7]=2;test$day3[8]=1;test$day2[5]=3;test$day1[3]=1;
test$day5[11]=1;test$day5[9]=2;test$day5[4]=1;test$day1[13]=3;test$day2[14]=1;
test2=data.frame(rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3))
colnames(test2)=c("XA","mf","line","block","individual","age")
test2$XA[1]="X";test2$mf[1]="m";test2$line[1]=1;test2$block[1]=1;test2$individual[1]=1;test2$age[1]=4;
test2$XA[2]="X";test2$mf[2]="m";test2$line[2]=1;test2$block[2]=1;test2$individual[2]=2;test2$age[2]=4;
test2$XA[3]="X";test2$mf[3]="m";test2$line[3]=1;test2$block[3]=2;test2$individual[3]=1;test2$age[3]=3;
apologies for the awfully long way of making this dummy dataset, suffering from sleep deprivation and jetlag and haven't used R for months, if you run the code in R you will hopefully see better what I aim to do
-------------------------------------------------------------------------------------
By Rg255:
Currently stuck at this derived from #Arun's answer (I have added the strsplit (as.character(dt$line) , "" )) section to get around one error)
df=read.table("C:\\Users\\...\\data.txt",header=T)
require(data.table)
head(df[1:20])
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
list(individual = sequence(dd[dd>0]),
age = rep(which(dd>0), dd[dd>0])
)}, by=line]
out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
Produces the following output:
> df=read.table("C:\\Users\\..\\data.txt",header=T)
> require(data.table)
> head(df[1:20])
line Day4 Day6 Day8 Day10 Day12 Day14 Day16 Day18 Day20 Day22 Day24 Day26 Day28 Day30 Day32 Day34 Day36 Day38 Day40
1 Xm1.1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 4 2
2 Xm2.1 0 0 0 0 0 0 0 0 0 2 0 0 0 1 2 1 0 2 0
3 Xm3.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1
4 Xm4.1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 2 3 8
5 Xm5.1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 3 3 3 6
6 Xm6.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
> dt <- as.data.table(df)
> dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
+ list(individual = sequence(dd[dd>0]),
+ age = rep(which(dd>0), dd[dd>0])
+ )}, by=line]
> out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
Warning message:
In function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 1)
> setnames(out, c("XA", "mf", "line", "block"))
> out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
Error in `[.data.table`(out, , `:=`(line = as.numeric(line), block = as.numeric(block))) :
LHS of := must be a single column name, when with=TRUE. When with=FALSE the LHS may be a vector of column names or positions.
In addition: Warning message:
In eval(expr, envir, enclos) : NAs introduced by coercion
> out <- cbind(out, dt[, list(individual, age)])
>
Here goes a data.table solution. The line column must have unique values.
require(data.table)
df <- read.table("data.txt", header=TRUE, stringsAsFactors=FALSE)
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
list(individual = sequence(dd[dd>0]),
age = rep(which(dd>0), dd[dd>0])
)}, by=line]
out <- as.data.table(data.frame(do.call(rbind,
strsplit(gsub("([[:alpha:]])([[:alpha:]])([0-9]+)\\.([0-9]+)$",
"\\1 \\2 \\3 \\4", dt$line), " ")), stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
This works on your data.txt file.
I am attempting to reformat the data set my.data to obtain the output shown below the my.data2 statement. Specifically, I want to put the last 4 columns of my.data on one line per record.id, where the last four
columns of my.data will occupy columns 2-5 of the new data matrix if group=1 and columns 6-9 if group=2.
I wrote the cumbersome code below, but the double for-loop is causing an error that I simply cannot locate.
Even if the double for-loop worked, I suspect there is a much more efficient way of accomplishing the
same thing - (maybe reshape?)
Thank you for any help correcting the double for-loop or with more efficient code.
my.data <- "record.id group s1 s2 s3 s4
1 1 2 0 1 3
1 2 0 0 0 12
2 1 0 0 0 0
3 1 10 0 0 0
4 1 1 0 0 0
4 2 0 0 0 0
8 2 0 2 2 0
9 1 0 0 0 0
9 2 0 0 0 0"
my.data2 <- read.table(textConnection(my.data), header=T)
# desired output
#
# 1 2 0 1 3 0 0 0 12
# 2 0 0 0 0 0 0 0 0
# 3 10 0 0 0 0 0 0 0
# 4 1 0 0 0 0 0 0 0
# 8 0 0 0 0 0 2 2 0
# 9 0 0 0 0 0 0 0 0
Code:
dat_sorted <- sort(unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_sorted)
my.data3 <- cbind(my.seq, my.data2)
group.min <- tapply(my.data3$group, my.data3$my.seq, min)
group.max <- tapply(my.data3$group, my.data3$my.seq, max)
# my.min <- group.min[my.data3[,1]]
# my.max <- group.max[my.data3[,1]]
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
x <- 1
for(i in 1:max(my.data3$my.seq)) {
for(j in group.min[i]:group.max[i]) {
if(my.data3[x,1] == i) my.records[i,1] = i
# the two lines below seem to be causing an error
if((my.data3[x,1] == i) & (my.data3[x,3] == 1)) (my.records[i,2:5] = my.data3[x,4:7])
if((my.data3[x,1] == i) & (my.data3[x,3] == 2)) (my.records[i,6:9] = my.data3[x,4:7])
x <- x + 1
}
}
You are right, reshape helps here.
library(reshape2)
m <- melt(my.data2, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)
record.id 1_s1 1_s2 1_s3 1_s4 2_s1 2_s2 2_s3 2_s4
1 1 2 0 1 3 0 0 0 12
2 2 0 0 0 0 0 0 0 0
3 3 10 0 0 0 0 0 0 0
4 4 1 0 0 0 0 0 0 0
5 8 0 0 0 0 0 2 2 0
6 9 0 0 0 0 0 0 0 0
Comparison:
dfTest <- data.frame(record.id = rep(1:10e5, each = 2), group = 1:2,
s1 = sample(1:10, 10e5 * 2, replace = TRUE),
s2 = sample(1:10, 10e5 * 2, replace = TRUE),
s3 = sample(1:10, 10e5 * 2, replace = TRUE),
s4 = sample(1:10, 10e5 * 2, replace = TRUE))
system.time({
...# Your code
})
Error in my.records[i, 1] = i : incorrect number of subscripts on matrix
Timing stopped at: 41.61 0.36 42.56
system.time({m <- melt(dfTest, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)})
user system elapsed
25.04 2.78 28.72
Julius' answer is better, but for completeness, I think I managed to get the following for-loop to work:
dat_x <- (unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_x)
my.data3 <- as.data.frame(cbind(my.seq, my.data2))
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
my.records <- as.data.frame(my.records)
my.records[,1] = unique(my.data3[,2])
for(i in 1:9) {
if(my.data3[i,3] == 1) (my.records[my.data3[i,1],c(2:5)] = my.data3[i,c(4:7)])
if(my.data3[i,3] == 2) (my.records[my.data3[i,1],c(6:9)] = my.data3[i,c(4:7)])
}