How to one-hot-encode factor variables with data.table? - r

For those unfamiliar, one-hot encoding simply refers to converting a column of categories (i.e. a factor) into multiple columns of binary indicator variables where each new column corresponds to one of the classes of the original column. This example will explain it better:
dt <- data.table(
ID=1:5,
Color=factor(c("green", "red", "red", "blue", "green"), levels=c("blue", "green", "red", "purple")),
Shape=factor(c("square", "triangle", "square", "triangle", "cirlce"))
)
dt
ID Color Shape
1: 1 green square
2: 2 red triangle
3: 3 red square
4: 4 blue triangle
5: 5 green cirlce
# one hot encode the colors
color.binarized <- dcast(dt[, list(V1=1, ID, Color)], ID ~ Color, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
# Prepend Color_ in front of each one-hot-encoded feature
setnames(color.binarized, setdiff(colnames(color.binarized), "ID"), paste0("Color_", setdiff(colnames(color.binarized), "ID")))
# one hot encode the shapes
shape.binarized <- dcast(dt[, list(V1=1, ID, Shape)], ID ~ Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
# Prepend Shape_ in front of each one-hot-encoded feature
setnames(shape.binarized, setdiff(colnames(shape.binarized), "ID"), paste0("Shape_", setdiff(colnames(shape.binarized), "ID")))
# Join one-hot tables with original dataset
dt <- dt[color.binarized, on="ID"]
dt <- dt[shape.binarized, on="ID"]
dt
ID Color Shape Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
1: 1 green square 0 1 0 0 0 1 0
2: 2 red triangle 0 0 1 0 0 0 1
3: 3 red square 0 0 1 0 0 1 0
4: 4 blue triangle 1 0 0 0 0 0 1
5: 5 green cirlce 0 1 0 0 1 0 0
This is something I do a lot, and as you can see it's pretty tedious (especially when my data has many factor columns). Is there an easier way to do this with data.table? In particular, I assumed dcast would let me one-hot-encode multiple columns at once, but when I try something like
dcast(dt[, list(V1=1, ID, Color, Shape)], ID ~ Color + Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
I get one column per combination of Color and Shape instead:
ID blue_cirlce blue_square blue_triangle green_cirlce green_square green_triangle red_cirlce red_square red_triangle purple_cirlce purple_square purple_triangle
1: 1 0 0 0 0 1 0 0 0 0 0 0 0
2: 2 0 0 0 0 0 0 0 0 1 0 0 0
3: 3 0 0 0 0 0 0 0 1 0 0 0 0
4: 4 0 0 1 0 0 0 0 0 0 0 0 0
5: 5 0 0 0 1 0 0 0 0 0 0 0 0

Here you go:
dcast(melt(dt, id.vars='ID'), ID ~ variable + value, fun = length)
# ID Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
#1: 1 0 1 0 0 1 0
#2: 2 0 0 1 0 0 1
#3: 3 0 0 1 0 1 0
#4: 4 1 0 0 0 0 1
#5: 5 0 1 0 1 0 0
To get the missing factors you can do the following:
res = dcast(melt(dt, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
setnames(res, c("ID", unlist(lapply(2:ncol(dt),
function(i) paste(names(dt)[i], levels(dt[[i]]), sep = "_")))))
res
# ID Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
#1: 1 0 1 0 0 0 1 0
#2: 2 0 0 1 0 0 0 1
#3: 3 0 0 1 0 0 1 0
#4: 4 1 0 0 0 0 0 1
#5: 5 0 1 0 0 1 0 0

Using model.matrix:
> cbind(dt[, .(ID)], model.matrix(~ Color + Shape, dt))
ID (Intercept) Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1: 1 1 1 0 0 1 0
2: 2 1 0 1 0 0 1
3: 3 1 0 1 0 1 0
4: 4 1 0 0 0 0 1
5: 5 1 1 0 0 0 0
This makes the most sense if you're doing modelling.
If you want to suppress the intercept (and restore the aliased column for the 1st variable):
> cbind(dt[, .(ID)], model.matrix(~ Color + Shape - 1, dt))
ID Colorblue Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1: 1 0 1 0 0 1 0
2: 2 0 0 1 0 0 1
3: 3 0 0 1 0 1 0
4: 4 1 0 0 0 0 1
5: 5 0 1 0 0 0 0
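If you want a full set of indicator columns for every factor (note that in the output above Shape still drops one level), one option not shown in the original answer is to override the contrasts. A minimal sketch, assuming the same dt:
# Sketch: full dummy coding for all factors by overriding the default contrasts.
mm <- model.matrix(~ Color + Shape - 1, data = dt,
                   contrasts.arg = lapply(dt[, .(Color, Shape)], contrasts, contrasts = FALSE))
cbind(dt[, .(ID)], mm)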

Here's a more generalized version of eddi's solution:
one_hot <- function(dt, cols="auto", dropCols=TRUE, dropUnusedLevels=FALSE){
  # One-hot-encode unordered factors in a data.table
  # If cols = "auto", each unordered factor column in dt will be encoded (or specify a vector of column names to encode)
  # If dropCols = TRUE, the original factor columns are dropped
  # If dropUnusedLevels = TRUE, unused factor levels are dropped
  # Automatically get the unordered factor columns
  if(cols[1] == "auto") cols <- colnames(dt)[which(sapply(dt, function(x) is.factor(x) & !is.ordered(x)))]
  # Build tempDT containing an ID column and the 'cols' columns
  tempDT <- dt[, cols, with=FALSE]
  tempDT[, ID := .I]
  setcolorder(tempDT, unique(c("ID", colnames(tempDT))))
  for(col in cols) set(tempDT, j=col, value=factor(paste(col, tempDT[[col]], sep="_"), levels=paste(col, levels(tempDT[[col]]), sep="_")))
  # One-hot-encode
  if(dropUnusedLevels == TRUE){
    newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = T, fun = length)
  } else {
    newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
  }
  # Combine binarized columns with the original dataset
  result <- cbind(dt, newCols[, !"ID"])
  # If dropCols = TRUE, remove the original factor columns
  if(dropCols == TRUE){
    result <- result[, !cols, with=FALSE]
  }
  return(result)
}
Note that for large datasets it's probably better to use Matrix::sparse.model.matrix.
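For example, a minimal sparse sketch, assuming the same dt as above:
# Minimal sketch: sparse analogue of the model.matrix approach, useful when the
# indicator matrix is large and mostly zeros.
library(Matrix)
sparse_ohe <- sparse.model.matrix(~ Color + Shape - 1, data = dt)
sparse_ohe  # a dgCMatrix with one column per factor level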
Update (2017)
This is now in the package mltools.
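A brief usage sketch (the argument names mirror the function above; check your installed mltools version for the exact signature):
library(data.table)
library(mltools)
one_hot(dt, cols = "auto", dropCols = TRUE, dropUnusedLevels = FALSE)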

If no one posts a clean way to write this out by hand each time, you can always make a function/macro:
OHE <- function(dt, grp, encodeCols) {
  grpSymb = as.symbol(grp)
  for (col in encodeCols) {
    colSymb = as.symbol(col)
    eval(bquote(
      dt[, .SD
       ][, V1 := 1
       ][, dcast(.SD, .(grpSymb) ~ .(colSymb), fun=sum, value.var='V1')
       ][, setnames(.SD, setdiff(colnames(.SD), grp), sprintf("%s_%s", col, setdiff(colnames(.SD), grp)))
       ][, dt <<- dt[.SD, on=grp]
       ]
    ))
  }
  dt
}
dtOHE = OHE(dt, 'ID', c('Color', 'Shape'))
dtOHE
ID Color Shape Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
1: 1 green square 0 1 0 0 1 0
2: 2 red triangle 0 0 1 0 0 1
3: 3 red square 0 0 1 0 1 0
4: 4 blue triangle 1 0 0 0 0 1
5: 5 green cirlce 0 1 0 1 0 0

In a few lines you can solve this problem:
library(tidyverse)
dt2 <- spread(dt,Color,Shape)
dt3 <- spread(dt,Shape,Color)
df <- cbind(dt2,dt3)
df2 <- apply(df, 2, function(x){sapply(x, function(y){
  ifelse(is.na(y), 0, 1)
})})
df2 <- as.data.frame(df2)
df <- cbind(dt,df2[,-1])
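As a side note (not part of the original answer), spread() has since been superseded by pivot_wider(); a rough sketch for a single factor column, which only creates columns for the levels that actually occur:
# Rough sketch with pivot_wider(), one factor at a time; unused levels such as
# "purple" are not created.
library(tidyverse)
dt %>%
  mutate(one = 1L) %>%
  pivot_wider(names_from = Color, values_from = one,
              values_fill = 0L, names_prefix = "Color_")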

Related

R remove columns from data.table that sum to 0 - still not working

This may seem like a duplicate question but maybe I am missing something here.
I have been trying to remove just the columns whose sum of absolute values adds to zero from a data.table.
I searched and found many solutions on this site that claim to work, and in fact, when I copy/paste the exact code, it does work. However, for some reason, I cannot reproduce it with my data.table.
The result of almost anything I do turns my data.table into a list. I even tried converting my data.table to a data.frame to try these solutions, with no luck.
from here:
SelectVar[, colSums(SelectVar != 0) > 0]
Does not work.
SelectVar[, !apply(SelectVar == 0, 2, all)]
Does not work either.
remove_zero_cols <- function(df) {
  rem_vec <- NULL
  for(i in 1:ncol(df)){
    this_sum <- summary(df[,i])
    zero_test <- length(which(this_sum == 0))
    if(zero_test == 6) {
      rem_vec[i] <- names(df)[i]
    }
  }
  features_to_remove <- rem_vec[!is.na(rem_vec)]
  rem_ind <- which(names(df) %in% features_to_remove)
  df <- df[,-rem_ind]
  return(df)
}
This function also does not work.
I checked the class of each parameter and they are all either numeric or integer types. I also checked for any NA's and found none.
Any suggestions?
Add with = FALSE to the first solution you referenced if you are working on a data.table.
# Create example data frame
SelectVar <- read.table(text = " a b c d e f g h i j k l ll m n o p q r
1 Dxa8 Dxa8 0 Dxa8 Dxa8 0 Dxa8 Dxa8 0 0 0 0 0 0 0 0 0 Dxc8 0
2 Dxb8 Dxc8 0 Dxe8 Dxi8 0 tneg tpos 0 0 0 0 0 0 0 0 0 Dxi8 0",
header = TRUE, stringsAsFactors = FALSE)
# Convert to a data.table
library(data.table)
setDT(SelectVar)
SelectVar[, colSums(SelectVar != 0) > 0, with = FALSE]
# a b d e g h q
# 1: Dxa8 Dxa8 Dxa8 Dxa8 Dxa8 Dxa8 Dxc8
# 2: Dxb8 Dxc8 Dxe8 Dxi8 tneg tpos Dxi8
The OP has requested to remove just the columns where the sum of absolute values adds to zero. Later on, he has clarified that he wants to drop data.table columns that contain ONLY 0's all the way down each row.
This can be achieved by using the any() function:
library(data.table)
#create sample data
n_rows <- 10L
n_cols <- 5L
DT <- data.table(id = 1:n_rows)
dat_cols <- sprintf("dat%i", seq.int(n_cols))
for (j in seq.int(n_cols)) set(DT, NULL, dat_cols[j], 0L)
set.seed(1L)
DT[sample.int(n_rows, 0.1 * n_rows), (sample.int(n_cols, 0.5 * n_cols)) := 1L]
DT
id dat1 dat2 dat3 dat4 dat5
1: 1 0 0 0 0 0
2: 2 0 1 1 0 0
3: 3 0 0 0 0 0
4: 4 0 0 0 0 0
5: 5 0 0 0 0 0
6: 6 0 0 0 0 0
7: 7 0 0 0 0 0
8: 8 0 0 0 0 0
9: 9 0 0 0 0 0
10: 10 0 0 0 0 0
# find columns which are all zero using any()
dat_cols <- sprintf("dat%i", seq.int(n_cols))
zero_cols <- setDT(DT)[, lapply(.SD, function(x) !any(x)),
.SDcols = dat_cols]
# remove columns in place
DT[, (names(which(unlist(zero_cols)))) := NULL][]
id dat2 dat3
1: 1 0 0
2: 2 1 1
3: 3 0 0
4: 4 0 0
5: 5 0 0
6: 6 0 0
7: 7 0 0
8: 8 0 0
9: 9 0 0
10: 10 0 0
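If you prefer not to delete by reference, a small sketch (not from the original answer) that instead selects the columns of the DT built above that contain at least one non-zero value:
# Sketch: non-destructive variant; keep only columns with any non-zero value.
keep <- names(DT)[sapply(DT, function(x) any(x != 0))]
DT[, keep, with = FALSE]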
Here's a tidyverse solution. You could convert your data.table into a tibble and then go from there.
library(tidyverse)
df <- tibble(a = 1:5, b = -1:3, c = 0)
selection_criteria <- (colSums(abs(df)) != 0)
df[selection_criteria]
Before:
library(tidyverse)
DT = as_tibble(list(x=c(1,0), y=c(0,0)))
DT
A tibble: 2 x 2
x y
<dbl> <dbl>
1 1 0
2 0 0
Use:
DT1 = DT %>% select_if(any)
DT1
After:
A tibble: 2 x 1
x
<dbl>
1 1
2 0

sum up cells in a matrix according to different hierarchical levels

I am using R to make a heatmap from binary interactions. The matrix looks like the following:
9 401 562 68 71 569 700
9 0 1 0 0 0 0 1
401 0 0 1 0 0 na 1
562 0 1 0 1 1 0 1
68 1 1 0 0 0 0 1
71 1 na 0 0 na 0 1
569 1 1 0 1 0 0 0
700 0 0 0 0 0 0 0
Also, I have metadata corresponding to my Ids
compart group family category
9 Ex Prt A Ps
401 Ex Prt A Ps
562 Ex Prt B Rh
68 In Prt C En
71 In Act D Stp
569 In Act D Stp
700 Ex Act E Aqua
I would like to sum cells at different levels, e.g. here according to family. The table then looks like
A B C D E
A 1 1 0 0 1
B 1 0 0 na 1
C 2 0 0 0 1
D 3 0 1 0 0
E 0 0 0 0 0
I would also like to do it at the compart level, and so on.
I am looking for a solution that avoids doing this manually, which would take hours of work.
Your best bet is to flatten or "stretch out" the matrix. Try the following:
library(magrittr)
library(data.table)
library(reshape2)
## Let Ids be the metadata data.frame
DT_ids <- as.data.table(Ids, keep.rownames=TRUE)
# DT_ids[, rn := as.numeric(rn)]
setkey(DT_ids, rn)
## Let M be the interactions matrix
## Reshape the interactions data into a tall data.table
DT_interactions <- M %>%
as.data.table(keep.rownames=TRUE) %>%
melt(id.vars = "rn", value.name="interaction")
## Clean up the column names
setnames(DT_interactions, c("rn", "variable"), c("rn.rows", "rn.cols"))
## Add in two copies of the meta data
## one for "rows" of M and one for "cols" of M
DT_interactions[, paste0(names(DT_ids), ".rows") := DT_ids[.(rn.rows)]]
DT_interactions[, paste0(names(DT_ids), ".cols") := DT_ids[.(rn.cols)]]
## Set the key of DT_interactions
setkey(DT_interactions, rn.rows, rn.cols)
## NOW TO SUM UP
DT_interactions[, sum(interaction), by=c("family.rows", "family.cols")]
I would wrap that last part in a nice function
sumByMeta <- function(..., na.rm=TRUE) {
  byCols_simple <- list(...) %>% unlist
  byCols <- byCols_simple %>%
    lapply(paste0, c(".rows", ".cols")) %>%
    unlist
  L <- length(byCols)
  formula <- paste(byCols[1:(L/2)], byCols[(L/2 + 1) : L],
                   sep=ifelse(L > 2, " + ", "~"), collapse=" ~ ")
  DT_interactions[, sum(interaction, na.rm=na.rm), by=byCols] %>%
    dcast.data.table(formula=as.formula(formula), value.var="V1") %>%
    setnames(old=seq_along(byCols_simple), new=byCols_simple) %>% {.}
}
## EG:
sumByMeta("family")
# family A B C D E
# 1: A 1 1 0 0 2
# 2: B 1 0 1 1 1
# 3: C 2 0 0 0 1
# 4: D 3 0 1 0 1
# 5: E 0 0 0 0 0
## Try running these
sumByMeta("family")
sumByMeta("group")
sumByMeta("family", "group")
sumByMeta("family", "group", "compart")
sumByMeta("family", "compart")

splitting dataframe with collated points into individuals in R

I have a dataframe (.txt) which looks like this (where "dayX" is the day of death in a survival assay on fruit flies, the numbers beneath are the number of flies that died in that treatment combination on that day, X or A are treatments, m and f are also treatments, the first number is the line, and the second number is the block):
line day1 day2 day3 day4 day5
1 Xm1.1 0 0 0 2 0
2 Xm1.2 0 0 1 0 0
3 Xm2.1 1 1 0 0 0
4 Xm2.2 0 0 0 3 1
5 Xf1.1 0 3 0 0 1
6 Xf1.2 0 0 1 0 0
7 Xf2.1 2 0 2 0 0
8 Xf2.2 1 0 1 0 0
9 Am1.1 0 0 0 0 2
10 Am1.2 0 0 1 0 0
11 Am2.1 0 2 0 0 1
12 Am2.2 0 2 0 0 0
13 Af1.1 3 0 0 1 0
14 Af1.2 0 1 3 0 0
15 Af1.1 0 0 0 1 0
16 Af2.2 1 0 0 0 0
and I want it to become this using R:
XA mf line block individual age
1 X m 1 1 1 4
2 X m 1 1 2 4
3 X m 1 2 1 3
and so on...
The resulting dataframe collects the "age" value from the day the individual died, as scored in the upper dataframe. For example, two flies died on the 4th day (day4) in treatment Xm1.1, so R creates two rows: one containing the information extracted for the first individual, labelled individual "1", and another identical row labelled individual "2". If a 3rd individual had died in the same treatment on day 5, there would be a third row, the same as the two above except that the "age" would be 5 and the individual would be "3". When it moves on to the next treatment row, in this case Xm1.2, the first individual to die within that treatment set is labelled individual "1" (which in this case dies on day 3). In my example there is a total of 38 deaths, so I am trying to get R to build a df which is 38*6 (excl. headers).
Is there a way to take my dataframe (the real version is approx 50*640, with approx 50 individuals per unique combination of X/A, m/f, line (1:40) and block (1-4), so ~32000 individual deaths) to an end dataframe of ~32000*6 in an automated way?
both of these example dataframes can be built using this code if it helps you to try out solutions:
test<-data.frame(1:16);colnames(test)=("line")
test$line=c("Xm1.1","Xm1.2","Xm2.1","Xm2.2","Xf1.1","Xf1.2","Xf2.1","Xf2.2","Am1.1","Am1.2","Am2.1","Am2.2","Af1.1","Af1.2","Af2.1","Af2.2")
test$day1=rep(0,16);test$day2=rep(0,16);test$day3=rep(0,16);test$day4=rep(0,16);test$day5=rep(0,16)
test$day4[1]=2;test$day3[2]=1;test$day2[3]=1;test$day4[4]=3;test$day5[5]=1;
test$day3[6]=1;test$day1[7]=2;test$day1[8]=1;test$day5[9]=3;test$day3[10]=1;
test$day2[11]=2;test$day2[12]=2;test$day4[13]=1;test$day3[14]=3;test$day4[15]=1;
test$day1[16]=1;test$day3[7]=2;test$day3[8]=1;test$day2[5]=3;test$day1[3]=1;
test$day5[11]=1;test$day5[9]=2;test$day5[4]=1;test$day1[13]=3;test$day2[14]=1;
test2=data.frame(rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3))
colnames(test2)=c("XA","mf","line","block","individual","age")
test2$XA[1]="X";test2$mf[1]="m";test2$line[1]=1;test2$block[1]=1;test2$individual[1]=1;test2$age[1]=4;
test2$XA[2]="X";test2$mf[2]="m";test2$line[2]=1;test2$block[2]=1;test2$individual[2]=2;test2$age[2]=4;
test2$XA[3]="X";test2$mf[3]="m";test2$line[3]=1;test2$block[3]=2;test2$individual[3]=1;test2$age[3]=3;
Apologies for the awfully long way of making this dummy dataset; I am suffering from sleep deprivation and jetlag and haven't used R for months. If you run the code in R you will hopefully see better what I am aiming to do.
-------------------------------------------------------------------------------------
By Rg255:
Currently stuck at this, derived from @Arun's answer (I have added the strsplit(as.character(dt$line), "") part to get around one error):
df=read.table("C:\\Users\\...\\data.txt",header=T)
require(data.table)
head(df[1:20])
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
            list(individual = sequence(dd[dd>0]),
                 age = rep(which(dd>0), dd[dd>0])
            )}, by=line]
out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
Produces the following output:
> df=read.table("C:\\Users\\..\\data.txt",header=T)
> require(data.table)
> head(df[1:20])
line Day4 Day6 Day8 Day10 Day12 Day14 Day16 Day18 Day20 Day22 Day24 Day26 Day28 Day30 Day32 Day34 Day36 Day38 Day40
1 Xm1.1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 4 2
2 Xm2.1 0 0 0 0 0 0 0 0 0 2 0 0 0 1 2 1 0 2 0
3 Xm3.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1
4 Xm4.1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 2 3 8
5 Xm5.1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 3 3 3 6
6 Xm6.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
> dt <- as.data.table(df)
> dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
+ list(individual = sequence(dd[dd>0]),
+ age = rep(which(dd>0), dd[dd>0])
+ )}, by=line]
> out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
Warning message:
In function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 1)
> setnames(out, c("XA", "mf", "line", "block"))
> out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
Error in `[.data.table`(out, , `:=`(line = as.numeric(line), block = as.numeric(block))) :
LHS of := must be a single column name, when with=TRUE. When with=FALSE the LHS may be a vector of column names or positions.
In addition: Warning message:
In eval(expr, envir, enclos) : NAs introduced by coercion
> out <- cbind(out, dt[, list(individual, age)])
>
Here goes a data.table solution. The line column must have unique values.
require(data.table)
df <- read.table("data.txt", header=TRUE, stringsAsFactors=FALSE)
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
            list(individual = sequence(dd[dd>0]),
                 age = rep(which(dd>0), dd[dd>0])
            )}, by=line]
out <- as.data.table(data.frame(do.call(rbind,
         strsplit(gsub("([[:alpha:]])([[:alpha:]])([0-9]+)\\.([0-9]+)$",
                       "\\1 \\2 \\3 \\4", dt$line), " ")), stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
This works on your data.txt file.
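To see what the sequence()/rep() step inside j is doing, here is a tiny illustration on a single row of counts (not part of the original answer):
# Tiny illustration: expanding one row of death counts into per-individual rows.
dd <- c(0, 0, 1, 0, 2)           # one death on day 3, two deaths on day 5
sequence(dd[dd > 0])             # 1 1 2 -> individual counter within each death day
rep(which(dd > 0), dd[dd > 0])   # 3 5 5 -> age (day index) for each individual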

R: combine rows of a matrix by group

I am attempting to reformat the data set my.data to obtain the output shown below the my.data2 statement. Specifically, I want to put the last 4 columns of my.data on one line per record.id, where the last four columns of my.data will occupy columns 2-5 of the new data matrix if group=1 and columns 6-9 if group=2.
I wrote the cumbersome code below, but the double for-loop is causing an error that I simply cannot locate. Even if the double for-loop worked, I suspect there is a much more efficient way of accomplishing the same thing (maybe reshape?).
Thank you for any help correcting the double for-loop or with more efficient code.
my.data <- "record.id group s1 s2 s3 s4
1 1 2 0 1 3
1 2 0 0 0 12
2 1 0 0 0 0
3 1 10 0 0 0
4 1 1 0 0 0
4 2 0 0 0 0
8 2 0 2 2 0
9 1 0 0 0 0
9 2 0 0 0 0"
my.data2 <- read.table(textConnection(my.data), header=T)
# desired output
#
# 1 2 0 1 3 0 0 0 12
# 2 0 0 0 0 0 0 0 0
# 3 10 0 0 0 0 0 0 0
# 4 1 0 0 0 0 0 0 0
# 8 0 0 0 0 0 2 2 0
# 9 0 0 0 0 0 0 0 0
Code:
dat_sorted <- sort(unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_sorted)
my.data3 <- cbind(my.seq, my.data2)
group.min <- tapply(my.data3$group, my.data3$my.seq, min)
group.max <- tapply(my.data3$group, my.data3$my.seq, max)
# my.min <- group.min[my.data3[,1]]
# my.max <- group.max[my.data3[,1]]
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
x <- 1
for(i in 1:max(my.data3$my.seq)) {
  for(j in group.min[i]:group.max[i]) {
    if(my.data3[x,1] == i) my.records[i,1] = i
    # the two lines below seem to be causing an error
    if((my.data3[x,1] == i) & (my.data3[x,3] == 1)) (my.records[i,2:5] = my.data3[x,4:7])
    if((my.data3[x,1] == i) & (my.data3[x,3] == 2)) (my.records[i,6:9] = my.data3[x,4:7])
    x <- x + 1
  }
}
You are right, reshape helps here.
library(reshape2)
m <- melt(my.data2, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)
record.id 1_s1 1_s2 1_s3 1_s4 2_s1 2_s2 2_s3 2_s4
1 1 2 0 1 3 0 0 0 12
2 2 0 0 0 0 0 0 0 0
3 3 10 0 0 0 0 0 0 0
4 4 1 0 0 0 0 0 0 0
5 8 0 0 0 0 0 2 2 0
6 9 0 0 0 0 0 0 0 0
Comparison:
dfTest <- data.frame(record.id = rep(1:10e5, each = 2), group = 1:2,
s1 = sample(1:10, 10e5 * 2, replace = TRUE),
s2 = sample(1:10, 10e5 * 2, replace = TRUE),
s3 = sample(1:10, 10e5 * 2, replace = TRUE),
s4 = sample(1:10, 10e5 * 2, replace = TRUE))
system.time({
...# Your code
})
Error in my.records[i, 1] = i : incorrect number of subscripts on matrix
Timing stopped at: 41.61 0.36 42.56
system.time({m <- melt(dfTest, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)})
user system elapsed
25.04 2.78 28.72
Julius' answer is better, but for completeness, I think I managed to get the following for-loop to work:
dat_x <- (unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_x)
my.data3 <- as.data.frame(cbind(my.seq, my.data2))
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
my.records <- as.data.frame(my.records)
my.records[,1] = unique(my.data3[,2])
for(i in 1:9) {
  if(my.data3[i,3] == 1) (my.records[my.data3[i,1],c(2:5)] = my.data3[i,c(4:7)])
  if(my.data3[i,3] == 2) (my.records[my.data3[i,1],c(6:9)] = my.data3[i,c(4:7)])
}

Convert a factor column to multiple boolean columns

Given data that looks like:
library(data.table)
DT <- data.table(x=rep(1:5, 2))
I would like to split this data into 5 boolean columns that indicate the presence of each number.
I can do this like this:
new.names <- sort(unique(DT$x))
DT[, paste0('col', new.names) := lapply(new.names, function(i) DT$x==i), with=FALSE]
But this uses a pesky lapply, which is probably slower than a pure data.table alternative, and this solution strikes me as not very "data.table-ish".
Is there a better and/or faster way to create these new columns?
How about model.matrix?
model.matrix(~factor(x)-1,data=DT)
factor(x)1 factor(x)2 factor(x)3 factor(x)4 factor(x)5
1 1 0 0 0 0
2 0 1 0 0 0
3 0 0 1 0 0
4 0 0 0 1 0
5 0 0 0 0 1
6 1 0 0 0 0
7 0 1 0 0 0
8 0 0 1 0 0
9 0 0 0 1 0
10 0 0 0 0 1
attr(,"assign")
[1] 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$`factor(x)`
[1] "contr.treatment"
Apparently, you can put model.matrix into [.data.table to give the same results. Not sure if it would be faster:
DT[,model.matrix(~factor(x)-1)]
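If you want those indicator columns attached back onto DT by reference, a sketch (not part of the original answer) reusing the question's new.names idea:
# Sketch: add the indicator columns to DT by reference, using the question's
# col1..col5 naming convention.
new.names <- paste0("col", sort(unique(DT$x)))
DT[, (new.names) := as.data.table(model.matrix(~ factor(x) - 1))]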
There is also nnet::class.ind
library(nnet)
cbind(DT, setnames(as.data.table(DT[, class.ind(x)]),paste0('col', unique(DT$x))))
library(data.table)
DT <- data.table(x=rep(1:5, 2))
# add column with id
DT[, id := seq.int(nrow(DT))]
# cast long table into wide
DT.wide <- dcast(DT, id ~ x, value.var = "x", fill = 0, fun = function(x) 1)
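If you want the question's col1..col5 names on the result, a small follow-up sketch (not part of the original answer):
# Small follow-up sketch: rename the spread-out value columns to col1..col5.
setnames(DT.wide, as.character(sort(unique(DT$x))), paste0("col", sort(unique(DT$x))))
DT.wide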
