Convert a factor column to multiple boolean columns - r

Given data that looks like:
library(data.table)
DT <- data.table(x=rep(1:5, 2))
I would like to split this data into 5 boolean columns that indicate the presence of each number.
I can do this like this:
new.names <- sort(unique(DT$x))
DT[, paste0('col', new.names) := lapply(new.names, function(i) DT$x==i), with=FALSE]
But this uses a pesky lapply which is probably slower than the data.table alternative and this solutions strikes me as not very "data.table-ish".
Is there a better and/or faster way to create these new columns?

How about model.matrix?
model.matrix(~factor(x)-1,data=DT)
factor(x)1 factor(x)2 factor(x)3 factor(x)4 factor(x)5
1 1 0 0 0 0
2 0 1 0 0 0
3 0 0 1 0 0
4 0 0 0 1 0
5 0 0 0 0 1
6 1 0 0 0 0
7 0 1 0 0 0
8 0 0 1 0 0
9 0 0 0 1 0
10 0 0 0 0 1
attr(,"assign")
[1] 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$`factor(x)`
[1] "contr.treatment"
Apparently, you can put model.matrix into [.data.table to give the same results. Not sure if it would be faster:
DT[,model.matrix(~factor(x)-1)]

There is also nnet::class.ind
library(nnet)
cbind(DT, setnames(as.data.table(DT[, class.ind(x)]),paste0('col', unique(DT$x))))

library(data.table)
DT <- data.table(x=rep(1:5, 2))
# add column with id
DT[, id := seq.int(nrow(DT))]
# cast long table into wide
DT.wide <- dcast(DT, id ~ x, value.var = "x", fill = 0, fun = function(x) 1)

Related

extracting maximum value of cumulative sum into a new column

A sample of data set:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0), risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0))
And I need output data set which would contain new column where only maximum values of cumulative sum will be maintained:
testdf <- data.frame(risk_11111 = c(0,0,1,2,3,0,1,2,3,4,0),
risk_11111_max = c(0,0,0,0,3,0,0,0,0,4,0),
risk_11112 = c(0,0,1,2,3,0,1,2,0,1,0),
risk_11112_max = c(0,0,0,0,3,0,0,2,0,1,0))
I am guessing some logical subseting of vectors colwise with apply and extracting max value with position index, and mutate into new variables.
I dont know how to extract values for new variable.
Thanks
Something like this with base R:
lapply(testdf, function(x) {
x[diff(x) > 0] <- 0
x
})
And to have all in one data.frame:
dfout <- cbind(testdf, lapply(testdf, function(x) {
x[diff(x) > 0] <- 0
x
}))
names(dfout) <- c(names(testdf), 'risk_1111_max', 'risk_1112_max')
Output:
risk_11111 risk_11112 risk_1111_max risk_1112_max
1 0 0 0 0
2 0 0 0 0
3 1 1 0 0
4 2 2 0 0
5 3 3 3 3
6 0 0 0 0
7 1 1 0 0
8 2 2 0 2
9 3 0 0 0
10 4 1 4 1
11 0 0 0 0

How to one-hot-encode factor variables with data.table?

For those unfamiliar, one-hot encoding simply refers to converting a column of categories (i.e. a factor) into multiple columns of binary indicator variables where each new column corresponds to one of the classes of the original column. This example will explain it better:
dt <- data.table(
ID=1:5,
Color=factor(c("green", "red", "red", "blue", "green"), levels=c("blue", "green", "red", "purple")),
Shape=factor(c("square", "triangle", "square", "triangle", "cirlce"))
)
dt
ID Color Shape
1: 1 green square
2: 2 red triangle
3: 3 red square
4: 4 blue triangle
5: 5 green cirlce
# one hot encode the colors
color.binarized <- dcast(dt[, list(V1=1, ID, Color)], ID ~ Color, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
# Prepend Color_ in front of each one-hot-encoded feature
setnames(color.binarized, setdiff(colnames(color.binarized), "ID"), paste0("Color_", setdiff(colnames(color.binarized), "ID")))
# one hot encode the shapes
shape.binarized <- dcast(dt[, list(V1=1, ID, Shape)], ID ~ Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
# Prepend Shape_ in front of each one-hot-encoded feature
setnames(shape.binarized, setdiff(colnames(shape.binarized), "ID"), paste0("Shape_", setdiff(colnames(shape.binarized), "ID")))
# Join one-hot tables with original dataset
dt <- dt[color.binarized, on="ID"]
dt <- dt[shape.binarized, on="ID"]
dt
ID Color Shape Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
1: 1 green square 0 1 0 0 0 1 0
2: 2 red triangle 0 0 1 0 0 0 1
3: 3 red square 0 0 1 0 0 1 0
4: 4 blue triangle 1 0 0 0 0 0 1
5: 5 green cirlce 0 1 0 0 1 0 0
This is something I do a lot, and as you can see it's pretty tedious (especially when my data has many factor columns). Is there an easier way to do this with data.table? In particular, I assumed dcast would allow me to one-hot-encode multiple columns at once, when I try doing something like
dcast(dt[, list(V1=1, ID, Color, Shape)], ID ~ Color + Shape, fun=sum, value.var="V1", drop=c(TRUE, FALSE))
I get column combinations
ID blue_cirlce blue_square blue_triangle green_cirlce green_square green_triangle red_cirlce red_square red_triangle purple_cirlce purple_square purple_triangle
1: 1 0 0 0 0 1 0 0 0 0 0 0 0
2: 2 0 0 0 0 0 0 0 0 1 0 0 0
3: 3 0 0 0 0 0 0 0 1 0 0 0 0
4: 4 0 0 1 0 0 0 0 0 0 0 0 0
5: 5 0 0 0 1 0 0 0 0 0 0 0 0
Here you go:
dcast(melt(dt, id.vars='ID'), ID ~ variable + value, fun = length)
# ID Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
#1: 1 0 1 0 0 1 0
#2: 2 0 0 1 0 0 1
#3: 3 0 0 1 0 1 0
#4: 4 1 0 0 0 0 1
#5: 5 0 1 0 1 0 0
To get the missing factors you can do the following:
res = dcast(melt(dt, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
setnames(res, c("ID", unlist(lapply(2:ncol(dt),
function(i) paste(names(dt)[i], levels(dt[[i]]), sep = "_")))))
res
# ID Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
#1: 1 0 1 0 0 0 1 0
#2: 2 0 0 1 0 0 0 1
#3: 3 0 0 1 0 0 1 0
#4: 4 1 0 0 0 0 0 1
#5: 5 0 1 0 0 1 0 0
Using model.matrix:
> cbind(dt[, .(ID)], model.matrix(~ Color + Shape, dt))
ID (Intercept) Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1: 1 1 1 0 0 1 0
2: 2 1 0 1 0 0 1
3: 3 1 0 1 0 1 0
4: 4 1 0 0 0 0 1
5: 5 1 1 0 0 0 0
This makes the most sense if you're doing modelling.
If you want to suppress the intercept (and restore the aliased column for the 1st variable):
> cbind(dt[, .(ID)], model.matrix(~ Color + Shape - 1, dt))
ID Colorblue Colorgreen Colorred Colorpurple Shapesquare Shapetriangle
1: 1 0 1 0 0 1 0
2: 2 0 0 1 0 0 1
3: 3 0 0 1 0 1 0
4: 4 1 0 0 0 0 1
5: 5 0 1 0 0 0 0
Here's a more generalized version of eddi's solution:
one_hot <- function(dt, cols="auto", dropCols=TRUE, dropUnusedLevels=FALSE){
# One-Hot-Encode unordered factors in a data.table
# If cols = "auto", each unordered factor column in dt will be encoded. (Or specifcy a vector of column names to encode)
# If dropCols=TRUE, the original factor columns are dropped
# If dropUnusedLevels = TRUE, unused factor levels are dropped
# Automatically get the unordered factor columns
if(cols[1] == "auto") cols <- colnames(dt)[which(sapply(dt, function(x) is.factor(x) & !is.ordered(x)))]
# Build tempDT containing and ID column and 'cols' columns
tempDT <- dt[, cols, with=FALSE]
tempDT[, ID := .I]
setcolorder(tempDT, unique(c("ID", colnames(tempDT))))
for(col in cols) set(tempDT, j=col, value=factor(paste(col, tempDT[[col]], sep="_"), levels=paste(col, levels(tempDT[[col]]), sep="_")))
# One-hot-encode
if(dropUnusedLevels == TRUE){
newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = T, fun = length)
} else{
newCols <- dcast(melt(tempDT, id = 'ID', value.factor = T), ID ~ value, drop = F, fun = length)
}
# Combine binarized columns with the original dataset
result <- cbind(dt, newCols[, !"ID"])
# If dropCols = TRUE, remove the original factor columns
if(dropCols == TRUE){
result <- result[, !cols, with=FALSE]
}
return(result)
}
Note that for large datasets it's probably better to use Matrix::sparse.model.matrix
Update (2017)
This is now in the package mltools.
If no one posts a clean way to write this out by hand each time, you can always make a function/macro:
OHE <- function(dt, grp, encodeCols) {
grpSymb = as.symbol(grp)
for (col in encodeCols) {
colSymb = as.symbol(col)
eval(bquote(
dt[, .SD
][, V1 := 1
][, dcast(.SD, .(grpSymb) ~ .(colSymb), fun=sum, value.var='V1')
][, setnames(.SD, setdiff(colnames(.SD), grp), sprintf("%s_%s", col, setdiff(colnames(.SD), grp)))
][, dt <<- dt[.SD, on=grp]
]
))
}
dt
}
dtOHE = OHE(dt, 'ID', c('Color', 'Shape'))
dtOHE
ID Color Shape Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
1: 1 green square 0 1 0 0 1 0
2: 2 red triangle 0 0 1 0 0 1
3: 3 red square 0 0 1 0 1 0
4: 4 blue triangle 1 0 0 0 0 1
5: 5 green cirlce 0 1 0 1 0 0
In few lines you can solve this problem:
library(tidyverse)
dt2 <- spread(dt,Color,Shape)
dt3 <- spread(dt,Shape,Color)
df <- cbind(dt2,dt3)
df2 <- apply(df, 2, function(x){sapply(x, function(y){
ifelse(is.na(y),0,1)
})})
df2 <- as.data.frame(df2)
df <- cbind(dt,df2[,-1])

Split vector into contiguous runs of equal values

I have a data table and one of the columns is a bunch of 0's and 1's, just like vec below.
vec = c(rep(1, times = 6), rep(0, times = 10), rep(1, times = 11), rep(0, times = 4))
> vec
[1] 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
What I want to do is to split the data everytime there's a change in that column from 0 to 1 or vice-versa. Here is what I have done so far:
b = c(vec[1],diff(vec))
rowby = numeric(0)
for (i in 2:(length(b))) {
if (b[i] != 0) {
rowby <- c(rowby, i-1)
}
}
splitted_data <- split(vec, cumsum(c(TRUE,(1:length(vec) %in% rowby)[-length(vec)])))
There must be some thing right under my nose I can't see. What is a correct way to do this? This works for the example above, but not generally.
Try
split(vec,cumsum(c(1, abs(diff(vec)))))
#$`1`
#[1] 1 1 1 1 1 1
#$`2`
#[1] 0 0 0 0 0 0 0 0 0 0
#$`3`
#[1] 1 1 1 1 1 1 1 1 1 1 1
#$`4`
#[1] 0 0 0 0
Or use rle
split(vec,inverse.rle(within.list(rle(vec), values <- seq_along(values))))
With current versions of data.table, rleid is one function which can be used for this job:
library(data.table)#v1.9.5+
split(vec,rleid(vec))

how to subset a data frame based on list of multiple match case in columns

So I have a list that contains certain characters as shown below
list <- c("MY","GM+" ,"TY","RS","LG")
And I have a variable named "CODE" in the data frame as follows
code <- c("MY GM+","","LGTY", "RS","TY")
df <- data.frame(1:5,code)
df
code
1 MY GM+
2
3 LGTY
4 RS
5 TY
Now I want to create 5 new variables named "MY","GM+","TY","RS","LG"
Which takes binary value, 1 if there's a match case in the CODE variable
df
code MY GM+ TY RS LG
1 MY GM+ 1 1 0 0 0
2 0 0 0 0 0
3 LGTY 0 0 1 0 1
4 RS 0 0 0 1 0
5 TY 0 0 1 0 0
Really appreciate your help. Thank you.
Since you know how many values will be returned (5), and what you want their types to be (integer), you could use vapply() with grepl(). We can turn the resulting logical matrix into integer values by using integer() in vapply()'s FUN.VALUE argument.
cbind(df, vapply(List, grepl, integer(nrow(df)), df$code, fixed = TRUE))
# code MY GM+ TY RS LG
# 1 MY GM+ 1 1 0 0 0
# 2 0 0 0 0 0
# 3 LGTY 0 0 1 0 1
# 4 RS 0 0 0 1 0
# 5 TY 0 0 1 0 0
I think your original data has a couple of typos, so here's what I used:
List <- c("MY", "GM+" , "TY", "RS", "LG")
df <- data.frame(code = c("MY GM+", "", "LGTY", "RS", "TY"))

splitting dataframe with collated points in to individuals in R

I have a dataframe (.txt) which looks like this [where "dayX" = the day of death in a survival assay in fruitflies, the numbers beneath are the number of flies to die in that treatment combination on that day, X or A are treaments, m & f are also treatments, the first number is the line, the second number is the block]
line day1 day2 day3 day4 day5
1 Xm1.1 0 0 0 2 0
2 Xm1.2 0 0 1 0 0
3 Xm2.1 1 1 0 0 0
4 Xm2.2 0 0 0 3 1
5 Xf1.1 0 3 0 0 1
6 Xf1.2 0 0 1 0 0
7 Xf2.1 2 0 2 0 0
8 Xf2.2 1 0 1 0 0
9 Am1.1 0 0 0 0 2
10 Am1.2 0 0 1 0 0
11 Am2.1 0 2 0 0 1
12 Am2.2 0 2 0 0 0
13 Af1.1 3 0 0 1 0
14 Af1.2 0 1 3 0 0
15 Af1.1 0 0 0 1 0
16 Af2.2 1 0 0 0 0
and want it to become this using R->
XA mf line block individual age
1 X m 1 1 1 4
2 X m 1 1 2 4
3 X m 1 2 1 3
and so on...
the resulting dataframe collects the "age" value from the day the individual died, as scored in the upper dataframe, for example there were two flies that died on the 4th day (day4) in treatment Xm1.1 therefore R creates two rows, one containing information extracted regarding the first individual and thus being labelled as individual "1", then another row with the same information except labelled as individual "2".. if a 3rd individual died in the same treatment on day 5, there would be a third row which is the same as the above two rows except the "age" would be "5" and individual would be "3". When it moves on to the next treatment row, in this case Xm1.2, the first individual to die within that treatment set would be labelled as individual "1" (which in this case dies on day 3). In my example there is a total of 38 deaths, therefore I am trying to get R to build a df which is 38*6 (excl. headers).
is there a way to take my dataframe [the real version is approx 50*640 with approx 50 individuals per unique combination of X/A, m/f, line (1:40), block (1-4) so ~32000 individual deaths] to an end dataframe of 6*~32000 in an automated way?
both of these example dataframes can be built using this code if it helps you to try out solutions:
test<-data.frame(1:16);colnames(test)=("line")
test$line=c("Xm1.1","Xm1.2","Xm2.1","Xm2.2","Xf1.1","Xf1.2","Xf2.1","Xf2.2","Am1.1","Am1.2","Am2.1","Am2.2","Af1.1","Af1.2","Af2.1","Af2.2")
test$day1=rep(0,16);test$day2=rep(0,16);test$day3=rep(0,16);test$day4=rep(0,16);test$day5=rep(0,16)
test$day4[1]=2;test$day3[2]=1;test$day2[3]=1;test$day4[4]=3;test$day5[5]=1;
test$day3[6]=1;test$day1[7]=2;test$day1[8]=1;test$day5[9]=3;test$day3[10]=1;
test$day2[11]=2;test$day2[12]=2;test$day4[13]=1;test$day3[14]=3;test$day4[15]=1;
test$day1[16]=1;test$day3[7]=2;test$day3[8]=1;test$day2[5]=3;test$day1[3]=1;
test$day5[11]=1;test$day5[9]=2;test$day5[4]=1;test$day1[13]=3;test$day2[14]=1;
test2=data.frame(rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3),rep(1:3))
colnames(test2)=c("XA","mf","line","block","individual","age")
test2$XA[1]="X";test2$mf[1]="m";test2$line[1]=1;test2$block[1]=1;test2$individual[1]=1;test2$age[1]=4;
test2$XA[2]="X";test2$mf[2]="m";test2$line[2]=1;test2$block[2]=1;test2$individual[2]=2;test2$age[2]=4;
test2$XA[3]="X";test2$mf[3]="m";test2$line[3]=1;test2$block[3]=2;test2$individual[3]=1;test2$age[3]=3;
apologies for the awfully long way of making this dummy dataset, suffering from sleep deprivation and jetlag and haven't used R for months, if you run the code in R you will hopefully see better what I aim to do
-------------------------------------------------------------------------------------
By Rg255:
Currently stuck at this derived from #Arun's answer (I have added the strsplit (as.character(dt$line) , "" )) section to get around one error)
df=read.table("C:\\Users\\...\\data.txt",header=T)
require(data.table)
head(df[1:20])
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
list(individual = sequence(dd[dd>0]),
age = rep(which(dd>0), dd[dd>0])
)}, by=line]
out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
Produces the following output:
> df=read.table("C:\\Users\\..\\data.txt",header=T)
> require(data.table)
> head(df[1:20])
line Day4 Day6 Day8 Day10 Day12 Day14 Day16 Day18 Day20 Day22 Day24 Day26 Day28 Day30 Day32 Day34 Day36 Day38 Day40
1 Xm1.1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 4 2
2 Xm2.1 0 0 0 0 0 0 0 0 0 2 0 0 0 1 2 1 0 2 0
3 Xm3.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1
4 Xm4.1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 2 3 8
5 Xm5.1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 3 3 3 6
6 Xm6.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
> dt <- as.data.table(df)
> dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
+ list(individual = sequence(dd[dd>0]),
+ age = rep(which(dd>0), dd[dd>0])
+ )}, by=line]
> out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
Warning message:
In function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 1)
> setnames(out, c("XA", "mf", "line", "block"))
> out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
Error in `[.data.table`(out, , `:=`(line = as.numeric(line), block = as.numeric(block))) :
LHS of := must be a single column name, when with=TRUE. When with=FALSE the LHS may be a vector of column names or positions.
In addition: Warning message:
In eval(expr, envir, enclos) : NAs introduced by coercion
> out <- cbind(out, dt[, list(individual, age)])
>
Here goes a data.table solution. The line column must have unique values.
require(data.table)
df <- read.table("data.txt", header=TRUE, stringsAsFactors=FALSE)
dt <- as.data.table(df)
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE);
list(individual = sequence(dd[dd>0]),
age = rep(which(dd>0), dd[dd>0])
)}, by=line]
out <- as.data.table(data.frame(do.call(rbind,
strsplit(gsub("([[:alpha:]])([[:alpha:]])([0-9]+)\\.([0-9]+)$",
"\\1 \\2 \\3 \\4", dt$line), " ")), stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
out <- cbind(out, dt[, list(individual, age)])
This works on your data.txt file.

Resources