Related
I have an R data frame containing a factor that I want to "expand" so that for each factor level, there is an associated column in a new data frame, which contains a 1/0 indicator. E.g., suppose I have:
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
I want:
df.desired <- data.frame(foo = c(1,1,0,0), bar=c(0,0,1,1), ham=c(1,2,3,4))
Because for certain analyses for which you need to have a completely numeric data frame (e.g., principal component analysis), I thought this feature might be built in. Writing a function to do this shouldn't be too hard, but I can foresee some challenges relating to column names and if something exists already, I'd rather use that.
Use the model.matrix function:
model.matrix( ~ Species - 1, data=iris )
If your data frame is only made of factors (or you are working on a subset of variables which are all factors), you can also use the acm.disjonctif function from the ade4 package :
R> library(ade4)
R> df <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c("red","blue","green","red"))
R> acm.disjonctif(df)
eggs.bar eggs.foo ham.blue ham.green ham.red
1 0 1 0 0 1
2 0 1 1 0 0
3 1 0 0 1 0
4 1 0 0 0 1
Not exactly the case you are describing, but it can be useful too...
A quick way using the reshape2 package:
require(reshape2)
> dcast(df.original, ham ~ eggs, length)
Using ham as value column: use value_var to override.
ham bar foo
1 1 0 1
2 2 0 1
3 3 1 0
4 4 1 0
Note that this produces precisely the column names you want.
probably dummy variable is similar to what you want.
Then, model.matrix is useful:
> with(df.original, data.frame(model.matrix(~eggs+0), ham))
eggsbar eggsfoo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
A late entry class.ind from the nnet package
library(nnet)
with(df.original, data.frame(class.ind(eggs), ham))
bar foo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
Just came across this old thread and thought I'd add a function that utilizes ade4 to take a dataframe consisting of factors and/or numeric data and returns a dataframe with factors as dummy codes.
dummy <- function(df) {
NUM <- function(dataframe)dataframe[,sapply(dataframe,is.numeric)]
FAC <- function(dataframe)dataframe[,sapply(dataframe,is.factor)]
require(ade4)
if (is.null(ncol(NUM(df)))) {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
names(DF)[1] <- colnames(df)[which(sapply(df, is.numeric))]
} else {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
}
return(DF)
}
Let's try it.
df <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"), x=rnorm(4))
dummy(df)
df2 <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"))
dummy(df2)
Here is a more clear way to do it. I use model.matrix to create the dummy boolean variables and then merge it back into the original dataframe.
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
df.original
# eggs ham
# 1 foo 1
# 2 foo 2
# 3 bar 3
# 4 bar 4
# Create the dummy boolean variables using the model.matrix() function.
> mm <- model.matrix(~eggs-1, df.original)
> mm
# eggsbar eggsfoo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Remove the "eggs" prefix from the column names as the OP desired.
colnames(mm) <- gsub("eggs","",colnames(mm))
mm
# bar foo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Combine the matrix back with the original dataframe.
result <- cbind(df.original, mm)
result
# eggs ham bar foo
# 1 foo 1 0 1
# 2 foo 2 0 1
# 3 bar 3 1 0
# 4 bar 4 1 0
# At this point, you can select out the columns that you want.
I needed a function to 'explode' factors that is a bit more flexible, and made one based on the acm.disjonctif function from the ade4 package.
This allows you to choose the exploded values, which are 0 and 1 in acm.disjonctif. It only explodes factors that have 'few' levels. Numeric columns are preserved.
# Function to explode factors that are considered to be categorical,
# i.e., they do not have too many levels.
# - data: The data.frame in which categorical variables will be exploded.
# - values: The exploded values for the value being unequal and equal to a level.
# - max_factor_level_fraction: Maximum number of levels as a fraction of column length. Set to 1 to explode all factors.
# Inspired by the acm.disjonctif function in the ade4 package.
explode_factors <- function(data, values = c(-0.8, 0.8), max_factor_level_fraction = 0.2) {
exploders <- colnames(data)[sapply(data, function(col){
is.factor(col) && nlevels(col) <= max_factor_level_fraction * length(col)
})]
if (length(exploders) > 0) {
exploded <- lapply(exploders, function(exp){
col <- data[, exp]
n <- length(col)
dummies <- matrix(values[1], n, length(levels(col)))
dummies[(1:n) + n * (unclass(col) - 1)] <- values[2]
colnames(dummies) <- paste(exp, levels(col), sep = '_')
dummies
})
# Only keep numeric data.
data <- data[sapply(data, is.numeric)]
# Add exploded values.
data <- cbind(data, exploded)
}
return(data)
}
(The question is 10yo, but for the sake of completeness...)
The function i() from the fixest package does exactly that.
Beyond creating a design matrix from a factor-like variable, you can also very easily do two extra things on the fly:
binning values (with the argument 'bin'),
excluding some factor values (with the argument ref).
And since it is made for this task, if your variable happens to be numeric you don't need to wrap it with factor(x_num) (as opposed to the model.matrix solution).
Here's an example:
library(fixest)
data(airquality)
table(airquality$Month)
#> 5 6 7 8 9
#> 31 30 31 31 30
head(i(airquality$Month))
#> 5 6 7 8 9
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#> [4,] 1 0 0 0 0
#> [5,] 1 0 0 0 0
#> [6,] 1 0 0 0 0
#
# Binning (check out the help, there are many many ways to bin)
#
colSums(i(airquality$Month, bin = 5:6)))
#> 5 7 8 9
#> 61 31 31 30
#
# References
#
head(i(airquality$Month, ref = c(6, 9)), 3)
#> 5 7 8
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
And here's a little wrapper expanding all non-numeric variables (by default):
library(fixest)
# data: data.frame
# var: vector of variable names // if missing, all non numeric variables
# no argument checking
expand_factor = function(data, var){
if(missing(var)){
var = names(data)[!sapply(data, is.numeric)]
if(length(var) == 0) return(data)
}
data_list = unclass(data)
new = lapply(var, \(x) i(data_list[[x]]))
data_list[names(data_list) %in% var] = new
do.call("cbind", data_list)
}
my_data = data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
expand_factor(my_data)
#> bar foo ham
#> [1,] 0 1 1
#> [2,] 0 1 2
#> [3,] 1 0 3
#> [4,] 1 0 4
Finally, for those wondering, the timing is equivalent to the model.matrix solution.
library(microbenchmark)
my_data = data.frame(x = as.factor(sample(100, 1e6, TRUE)))
microbenchmark(mm = model.matrix(~x, my_data),
i = i(my_data$x), times = 5)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> mm 155.1904 156.7751 209.2629 182.4964 197.9084 353.9443 5
#> i 154.1697 154.7893 159.5202 155.4166 163.9706 169.2550 5
In sapply == over eggs could be used to generate dummy vectors:
x <- with(df.original, data.frame(+sapply(unique(eggs), `==`, eggs), ham))
x
# foo bar ham
#1 1 0 1
#2 1 0 2
#3 0 1 3
#4 0 1 4
all.equal(x, df.desired)
#[1] TRUE
A maybe faster variant - Result best used as list or data.frame:
. <- unique(df.original$eggs)
with(df.original,
data.frame(+do.call(cbind, lapply(setNames(., .), `==`, eggs)), ham))
Indexing in a matrix - Result best used as matrix:
. <- unique(df.original$eggs)
i <- match(df.original$eggs, .)
nc <- length(.)
nr <- length(i)
cbind(matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)), df.original["ham"])
Using outer - Result best used as matrix:
. <- unique(df.original$eggs)
cbind(+outer(df.original$eggs, setNames(., .), `==`), df.original["ham"])
Using rep - Result best used as matrix:
. <- unique(df.original$eggs)
n <- nrow(df.original)
cbind(+matrix(df.original$eggs == rep(., each=n), n, dimnames=list(NULL, .)),
df.original["ham"])
I have an R data frame containing a factor that I want to "expand" so that for each factor level, there is an associated column in a new data frame, which contains a 1/0 indicator. E.g., suppose I have:
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
I want:
df.desired <- data.frame(foo = c(1,1,0,0), bar=c(0,0,1,1), ham=c(1,2,3,4))
Because for certain analyses for which you need to have a completely numeric data frame (e.g., principal component analysis), I thought this feature might be built in. Writing a function to do this shouldn't be too hard, but I can foresee some challenges relating to column names and if something exists already, I'd rather use that.
Use the model.matrix function:
model.matrix( ~ Species - 1, data=iris )
If your data frame is only made of factors (or you are working on a subset of variables which are all factors), you can also use the acm.disjonctif function from the ade4 package :
R> library(ade4)
R> df <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c("red","blue","green","red"))
R> acm.disjonctif(df)
eggs.bar eggs.foo ham.blue ham.green ham.red
1 0 1 0 0 1
2 0 1 1 0 0
3 1 0 0 1 0
4 1 0 0 0 1
Not exactly the case you are describing, but it can be useful too...
A quick way using the reshape2 package:
require(reshape2)
> dcast(df.original, ham ~ eggs, length)
Using ham as value column: use value_var to override.
ham bar foo
1 1 0 1
2 2 0 1
3 3 1 0
4 4 1 0
Note that this produces precisely the column names you want.
probably dummy variable is similar to what you want.
Then, model.matrix is useful:
> with(df.original, data.frame(model.matrix(~eggs+0), ham))
eggsbar eggsfoo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
A late entry class.ind from the nnet package
library(nnet)
with(df.original, data.frame(class.ind(eggs), ham))
bar foo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
Just came across this old thread and thought I'd add a function that utilizes ade4 to take a dataframe consisting of factors and/or numeric data and returns a dataframe with factors as dummy codes.
dummy <- function(df) {
NUM <- function(dataframe)dataframe[,sapply(dataframe,is.numeric)]
FAC <- function(dataframe)dataframe[,sapply(dataframe,is.factor)]
require(ade4)
if (is.null(ncol(NUM(df)))) {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
names(DF)[1] <- colnames(df)[which(sapply(df, is.numeric))]
} else {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
}
return(DF)
}
Let's try it.
df <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"), x=rnorm(4))
dummy(df)
df2 <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"))
dummy(df2)
Here is a more clear way to do it. I use model.matrix to create the dummy boolean variables and then merge it back into the original dataframe.
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
df.original
# eggs ham
# 1 foo 1
# 2 foo 2
# 3 bar 3
# 4 bar 4
# Create the dummy boolean variables using the model.matrix() function.
> mm <- model.matrix(~eggs-1, df.original)
> mm
# eggsbar eggsfoo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Remove the "eggs" prefix from the column names as the OP desired.
colnames(mm) <- gsub("eggs","",colnames(mm))
mm
# bar foo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Combine the matrix back with the original dataframe.
result <- cbind(df.original, mm)
result
# eggs ham bar foo
# 1 foo 1 0 1
# 2 foo 2 0 1
# 3 bar 3 1 0
# 4 bar 4 1 0
# At this point, you can select out the columns that you want.
I needed a function to 'explode' factors that is a bit more flexible, and made one based on the acm.disjonctif function from the ade4 package.
This allows you to choose the exploded values, which are 0 and 1 in acm.disjonctif. It only explodes factors that have 'few' levels. Numeric columns are preserved.
# Function to explode factors that are considered to be categorical,
# i.e., they do not have too many levels.
# - data: The data.frame in which categorical variables will be exploded.
# - values: The exploded values for the value being unequal and equal to a level.
# - max_factor_level_fraction: Maximum number of levels as a fraction of column length. Set to 1 to explode all factors.
# Inspired by the acm.disjonctif function in the ade4 package.
explode_factors <- function(data, values = c(-0.8, 0.8), max_factor_level_fraction = 0.2) {
exploders <- colnames(data)[sapply(data, function(col){
is.factor(col) && nlevels(col) <= max_factor_level_fraction * length(col)
})]
if (length(exploders) > 0) {
exploded <- lapply(exploders, function(exp){
col <- data[, exp]
n <- length(col)
dummies <- matrix(values[1], n, length(levels(col)))
dummies[(1:n) + n * (unclass(col) - 1)] <- values[2]
colnames(dummies) <- paste(exp, levels(col), sep = '_')
dummies
})
# Only keep numeric data.
data <- data[sapply(data, is.numeric)]
# Add exploded values.
data <- cbind(data, exploded)
}
return(data)
}
(The question is 10yo, but for the sake of completeness...)
The function i() from the fixest package does exactly that.
Beyond creating a design matrix from a factor-like variable, you can also very easily do two extra things on the fly:
binning values (with the argument 'bin'),
excluding some factor values (with the argument ref).
And since it is made for this task, if your variable happens to be numeric you don't need to wrap it with factor(x_num) (as opposed to the model.matrix solution).
Here's an example:
library(fixest)
data(airquality)
table(airquality$Month)
#> 5 6 7 8 9
#> 31 30 31 31 30
head(i(airquality$Month))
#> 5 6 7 8 9
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#> [4,] 1 0 0 0 0
#> [5,] 1 0 0 0 0
#> [6,] 1 0 0 0 0
#
# Binning (check out the help, there are many many ways to bin)
#
colSums(i(airquality$Month, bin = 5:6)))
#> 5 7 8 9
#> 61 31 31 30
#
# References
#
head(i(airquality$Month, ref = c(6, 9)), 3)
#> 5 7 8
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
And here's a little wrapper expanding all non-numeric variables (by default):
library(fixest)
# data: data.frame
# var: vector of variable names // if missing, all non numeric variables
# no argument checking
expand_factor = function(data, var){
if(missing(var)){
var = names(data)[!sapply(data, is.numeric)]
if(length(var) == 0) return(data)
}
data_list = unclass(data)
new = lapply(var, \(x) i(data_list[[x]]))
data_list[names(data_list) %in% var] = new
do.call("cbind", data_list)
}
my_data = data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
expand_factor(my_data)
#> bar foo ham
#> [1,] 0 1 1
#> [2,] 0 1 2
#> [3,] 1 0 3
#> [4,] 1 0 4
Finally, for those wondering, the timing is equivalent to the model.matrix solution.
library(microbenchmark)
my_data = data.frame(x = as.factor(sample(100, 1e6, TRUE)))
microbenchmark(mm = model.matrix(~x, my_data),
i = i(my_data$x), times = 5)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> mm 155.1904 156.7751 209.2629 182.4964 197.9084 353.9443 5
#> i 154.1697 154.7893 159.5202 155.4166 163.9706 169.2550 5
In sapply == over eggs could be used to generate dummy vectors:
x <- with(df.original, data.frame(+sapply(unique(eggs), `==`, eggs), ham))
x
# foo bar ham
#1 1 0 1
#2 1 0 2
#3 0 1 3
#4 0 1 4
all.equal(x, df.desired)
#[1] TRUE
A maybe faster variant - Result best used as list or data.frame:
. <- unique(df.original$eggs)
with(df.original,
data.frame(+do.call(cbind, lapply(setNames(., .), `==`, eggs)), ham))
Indexing in a matrix - Result best used as matrix:
. <- unique(df.original$eggs)
i <- match(df.original$eggs, .)
nc <- length(.)
nr <- length(i)
cbind(matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)), df.original["ham"])
Using outer - Result best used as matrix:
. <- unique(df.original$eggs)
cbind(+outer(df.original$eggs, setNames(., .), `==`), df.original["ham"])
Using rep - Result best used as matrix:
. <- unique(df.original$eggs)
n <- nrow(df.original)
cbind(+matrix(df.original$eggs == rep(., each=n), n, dimnames=list(NULL, .)),
df.original["ham"])
I'm sure there's a simple solution to this, but I can't figure it out!! Suppose I have a dataframe that has the following information:
aaa<-c("A,B","B,C","B,D,E")
vvv<-c("101","101,102","102,103,104")
data_h<-data.frame(aaa,vvv)
data_h
aaa vvv
1 A,B 101
2 B,C 101,102
3 B,D,E 102,103,104
Desired output is a frequency map of individual hits, for subsequent analysis in a heat map. So:
101 102 103 104
A 1 0 0 0
B 2 2 1 1
C 1 1 0 0
D 0 1 1 1
E 0 1 1 1
How do I make this transformation? I've seen many similar examples, but none where the contents of the data-frame need to be parsed.
The goal is to ultimately use heatmap or something similar on the output table to visualize the correlation between "aaa" and "vvv".
Here is a base R solution in 4 lines of code. First we define a function, spl which splits the components of a comma separated string producing a vector of all the fields. eg takes two string arguments and applies spl to each of them and then creates a grid of the result of the splitting. Finally we apply eg to each row of data_h, rbind the results together and tabulate them with xtabs:
spl <- function(x) strsplit(as.character(x), ",")[[1]]
eg <- function(aaa, vvv) expand.grid(aaa = spl(aaa), vvv = spl(vvv))
dd <- do.call("rbind", Map(eg, data_h$aaa, data_h$vvv))
xtabs(data = dd)
The result is:
vvv
aaa 101 102 103 104
A 1 0 0 0
B 2 2 1 1
C 1 1 0 0
D 0 1 1 1
E 0 1 1 1
dcast Alternately replace the last line of code above (the one with the xtabs) with:
library(reshape2)
dcast(dd, aaa ~ vvv, fun = length, value.var = "vvv")
in which case the result is:
aaa 101 102 103 104
1 A 1 0 0 0
2 B 2 2 1 1
3 C 1 1 0 0
4 D 0 1 1 1
5 E 0 1 1 1
tapply. Another alternative would be tapply (however, it will fill in empty cells with NA rather than 0):
tapply(1:nrow(dd), dd, length)
ADDED Alternatives. Some improvements.
The shape of the data.frame suggests using splitstackshape package. But I don't know very well this package so I just use it to reshape the data, and then compute frequencies by hand using table:
library(splitstackshape)
data_h_split <- concat.split.multiple(data_h,1:2)
# aaa_1 aaa_2 aaa_3 vvv_1 vvv_2 vvv_3
# 1 A B <NA> 101 NA NA
# 2 B C <NA> 101 102 NA
# 3 B D E 102 103 104
Once you have the data in this format (no comma , regular columns), it is easy to compute frequencies using table( you can use tapply,reshape):
table(cbind.data.frame(ff= unlist(data_h_split[1:3]),
xx= unlist(data_h_split[4:6])))
xx
ff 101 102 103 104
A 1 0 0 0
B 1 1 0 0
C 0 1 0 0
D 0 0 1 0
0 0 0 0
E 0 0 0 1
Ananda's edit
Here's a multi-step approach to get the result using "splitstackshape" to work for this.
library(splitstackshape)
## Split the "vvv" column first, and reshape at the same time
x <- concat.split.multiple(data_h, split.cols="vvv", ",", "long")
## Add an ID column
x$id <- 1:nrow(x)
## Split the "aaa" column next, again reshaping as we do so
x <- concat.split.multiple(x[complete.cases(x), ], split.cols="aaa", ",", "long")
## Use `table` with `droplevels`
with(droplevels(x), table(aaa, vvv))
# vvv
# aaa 101 102 103 104
# A 1 0 0 0
# B 2 2 1 1
# C 1 1 0 0
# D 0 1 1 1
# E 0 1 1 1
My concat.split.multiple function is in great need of a rewrite to improve its efficiency. I've done some work on that in my cSplit function, which might be useful if you have a particularly large dataset.
Here's how I would solve your given problem with cSplit:
table(
cSplit(
cSplit(data_h, splitCols = 2, sep = ",",
direction = "long", makeEqual = FALSE),
splitCols = 1, sep = ",", direction = "long",
makeEqual = FALSE))
# vvv
# aaa 101 102 103 104
# A 1 0 0 0
# B 2 2 1 1
# C 1 1 0 0
# D 0 1 1 1
# E 0 1 1 1
It seems to be pretty efficient too...
First, the functions to test:
fun1 <- function() table(cSplit(cSplit(df, 2, ",", "long", FALSE), 1, ",", "long", FALSE))
fun2 <- function() {
spl <- function(x) strsplit(as.character(x), ",")[[1]]
eg <- function(aaa, vvv) expand.grid(aaa = spl(aaa), vvv = spl(vvv))
dd <- do.call("rbind", Map(eg, df$A, df$V))
xtabs(data = dd)
}
Second, some sample data. Change Nrows and re-generate to see the effect on different sized data.frames.
set.seed(1)
Nrow <- 100
aaa <- 100:200
vvv <- LETTERS
maxA <- 10
maxV <- 10
Aaa <- sample(maxA, Nrow, TRUE)
Vvv <- sample(maxV, Nrow, TRUE)
A <- vapply(seq_along(Aaa), function(x)
paste(sample(aaa, Aaa[x], TRUE), collapse = ","), character(1L))
V <- vapply(seq_along(Vvv), function(x)
paste(sample(vvv, Vvv[x], TRUE), collapse = ","), character(1L))
df <- data.frame(A, V)
head(df)
# A V
# 1 127,122,152 E,E,O,S,W,S,M
# 2 127,118,152,156 V,A,Z,Q
# 3 113,125,172,197,110,177 L,A,T
# 4 195,182,131,165,196,196,134,126,116,132 F,Z,X,S,T,M,W,E,Q,H
# 5 151,193,151 L,B,E,B,Y,I,N
# 6 126,104,142,186,135,113,137,163,139 Q,G,N
Compare the two approaches to make sure the results are the same:
X <- fun1()
Y <- fun2()
all(X == Y[dimnames(X)[[1]], dimnames(X)[[2]]])
# [1] TRUE
Benchmark (on 100 rows).
library(microbenchmark)
## Nrow = 100
microbenchmark(fun1(), fun2(), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 7.263802 7.326237 7.440843 7.868905 10.26451 10
# fun2() 62.869130 64.046836 68.525880 73.595061 80.02027 10
Benchmark (on 1000 rows).
## Nrow = 1000
microbenchmark(fun1(), fun2(), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 19.2303 20.21857 23.14337 26.97776 35.56338 10
# fun2() 775.6586 815.01639 835.98951 852.47804 888.15345 10
I have a column of int's like this:
idNums
2
101
34
25
8
...
I need to convert them to 3 factor columns like this:
digit1 digit2 digit3
0 0 2
1 0 1
0 3 4
0 2 5
0 0 8
... ... ...
Any suggestions?
Here's a fun solution using the modular arithmetic operators %% and %/%:
d <- c(2, 101, 34, 25, 8)
res <- data.frame(digit1 = d %/% 100,
digit2 = d %% 100 %/% 10,
digit3 = d %% 10)
# digit1 digit2 digit3
# 1 0 0 2
# 2 1 0 1
# 3 0 3 4
# 4 0 2 5
# 5 0 0 8
Note that it has the minor -- but nice -- side benefit of returning numeric values for each of the columns. If you do, however, want factor columns instead, just follow up with this command:
res[] <- lapply(res, as.factor)
all(sapply(res, class)=="factor")
#[1] TRUE
Use formatC and strsplit.
idNums <- c(2, 101, 34, 25, 8)
idChars <- formatC(idNums, width = 3, flag = "0")
idChars <- strsplit(idChars, "")
data.frame(
digits1 = sapply(idChars, function(x) x[1]),
digits2 = sapply(idChars, function(x) x[2]),
digits3 = sapply(idChars, function(x) x[3])
)
It's a little cleaner using the stringr package. Replace the call to strsplit with
str_split_fixed(idChars, "", 3)
I thought Richie Cottons use of formatC was kewl so I incorporated it:
testdat <- read.fwf(textConnection(formatC(idNums, width = 3, flag = "0") ),
widths=c(1,1,1),
col.names=c("digit1", "digit2", "digit3")
)
testdat
#------------
digit1 digit2 digit3
1 0 0 2
2 1 0 1
3 0 3 4
4 0 2 5
5 0 0 8
I have an R data frame containing a factor that I want to "expand" so that for each factor level, there is an associated column in a new data frame, which contains a 1/0 indicator. E.g., suppose I have:
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
I want:
df.desired <- data.frame(foo = c(1,1,0,0), bar=c(0,0,1,1), ham=c(1,2,3,4))
Because for certain analyses for which you need to have a completely numeric data frame (e.g., principal component analysis), I thought this feature might be built in. Writing a function to do this shouldn't be too hard, but I can foresee some challenges relating to column names and if something exists already, I'd rather use that.
Use the model.matrix function:
model.matrix( ~ Species - 1, data=iris )
If your data frame is only made of factors (or you are working on a subset of variables which are all factors), you can also use the acm.disjonctif function from the ade4 package :
R> library(ade4)
R> df <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c("red","blue","green","red"))
R> acm.disjonctif(df)
eggs.bar eggs.foo ham.blue ham.green ham.red
1 0 1 0 0 1
2 0 1 1 0 0
3 1 0 0 1 0
4 1 0 0 0 1
Not exactly the case you are describing, but it can be useful too...
A quick way using the reshape2 package:
require(reshape2)
> dcast(df.original, ham ~ eggs, length)
Using ham as value column: use value_var to override.
ham bar foo
1 1 0 1
2 2 0 1
3 3 1 0
4 4 1 0
Note that this produces precisely the column names you want.
probably dummy variable is similar to what you want.
Then, model.matrix is useful:
> with(df.original, data.frame(model.matrix(~eggs+0), ham))
eggsbar eggsfoo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
A late entry class.ind from the nnet package
library(nnet)
with(df.original, data.frame(class.ind(eggs), ham))
bar foo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
Just came across this old thread and thought I'd add a function that utilizes ade4 to take a dataframe consisting of factors and/or numeric data and returns a dataframe with factors as dummy codes.
dummy <- function(df) {
NUM <- function(dataframe)dataframe[,sapply(dataframe,is.numeric)]
FAC <- function(dataframe)dataframe[,sapply(dataframe,is.factor)]
require(ade4)
if (is.null(ncol(NUM(df)))) {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
names(DF)[1] <- colnames(df)[which(sapply(df, is.numeric))]
} else {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
}
return(DF)
}
Let's try it.
df <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"), x=rnorm(4))
dummy(df)
df2 <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"))
dummy(df2)
Here is a more clear way to do it. I use model.matrix to create the dummy boolean variables and then merge it back into the original dataframe.
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
df.original
# eggs ham
# 1 foo 1
# 2 foo 2
# 3 bar 3
# 4 bar 4
# Create the dummy boolean variables using the model.matrix() function.
> mm <- model.matrix(~eggs-1, df.original)
> mm
# eggsbar eggsfoo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Remove the "eggs" prefix from the column names as the OP desired.
colnames(mm) <- gsub("eggs","",colnames(mm))
mm
# bar foo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Combine the matrix back with the original dataframe.
result <- cbind(df.original, mm)
result
# eggs ham bar foo
# 1 foo 1 0 1
# 2 foo 2 0 1
# 3 bar 3 1 0
# 4 bar 4 1 0
# At this point, you can select out the columns that you want.
I needed a function to 'explode' factors that is a bit more flexible, and made one based on the acm.disjonctif function from the ade4 package.
This allows you to choose the exploded values, which are 0 and 1 in acm.disjonctif. It only explodes factors that have 'few' levels. Numeric columns are preserved.
# Function to explode factors that are considered to be categorical,
# i.e., they do not have too many levels.
# - data: The data.frame in which categorical variables will be exploded.
# - values: The exploded values for the value being unequal and equal to a level.
# - max_factor_level_fraction: Maximum number of levels as a fraction of column length. Set to 1 to explode all factors.
# Inspired by the acm.disjonctif function in the ade4 package.
explode_factors <- function(data, values = c(-0.8, 0.8), max_factor_level_fraction = 0.2) {
exploders <- colnames(data)[sapply(data, function(col){
is.factor(col) && nlevels(col) <= max_factor_level_fraction * length(col)
})]
if (length(exploders) > 0) {
exploded <- lapply(exploders, function(exp){
col <- data[, exp]
n <- length(col)
dummies <- matrix(values[1], n, length(levels(col)))
dummies[(1:n) + n * (unclass(col) - 1)] <- values[2]
colnames(dummies) <- paste(exp, levels(col), sep = '_')
dummies
})
# Only keep numeric data.
data <- data[sapply(data, is.numeric)]
# Add exploded values.
data <- cbind(data, exploded)
}
return(data)
}
(The question is 10yo, but for the sake of completeness...)
The function i() from the fixest package does exactly that.
Beyond creating a design matrix from a factor-like variable, you can also very easily do two extra things on the fly:
binning values (with the argument 'bin'),
excluding some factor values (with the argument ref).
And since it is made for this task, if your variable happens to be numeric you don't need to wrap it with factor(x_num) (as opposed to the model.matrix solution).
Here's an example:
library(fixest)
data(airquality)
table(airquality$Month)
#> 5 6 7 8 9
#> 31 30 31 31 30
head(i(airquality$Month))
#> 5 6 7 8 9
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#> [4,] 1 0 0 0 0
#> [5,] 1 0 0 0 0
#> [6,] 1 0 0 0 0
#
# Binning (check out the help, there are many many ways to bin)
#
colSums(i(airquality$Month, bin = 5:6)))
#> 5 7 8 9
#> 61 31 31 30
#
# References
#
head(i(airquality$Month, ref = c(6, 9)), 3)
#> 5 7 8
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
And here's a little wrapper expanding all non-numeric variables (by default):
library(fixest)
# data: data.frame
# var: vector of variable names // if missing, all non numeric variables
# no argument checking
expand_factor = function(data, var){
if(missing(var)){
var = names(data)[!sapply(data, is.numeric)]
if(length(var) == 0) return(data)
}
data_list = unclass(data)
new = lapply(var, \(x) i(data_list[[x]]))
data_list[names(data_list) %in% var] = new
do.call("cbind", data_list)
}
my_data = data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
expand_factor(my_data)
#> bar foo ham
#> [1,] 0 1 1
#> [2,] 0 1 2
#> [3,] 1 0 3
#> [4,] 1 0 4
Finally, for those wondering, the timing is equivalent to the model.matrix solution.
library(microbenchmark)
my_data = data.frame(x = as.factor(sample(100, 1e6, TRUE)))
microbenchmark(mm = model.matrix(~x, my_data),
i = i(my_data$x), times = 5)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> mm 155.1904 156.7751 209.2629 182.4964 197.9084 353.9443 5
#> i 154.1697 154.7893 159.5202 155.4166 163.9706 169.2550 5
In sapply == over eggs could be used to generate dummy vectors:
x <- with(df.original, data.frame(+sapply(unique(eggs), `==`, eggs), ham))
x
# foo bar ham
#1 1 0 1
#2 1 0 2
#3 0 1 3
#4 0 1 4
all.equal(x, df.desired)
#[1] TRUE
A maybe faster variant - Result best used as list or data.frame:
. <- unique(df.original$eggs)
with(df.original,
data.frame(+do.call(cbind, lapply(setNames(., .), `==`, eggs)), ham))
Indexing in a matrix - Result best used as matrix:
. <- unique(df.original$eggs)
i <- match(df.original$eggs, .)
nc <- length(.)
nr <- length(i)
cbind(matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)), df.original["ham"])
Using outer - Result best used as matrix:
. <- unique(df.original$eggs)
cbind(+outer(df.original$eggs, setNames(., .), `==`), df.original["ham"])
Using rep - Result best used as matrix:
. <- unique(df.original$eggs)
n <- nrow(df.original)
cbind(+matrix(df.original$eggs == rep(., each=n), n, dimnames=list(NULL, .)),
df.original["ham"])