Data
We have numerous text strings that look like this (way longer in our real dataset):
df <- data.frame(
id = c('text1','text2','text3'),text = c('ABA','ABA','AAA')
)
>df
id text
1 text1 ABA
2 text2 ABA
3 text3 AAA
We want to create a matrix that tells how often a letter at position x is found together with the other letters at other positions, so in this case:
3A 3 1 2 3
2B 2 0 2 2
2A 1 1 0 1
1A 3 1 2 3
1A 2A 2B 3A
What I tried
I previously converted the matrix to a binary matrix, looking like this:
structure(list(pos1_A = c(1, 1, 1), pos2_A = c(0, 0, 1), pos2_B = c(1,
1, 0), pos3_A = c(1, 1, 1)), class = "data.frame", row.names = c("text1",
"text2", "text3"))
pos1_A pos2_A pos2_B pos3_A
text1 1 0 1 1
text2 1 0 1 1
text3 1 1 0 1
Then I can run commands like cor to get correlations, however, instead of correlations I want the frequencies.
Note this is different from questions about co-occurrences wherein the variable name itself (here position) is neglected, for example like "How to use R to create a word co-occurrence matrix"
Huge credit to #Ronak Shah with the answer here
It's much simpler if we convert the categorical data to a numerical (binary matrix), for example using this hacky but easy way with the homals package and then apply the method by #Ronak Shah linked above:
# The dataset
df <- data.frame(
id = c('text1','text2','text3'),text = c('ABA','ABA','AAA')
)
# Split the strings in characters and add column names
df2 <- df %>% splitstackshape::cSplit('text', sep = '', stripWhite = FALSE, type.convert = FALSE, direction = 'wide') %>%
column_to_rownames('id')
colnames(df2) <- paste0('pos', 1:ncol(df2))
# Convert to binary matrix (hacky way)
bin.mat <- homals:::expandFrame(df2, clean = F)
# Method by #Ronak Shah to get the frequency matrix
fun <- function(x, y) sum(bin.mat[, x] & bin.mat[, y])
n <- seq_along(bin.mat)
mat <- outer(n, n, Vectorize(fun))
dimnames(mat) <- list(names(bin.mat)[n], names(bin.mat[n]))
This produces the matrix:
>mat
pos1_A pos2_A pos2_B pos3_A
pos1_A 3 1 2 3
pos2_A 1 1 0 1
pos2_B 2 0 2 2
pos3_A 3 1 2 3
Here's an alternative approach that produces a matrix as originally requested:
# Make all strings the same length:
df$text <- stringr::str_pad(df$text, side = "right", max(nchar(df$text)))
# Create a matrix with all letters labelled by their position:
all_vals <- apply(do.call(rbind, strsplit(df$text, "")), 1,
function(x) paste0(seq_along(x), x))
# Create a vector of all possible letter / position combos
all_labs <- do.call(paste0, expand.grid(seq(max(nchar(df$text))),
unique(unlist(strsplit(df$text, "")))))
# Create a function that will count all co-occurences per data frame row
f <- function(y, x) as.vector(outer(x, x, function(a, b) 1 * (a %in% y & b %in% y)))
# Create the results matrix and label it
m <- matrix(rowSums(apply(as.data.frame(all_vals), 2, f, all_labs)), nrow = length(all_labs))
rownames(m) <- all_labs
colnames(m) <- all_labs
m
#> 1A 2A 3A 1B 2B 3B
#> 1A 3 1 3 0 2 0
#> 2A 1 1 1 0 0 0
#> 3A 3 1 3 0 2 0
#> 1B 0 0 0 0 0 0
#> 2B 2 0 2 0 2 0
#> 3B 0 0 0 0 0 0
Created on 2020-05-24 by the reprex package (v0.3.0)
Related
I have a data frame - in which I have a column with a lengthy string separated by _. Now I am interested in counting the patterns and several possible combinations from the long string. In the use case I provided below, you can find that I would like to count the occurrence of events A and B but not anything else.
If A and B repeat like A_B or B_A alone or if they repeats itself n number of times, I want to count them and also if there are several occurrences of those combinations.
Example data frame:
participant <- c("A", "B", "C")
trial <- c(1,1,2)
string_pattern <- c("A_B_A_C_A_B", "B_A_B_A_C_D_A_B", "A_B_C_A_B")
df <- data.frame(participant, trial, string_pattern)
Expected output:
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1. A 1 A_B_A_C_A_B 2 1 1 0 0
2. B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3. C 2 A_B_C_A_B 2 0 0 0 0
My code:
revised_df <- df%>%
dplyr::mutate(A_B = stringr::str_count(string_pattern, "A_B"),
B_A = stringr::str_count(string_pattern, "B_A"),
B_A_B = string::str_count(string_pattern, "B_A_B"))
My approach gets complicated as the number of combinations increases. Hence, looking for a better solution.
You could write a function to solve this:
m <- function(s){
a <- seq(nchar(s)-1)
start <- rep(a, rev(a))
stop <- ave(start, start, FUN = \(x)seq_along(x)+x)
b <- substring(s, start, stop)
gsub('(?<=\\B)|(?=\\B)', '_', b, perl = TRUE)
}
n <- function(x){
names(x) <- x
a <- strsplit(gsub("_", '', gsub("_[^AB]+_", ':', x)), ':')
b <- t(table(stack(lapply(a, \(y)unlist(sapply(y, m))))))
data.frame(pattern=x, as.data.frame.matrix(b), row.names = NULL)
}
n(string_pattern)
pattern A_B A_B_A B_A B_A_B B_A_B_A
1 A_B_A_C_A_B 2 1 1 0 0
2 B_A_B_A_C_D_A_B 2 1 2 1 1
3 A_B_C_A_B 2 0 0 0 0
Try: This checks each string row for current column name
library(dplyr)
df |>
mutate(A_B = 0, B_A = 0, A_B_A = 0, B_A_B = 0, B_A_B_A = 0) |>
mutate(across(A_B:B_A_B_A, ~ str_count(string_pattern, cur_column())))
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1 A 1 A_B_A_C_A_B 2 1 1 0 0
2 B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3 C 2 A_B_C_A_B 2 0 0 0 0
I have table with an unequal number of elements in each row, with each element having a count of 1 or 2 appended to a string. I want to create a matrix of presence/absence of each string, but including the count (1,2) and placing a zero if the string is not found.
From this:
V1 V2 V3 V4 V5
1 A cat:2 dog:1 mouse:1 horse:2
2 B dog:2 mouse:2 dolphin:2
3 C horse:2
4 D cat:1 mouse:2 dolphin:2
To this:
cat dog mouse horse dolphin
A 2 1 1 2 0
B 0 2 2 0 2
C 0 0 0 2 0
D 1 0 2 0 2
I have looked up previous solutions to similar problems:
Convert a dataframe to presence absence matrix
put they create a 0/1 matrix of absence, not including the count.
sample data:
structure(list(V1 = c("A", "B", "C", "D"),
V2 = c("cat:2", "dog:2", "horse:2", "cat:1"),
V3 = c("dog:1", "mouse:2", "", "mouse:2"),
V4 = c("mouse:1", "dolphin:2", "", "dolphin:2"),
V5 = c("horse:2", "", "", "")),
.Names = c("V1", "V2", "V3", "V4", "V5"),
class = "data.frame", row.names = c(NA, -4L))
Maybe some package could make this easier, but here is a solution. It won't be fast for large data, but it does the job:
#split the strings
tmp <- apply(DF[,-1], 1, strsplit, ":")
#extract the first strings
names <- lapply(tmp,function(x) c(na.omit(sapply(x, "[", 1))))
uniquenames <- unique(unlist(names))
#extract the numbers
reps <- lapply(tmp,function(x) as.numeric(na.omit(sapply(x, "[", 2))))
#make the numbers named vectors
res <- mapply(setNames, reps, names)
#subset the named vectors and combine result in a matrix
res <- do.call(rbind, lapply(res, "[",uniquenames))
#cosmetics
colnames(res) <- uniquenames
rownames(res) <- DF$V1
res[is.na(res)] <- 0
# cat dog mouse horse dolphin
#A 2 1 1 2 0
#B 0 2 2 0 2
#C 0 0 0 2 0
#D 1 0 2 0 2
You can separate the animals from the counts with separate from tidyr right after melting the data into long format and then dcasting to wide using the counts as values (which need to be casted from character to numeric as a previous step).
data %>%
melt("V1") %>%
separate(value, c("animal", "count"), ":", fill = "left") %>%
transform(count = as.numeric(count)) %>%
dcast(V1 ~ animal, value.var = "count", fun.aggregate = sum) %>%
select(-"NA")
# V1 cat dog dolphin horse mouse
# 1 A 2 1 0 2 1
# 2 B 0 2 2 0 2
# 3 C 0 0 0 2 0
# 4 D 1 0 2 0 2
I have an R data frame containing a factor that I want to "expand" so that for each factor level, there is an associated column in a new data frame, which contains a 1/0 indicator. E.g., suppose I have:
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
I want:
df.desired <- data.frame(foo = c(1,1,0,0), bar=c(0,0,1,1), ham=c(1,2,3,4))
Because for certain analyses for which you need to have a completely numeric data frame (e.g., principal component analysis), I thought this feature might be built in. Writing a function to do this shouldn't be too hard, but I can foresee some challenges relating to column names and if something exists already, I'd rather use that.
Use the model.matrix function:
model.matrix( ~ Species - 1, data=iris )
If your data frame is only made of factors (or you are working on a subset of variables which are all factors), you can also use the acm.disjonctif function from the ade4 package :
R> library(ade4)
R> df <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c("red","blue","green","red"))
R> acm.disjonctif(df)
eggs.bar eggs.foo ham.blue ham.green ham.red
1 0 1 0 0 1
2 0 1 1 0 0
3 1 0 0 1 0
4 1 0 0 0 1
Not exactly the case you are describing, but it can be useful too...
A quick way using the reshape2 package:
require(reshape2)
> dcast(df.original, ham ~ eggs, length)
Using ham as value column: use value_var to override.
ham bar foo
1 1 0 1
2 2 0 1
3 3 1 0
4 4 1 0
Note that this produces precisely the column names you want.
probably dummy variable is similar to what you want.
Then, model.matrix is useful:
> with(df.original, data.frame(model.matrix(~eggs+0), ham))
eggsbar eggsfoo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
A late entry class.ind from the nnet package
library(nnet)
with(df.original, data.frame(class.ind(eggs), ham))
bar foo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
Just came across this old thread and thought I'd add a function that utilizes ade4 to take a dataframe consisting of factors and/or numeric data and returns a dataframe with factors as dummy codes.
dummy <- function(df) {
NUM <- function(dataframe)dataframe[,sapply(dataframe,is.numeric)]
FAC <- function(dataframe)dataframe[,sapply(dataframe,is.factor)]
require(ade4)
if (is.null(ncol(NUM(df)))) {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
names(DF)[1] <- colnames(df)[which(sapply(df, is.numeric))]
} else {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
}
return(DF)
}
Let's try it.
df <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"), x=rnorm(4))
dummy(df)
df2 <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"))
dummy(df2)
Here is a more clear way to do it. I use model.matrix to create the dummy boolean variables and then merge it back into the original dataframe.
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
df.original
# eggs ham
# 1 foo 1
# 2 foo 2
# 3 bar 3
# 4 bar 4
# Create the dummy boolean variables using the model.matrix() function.
> mm <- model.matrix(~eggs-1, df.original)
> mm
# eggsbar eggsfoo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Remove the "eggs" prefix from the column names as the OP desired.
colnames(mm) <- gsub("eggs","",colnames(mm))
mm
# bar foo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Combine the matrix back with the original dataframe.
result <- cbind(df.original, mm)
result
# eggs ham bar foo
# 1 foo 1 0 1
# 2 foo 2 0 1
# 3 bar 3 1 0
# 4 bar 4 1 0
# At this point, you can select out the columns that you want.
I needed a function to 'explode' factors that is a bit more flexible, and made one based on the acm.disjonctif function from the ade4 package.
This allows you to choose the exploded values, which are 0 and 1 in acm.disjonctif. It only explodes factors that have 'few' levels. Numeric columns are preserved.
# Function to explode factors that are considered to be categorical,
# i.e., they do not have too many levels.
# - data: The data.frame in which categorical variables will be exploded.
# - values: The exploded values for the value being unequal and equal to a level.
# - max_factor_level_fraction: Maximum number of levels as a fraction of column length. Set to 1 to explode all factors.
# Inspired by the acm.disjonctif function in the ade4 package.
explode_factors <- function(data, values = c(-0.8, 0.8), max_factor_level_fraction = 0.2) {
exploders <- colnames(data)[sapply(data, function(col){
is.factor(col) && nlevels(col) <= max_factor_level_fraction * length(col)
})]
if (length(exploders) > 0) {
exploded <- lapply(exploders, function(exp){
col <- data[, exp]
n <- length(col)
dummies <- matrix(values[1], n, length(levels(col)))
dummies[(1:n) + n * (unclass(col) - 1)] <- values[2]
colnames(dummies) <- paste(exp, levels(col), sep = '_')
dummies
})
# Only keep numeric data.
data <- data[sapply(data, is.numeric)]
# Add exploded values.
data <- cbind(data, exploded)
}
return(data)
}
(The question is 10yo, but for the sake of completeness...)
The function i() from the fixest package does exactly that.
Beyond creating a design matrix from a factor-like variable, you can also very easily do two extra things on the fly:
binning values (with the argument 'bin'),
excluding some factor values (with the argument ref).
And since it is made for this task, if your variable happens to be numeric you don't need to wrap it with factor(x_num) (as opposed to the model.matrix solution).
Here's an example:
library(fixest)
data(airquality)
table(airquality$Month)
#> 5 6 7 8 9
#> 31 30 31 31 30
head(i(airquality$Month))
#> 5 6 7 8 9
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#> [4,] 1 0 0 0 0
#> [5,] 1 0 0 0 0
#> [6,] 1 0 0 0 0
#
# Binning (check out the help, there are many many ways to bin)
#
colSums(i(airquality$Month, bin = 5:6)))
#> 5 7 8 9
#> 61 31 31 30
#
# References
#
head(i(airquality$Month, ref = c(6, 9)), 3)
#> 5 7 8
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
And here's a little wrapper expanding all non-numeric variables (by default):
library(fixest)
# data: data.frame
# var: vector of variable names // if missing, all non numeric variables
# no argument checking
expand_factor = function(data, var){
if(missing(var)){
var = names(data)[!sapply(data, is.numeric)]
if(length(var) == 0) return(data)
}
data_list = unclass(data)
new = lapply(var, \(x) i(data_list[[x]]))
data_list[names(data_list) %in% var] = new
do.call("cbind", data_list)
}
my_data = data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
expand_factor(my_data)
#> bar foo ham
#> [1,] 0 1 1
#> [2,] 0 1 2
#> [3,] 1 0 3
#> [4,] 1 0 4
Finally, for those wondering, the timing is equivalent to the model.matrix solution.
library(microbenchmark)
my_data = data.frame(x = as.factor(sample(100, 1e6, TRUE)))
microbenchmark(mm = model.matrix(~x, my_data),
i = i(my_data$x), times = 5)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> mm 155.1904 156.7751 209.2629 182.4964 197.9084 353.9443 5
#> i 154.1697 154.7893 159.5202 155.4166 163.9706 169.2550 5
In sapply == over eggs could be used to generate dummy vectors:
x <- with(df.original, data.frame(+sapply(unique(eggs), `==`, eggs), ham))
x
# foo bar ham
#1 1 0 1
#2 1 0 2
#3 0 1 3
#4 0 1 4
all.equal(x, df.desired)
#[1] TRUE
A maybe faster variant - Result best used as list or data.frame:
. <- unique(df.original$eggs)
with(df.original,
data.frame(+do.call(cbind, lapply(setNames(., .), `==`, eggs)), ham))
Indexing in a matrix - Result best used as matrix:
. <- unique(df.original$eggs)
i <- match(df.original$eggs, .)
nc <- length(.)
nr <- length(i)
cbind(matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)), df.original["ham"])
Using outer - Result best used as matrix:
. <- unique(df.original$eggs)
cbind(+outer(df.original$eggs, setNames(., .), `==`), df.original["ham"])
Using rep - Result best used as matrix:
. <- unique(df.original$eggs)
n <- nrow(df.original)
cbind(+matrix(df.original$eggs == rep(., each=n), n, dimnames=list(NULL, .)),
df.original["ham"])
I have two frequency tables created using R's table() function:
freq1 <- table(unlist(strsplit(topic_list1, split=";")))
freq2 <- table(unlist(strsplit(topic_list2, split=";")))
topic_list1 and topic_list2 are strings that contains textual representations of topics separated by ;.
I want a way to compare the two frequencies, graphically if possible.
So if the two lists contain the same topic with different frequencies, I would like to be able to see it. The same goes for topics present in one frequency table, but not in the other.
There's probably a more elegant way to do this, but this ought to work:
# here I'm generating some example data
set.seed(5)
topic_list1 <- paste(sample(letters, 20, replace=T), sep=";")
topic_list2 <- paste(sample(letters, 15, replace=T), sep=";")
# I don't make the tables right away
tl1 <- unlist(strsplit(topic_list1, split=";"))
tl2 <- unlist(strsplit(topic_list2, split=";"))
big_list <- unique(c(tl1, tl2))
# this computes your frequencies
lbl <- length(big_list)
tMat1 <- matrix(rep(tl1, lbl), byrow=T, nrow=lbl)
tMat2 <- matrix(rep(tl2, lbl), byrow=T, nrow=lbl)
tMat1 <- cbind(big_list, tMat1)
tMat2 <- cbind(big_list, tMat2)
counts1 <- apply(tMat1, 1, function(x){sum(x[1]==x[2:length(x)])})
counts2 <- apply(tMat2, 1, function(x){sum(x[1]==x[2:length(x)])})
total_freqs <- rbind(counts1, counts2, counts1-counts2)
# this makes it nice looking & user friendly
colnames(total_freqs) <- big_list
rownames(total_freqs) <- c("topics1", "topics2", "difference")
total_freqs <- total_freqs[ ,order(total_freqs[3,])]
total_freqs
d l a z b f s y m r x h n i g k c v o
topics1 0 0 0 0 0 2 1 1 1 1 2 2 1 1 1 1 2 2 2
topics2 2 2 2 1 1 2 1 1 1 0 1 1 0 0 0 0 0 0 0
difference -2 -2 -2 -1 -1 0 0 0 0 1 1 1 1 1 1 1 2 2 2
From there you could just use the straight numbers or visualize them however you want (e.g, dotplots, etc.). Here's a simple dotplot:
windows()
dotchart(t(total_freqs)[,3], main="Frequencies of topics1 - topics2")
abline(v=0)
You can simply barplot them (with beside=T argument), which will give you a way to visually compare the counts per level ...
below is an example:
counts <- table(mtcars$vs, mtcars$gear)
barplot(counts, col=c("darkblue","red"), legend=rownames(counts), beside=T)
I have an R data frame containing a factor that I want to "expand" so that for each factor level, there is an associated column in a new data frame, which contains a 1/0 indicator. E.g., suppose I have:
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
I want:
df.desired <- data.frame(foo = c(1,1,0,0), bar=c(0,0,1,1), ham=c(1,2,3,4))
Because for certain analyses for which you need to have a completely numeric data frame (e.g., principal component analysis), I thought this feature might be built in. Writing a function to do this shouldn't be too hard, but I can foresee some challenges relating to column names and if something exists already, I'd rather use that.
Use the model.matrix function:
model.matrix( ~ Species - 1, data=iris )
If your data frame is only made of factors (or you are working on a subset of variables which are all factors), you can also use the acm.disjonctif function from the ade4 package :
R> library(ade4)
R> df <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c("red","blue","green","red"))
R> acm.disjonctif(df)
eggs.bar eggs.foo ham.blue ham.green ham.red
1 0 1 0 0 1
2 0 1 1 0 0
3 1 0 0 1 0
4 1 0 0 0 1
Not exactly the case you are describing, but it can be useful too...
A quick way using the reshape2 package:
require(reshape2)
> dcast(df.original, ham ~ eggs, length)
Using ham as value column: use value_var to override.
ham bar foo
1 1 0 1
2 2 0 1
3 3 1 0
4 4 1 0
Note that this produces precisely the column names you want.
probably dummy variable is similar to what you want.
Then, model.matrix is useful:
> with(df.original, data.frame(model.matrix(~eggs+0), ham))
eggsbar eggsfoo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
A late entry class.ind from the nnet package
library(nnet)
with(df.original, data.frame(class.ind(eggs), ham))
bar foo ham
1 0 1 1
2 0 1 2
3 1 0 3
4 1 0 4
Just came across this old thread and thought I'd add a function that utilizes ade4 to take a dataframe consisting of factors and/or numeric data and returns a dataframe with factors as dummy codes.
dummy <- function(df) {
NUM <- function(dataframe)dataframe[,sapply(dataframe,is.numeric)]
FAC <- function(dataframe)dataframe[,sapply(dataframe,is.factor)]
require(ade4)
if (is.null(ncol(NUM(df)))) {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
names(DF)[1] <- colnames(df)[which(sapply(df, is.numeric))]
} else {
DF <- data.frame(NUM(df), acm.disjonctif(FAC(df)))
}
return(DF)
}
Let's try it.
df <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"), x=rnorm(4))
dummy(df)
df2 <-data.frame(eggs = c("foo", "foo", "bar", "bar"),
ham = c("red","blue","green","red"))
dummy(df2)
Here is a more clear way to do it. I use model.matrix to create the dummy boolean variables and then merge it back into the original dataframe.
df.original <-data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
df.original
# eggs ham
# 1 foo 1
# 2 foo 2
# 3 bar 3
# 4 bar 4
# Create the dummy boolean variables using the model.matrix() function.
> mm <- model.matrix(~eggs-1, df.original)
> mm
# eggsbar eggsfoo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Remove the "eggs" prefix from the column names as the OP desired.
colnames(mm) <- gsub("eggs","",colnames(mm))
mm
# bar foo
# 1 0 1
# 2 0 1
# 3 1 0
# 4 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$eggs
# [1] "contr.treatment"
# Combine the matrix back with the original dataframe.
result <- cbind(df.original, mm)
result
# eggs ham bar foo
# 1 foo 1 0 1
# 2 foo 2 0 1
# 3 bar 3 1 0
# 4 bar 4 1 0
# At this point, you can select out the columns that you want.
I needed a function to 'explode' factors that is a bit more flexible, and made one based on the acm.disjonctif function from the ade4 package.
This allows you to choose the exploded values, which are 0 and 1 in acm.disjonctif. It only explodes factors that have 'few' levels. Numeric columns are preserved.
# Function to explode factors that are considered to be categorical,
# i.e., they do not have too many levels.
# - data: The data.frame in which categorical variables will be exploded.
# - values: The exploded values for the value being unequal and equal to a level.
# - max_factor_level_fraction: Maximum number of levels as a fraction of column length. Set to 1 to explode all factors.
# Inspired by the acm.disjonctif function in the ade4 package.
explode_factors <- function(data, values = c(-0.8, 0.8), max_factor_level_fraction = 0.2) {
exploders <- colnames(data)[sapply(data, function(col){
is.factor(col) && nlevels(col) <= max_factor_level_fraction * length(col)
})]
if (length(exploders) > 0) {
exploded <- lapply(exploders, function(exp){
col <- data[, exp]
n <- length(col)
dummies <- matrix(values[1], n, length(levels(col)))
dummies[(1:n) + n * (unclass(col) - 1)] <- values[2]
colnames(dummies) <- paste(exp, levels(col), sep = '_')
dummies
})
# Only keep numeric data.
data <- data[sapply(data, is.numeric)]
# Add exploded values.
data <- cbind(data, exploded)
}
return(data)
}
(The question is 10yo, but for the sake of completeness...)
The function i() from the fixest package does exactly that.
Beyond creating a design matrix from a factor-like variable, you can also very easily do two extra things on the fly:
binning values (with the argument 'bin'),
excluding some factor values (with the argument ref).
And since it is made for this task, if your variable happens to be numeric you don't need to wrap it with factor(x_num) (as opposed to the model.matrix solution).
Here's an example:
library(fixest)
data(airquality)
table(airquality$Month)
#> 5 6 7 8 9
#> 31 30 31 31 30
head(i(airquality$Month))
#> 5 6 7 8 9
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#> [4,] 1 0 0 0 0
#> [5,] 1 0 0 0 0
#> [6,] 1 0 0 0 0
#
# Binning (check out the help, there are many many ways to bin)
#
colSums(i(airquality$Month, bin = 5:6)))
#> 5 7 8 9
#> 61 31 31 30
#
# References
#
head(i(airquality$Month, ref = c(6, 9)), 3)
#> 5 7 8
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
And here's a little wrapper expanding all non-numeric variables (by default):
library(fixest)
# data: data.frame
# var: vector of variable names // if missing, all non numeric variables
# no argument checking
expand_factor = function(data, var){
if(missing(var)){
var = names(data)[!sapply(data, is.numeric)]
if(length(var) == 0) return(data)
}
data_list = unclass(data)
new = lapply(var, \(x) i(data_list[[x]]))
data_list[names(data_list) %in% var] = new
do.call("cbind", data_list)
}
my_data = data.frame(eggs = c("foo", "foo", "bar", "bar"), ham = c(1,2,3,4))
expand_factor(my_data)
#> bar foo ham
#> [1,] 0 1 1
#> [2,] 0 1 2
#> [3,] 1 0 3
#> [4,] 1 0 4
Finally, for those wondering, the timing is equivalent to the model.matrix solution.
library(microbenchmark)
my_data = data.frame(x = as.factor(sample(100, 1e6, TRUE)))
microbenchmark(mm = model.matrix(~x, my_data),
i = i(my_data$x), times = 5)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> mm 155.1904 156.7751 209.2629 182.4964 197.9084 353.9443 5
#> i 154.1697 154.7893 159.5202 155.4166 163.9706 169.2550 5
In sapply == over eggs could be used to generate dummy vectors:
x <- with(df.original, data.frame(+sapply(unique(eggs), `==`, eggs), ham))
x
# foo bar ham
#1 1 0 1
#2 1 0 2
#3 0 1 3
#4 0 1 4
all.equal(x, df.desired)
#[1] TRUE
A maybe faster variant - Result best used as list or data.frame:
. <- unique(df.original$eggs)
with(df.original,
data.frame(+do.call(cbind, lapply(setNames(., .), `==`, eggs)), ham))
Indexing in a matrix - Result best used as matrix:
. <- unique(df.original$eggs)
i <- match(df.original$eggs, .)
nc <- length(.)
nr <- length(i)
cbind(matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)), df.original["ham"])
Using outer - Result best used as matrix:
. <- unique(df.original$eggs)
cbind(+outer(df.original$eggs, setNames(., .), `==`), df.original["ham"])
Using rep - Result best used as matrix:
. <- unique(df.original$eggs)
n <- nrow(df.original)
cbind(+matrix(df.original$eggs == rep(., each=n), n, dimnames=list(NULL, .)),
df.original["ham"])