Related
I have two data frames with the same number of rows but a different number of columns. The column names also differ, but some columns may have identical content.
i.e. df1:
df1<- data.frame("a"=c("0","1","0","1","0","0","0"),
"b"=c("1","1","1","1","1","0","0"),
"c"=c("1","1","0","0","1","0","0"),
"d"=c("1","1","1","1","1","1","1"))
df2:
df2<- data.frame("e"=c("1","1","0","1","0","0","0"),
"f"=c("1","1","1","1","1","0","0"),
"g"=c("0","0","0","0","1","0","0"),
"h"=c("0","0","0","0","1","1","1"))
If you see, the column "b" of df1 and "f" of df2 are equal. Therefore, the result I want is a new dataframe looking like this:
df3 <- data.frame("a"=c("0","1","0","1","0","0","0"),
"c"=c("1","1","0","0","1","0","0"),
"d"=c("1","1","1","1","1","1","1"),
"e"=c("1","1","0","1","0","0","0"),
"g"=c("0","0","0","0","1","0","0"),
"h"=c("0","0","0","0","1","1","1"))
NOTE: column "b" and "f" (that were similar) are not in the new df3.
I have searched the web but did not find an example of this. I think the main difficulty is that the merge is by content and not by column name.
This would do the job:
# Put all columns side by side, then work on the transpose so that every
# original column becomes a row that duplicated() can compare.
combined <- cbind(df1, df2)
cols_as_rows <- t(combined)
# A column is dropped when an identical copy exists anywhere else, so flag
# repeats scanning both forwards and backwards.
seen_before <- duplicated(cols_as_rows)
seen_after <- duplicated(cols_as_rows, fromLast = TRUE)
# Transposing back restores the column orientation; the result is a matrix.
df3 <- t(cols_as_rows[!(seen_before | seen_after), ])
df3
# a c d e g h
#1 0 1 1 1 0 0
#2 1 1 1 1 0 0
#3 0 0 1 0 0 0
#4 1 0 1 1 0 0
#5 0 1 1 0 1 1
#6 0 0 1 0 0 1
#7 0 0 1 0 0 1
This will give you a matrix; you can convert the result back to a data frame with as.data.frame(df3) if desired.
We can use sapply to check for the columns that perfectly match.
# For one df1 column, test it against every df2 column: TRUE only where all
# values agree position by position.
matches_in_df2 <- function(x) sapply(df2, function(y) all(x == y))
# One column of results per df1 column, one row per df2 column.
mat <- sapply(df1, matches_in_df2)
mat
# a b c d
#e FALSE FALSE FALSE FALSE
#f FALSE TRUE FALSE FALSE
#g FALSE FALSE FALSE FALSE
#h FALSE FALSE FALSE FALSE
Here we can see column b from df1 and column f from df2 should be removed. We can do this by :
# Positions of the matching pairs: column 1 is the row of `mat` (a df2
# column), column 2 is the column of `mat` (a df1 column).
m2 <- which(mat, arr.ind = TRUE)
# Select the columns to keep by position. Negative indexing (df1[-m2[, 2]])
# is avoided on purpose: when nothing matches, m2 has zero rows and an empty
# negative index selects *no* columns instead of all of them.
cbind(df1[setdiff(seq_along(df1), m2[, 2])],
      df2[setdiff(seq_along(df2), m2[, 1])])
# a c d e g h
#1 0 1 1 1 0 0
#2 1 1 1 1 0 0
#3 0 0 1 0 0 0
#4 1 0 1 1 0 0
#5 0 1 1 0 1 1
#6 0 0 1 0 0 1
#7 0 0 1 0 0 1
Here is a more tidyverse solution.
library(dplyr)
library(tidyr)
library(tibble) # rownames_to_column() lives here; dplyr/tidyr do not attach it
# based on Ronak's sapply approach: TRUE where a df2 column (row) is
# identical to a df1 column (column)
matches <- as.data.frame(sapply(df1, function(x) sapply(df2, function(y) identical(x, y)))) %>%
  rownames_to_column(var = "df2") %>%
  pivot_longer(-df2, names_to = "df1") %>% # one row per (df2, df1) column pair
  filter(value) # keep only the matches
# programmatically build list of names to remove
vars_remove <- c(matches$df1, matches$df2) # names on either side of a match
df1 %>%
  bind_cols(df2) %>%
  select(-any_of(vars_remove))
a c d e g h
1 0 1 1 1 0 0
2 1 1 1 1 0 0
3 0 0 1 0 0 0
4 1 0 1 1 0 0
5 0 1 1 0 1 1
6 0 0 1 0 0 1
7 0 0 1 0 0 1
We can use outer from base R
# outer() hands whole vectors of columns to FUN, so the pairwise comparison
# is wrapped with Vectorize() to be applied one column pair at a time.
columns_equal <- Vectorize(function(x, y) all(x == y))
# Rows correspond to df1 columns, columns to df2 columns.
mat <- outer(df1, df2, FUN = columns_equal)
mat
# e f g h
#a FALSE FALSE FALSE FALSE
#b FALSE TRUE FALSE FALSE
#c FALSE FALSE FALSE FALSE
#d FALSE FALSE FALSE FALSE
Now, we can get the row/column names
# Flatten the logical matrix to long form (one row per df1/df2 column pair),
# keep only the pairs flagged TRUE and drop the flag itself; column 1 of 'm2'
# then holds the matching df1 names and column 2 the matching df2 names.
m2 <- as.matrix(subset(as.data.frame.table(mat), Freq, select = -Freq))
Now we use 'm2' to remove the matching columns from 'df1' and 'df2', and then cbind what is left
# Names of the columns on each side that did not take part in a match.
keep_df1 <- setdiff(names(df1), m2[, 1])
keep_df2 <- setdiff(names(df2), m2[, 2])
cbind(df1[keep_df1], df2[keep_df2])
# a c d e g h
#1 0 1 1 1 0 0
#2 1 1 1 1 0 0
#3 0 0 1 0 0 0
#4 1 0 1 1 0 0
#5 0 1 1 0 1 1
#6 0 0 1 0 0 1
#7 0 0 1 0 0 1
I have an issue translating a matrix into a one-hot encoding in R. I implemented it in Matlab, but I have difficulty handling the object in R. Here I have an object of type 'matrix'.
I would like to apply one-hot encoding to this matrix. I have a problem with the column names.
here is an example:
> set.seed(4)
> t <- matrix(floor(runif(10, 1,9)),5,5)
[,1] [,2] [,3] [,4] [,5]
[1,] 5 3 5 3 5
[2,] 1 6 1 6 1
[3,] 3 8 3 8 3
[4,] 3 8 3 8 3
[5,] 7 1 7 1 7
> class(t)
[1] "matrix"
Expecting:
1_1 1_3 1_5 1_7 2_1 2_3 2_6 2_8 ...
[1,] 0 0 1 0 0 1 0 0 ...
[2,] 1 0 0 0 0 0 1 0 ...
[3,] 0 1 0 0 0 0 0 1 ...
[4,] 0 1 0 0 0 0 0 1 ...
[5,] 0 0 0 1 1 0 0 0 ...
I tried the following, but the matrix remains the same.
library(data.table)
library(mltools)
test_table <- one_hot(as.data.table(t))
Any suggestions would be very much appreciated.
Your data table must contain some columns (variables) that have class "factor". Try this:
> t <- data.table(t)
> t[,V1:=factor(V1)]
> one_hot(t)
V1_1 V1_3 V1_5 V1_7 V2 V3 V4 V5
1: 0 0 1 0 3 5 3 5
2: 1 0 0 0 6 1 6 1
3: 0 1 0 0 8 3 8 3
4: 0 1 0 0 8 3 8 3
5: 0 0 0 1 1 7 1 7
But I have read elsewhere that the dummyVars function from the caret package is quicker if your matrix is large.
Edit: Forgot to set the seed. :P
And a quick way to factor all variables in a data table:
t.f <- t[, lapply(.SD, as.factor)]
There are probably more concise ways to do this but this should work (and is at least easy to read and understand ;)
Suggested solution using base R and double loop:
set.seed(4)
# 10 random integers, recycled to fill the 5 x 5 matrix
t <- matrix(floor(runif(10, 1, 9)), 5, 5)

# Build one block of 0/1 indicator columns per original column, then bind all
# blocks in a single step. (Growing the result with cbind() inside a nested
# loop copies the whole matrix on every iteration.)
hot_blocks <- lapply(seq_len(ncol(t)), function(col) {
  # sorted so the resulting columns appear in order
  vals <- sort(unique(t[, col]))
  # compare every row with every unique value at once; * 1 turns the logical
  # matrix into numeric 0/1
  block <- outer(t[, col], vals, FUN = "==") * 1
  # name each indicator "<column index>_<value>"
  if (length(vals) > 0) {
    colnames(block) <- paste0(col, "_", vals)
  }
  block
})
t_hot <- do.call(cbind, hot_blocks)
This returns:
1_1 1_3 1_5 1_7 2_1 2_3 2_6 2_8 3_1 3_3 3_5 3_7 4_1 4_3 4_6 4_8 5_1 5_3 5_5 5_7
[1,] 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0
[2,] 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0
[3,] 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0
[4,] 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0
[5,] 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1
I want to manipulate two columns in R, so that when both events are true, refer to one of the columns to decide the value. For example:
a<- c(0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0)
b<- c(0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0)
When a and b are both true (here at positions 9 and 10), c becomes 1 and from then on follows b in the subsequent rows. When b becomes FALSE at some row (here row 17), c drops back to 0 and we check again for a row where both a and b are true. So, the desired output is like this:
c<- c(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0)
data <- cbind(a,b,c)
data
a b c
[1,] 0 0 0
[2,] 0 1 0
[3,] 0 1 0
[4,] 0 0 0
[5,] 0 0 0
[6,] 1 0 0
[7,] 1 0 0
[8,] 1 0 0
[9,] 1 1 1
[10,] 1 1 1
[11,] 0 1 1
[12,] 0 1 1
[13,] 0 1 1
[14,] 0 1 1
[15,] 0 1 1
[16,] 0 1 1
[17,] 0 0 0
[18,] 0 0 0
As the data has many rows, I would prefer to use a vectorized method like ifelse() to handle this.
Many thanks to all the people who can help me with this.
I'm not 100% sure I understand the issue but here is a tidyverse solution that reproduces the output:
a <- c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)
b <- c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
df <- data.frame(a, b)
library(tidyverse)
df %>%
  mutate(c = case_when(
    # b FALSE always resets c, whatever a is; otherwise a row with a = 1 and
    # b = 0 would inherit an earlier 1 through fill()
    b == 0 ~ 0,
    # both TRUE switches c on
    a == 1 & b == 1 ~ 1,
    # a FALSE but b TRUE: undecided here, carry the previous state
    TRUE ~ NA_real_
  )) %>%
  fill(c) %>%
  # rows before the first decided row have nothing to carry: c starts at 0
  replace_na(list(c = 0))
# a b c
# 1 0 0 0
# 2 0 1 0
# 3 0 1 0
# 4 0 0 0
# 5 0 0 0
# 6 1 0 0
# 7 1 0 0
# 8 1 0 0
# 9 1 1 1
# 10 1 1 1
# 11 0 1 1
# 12 0 1 1
# 13 0 1 1
# 14 0 1 1
# 15 0 1 1
# 16 0 1 1
# 17 0 0 0
# 18 0 0 0
I have a data table that I am trying to reshape, but it doesn't work. How do I do this?
I have a data table:
Name | Value
-------------
Bob | 8,9,10
------------
Mike | 2,3,4
------------
Sandr| 5,6,7
How do I make this into a list like:
Value | Name
-------------
2 | Mike
3 | Mike
4 | Mike
5 | Sandr
6 | Sandr
7 | Sandr
8 | Bob
9 | Bob
10 | Bob
And then make this list into a matrix like:
2 3 4 5 6 7 8 9 10
-------------------
2 | 1 1 1 0 0 0 0 0 0
3 | 1 1 1 0 0 0 0 0 0
4 | 1 1 1 0 0 0 0 0 0
5 | 0 0 0 1 1 1 0 0 0
6 | 0 0 0 1 1 1 0 0 0
7 | 0 0 0 1 1 1 0 0 0
8 | 0 0 0 0 0 0 1 1 1
9 | 0 0 0 0 0 0 1 1 1
10| 0 0 0 0 0 0 1 1 1
The functions you are looking for are stack and contrasts.
data<-list(bob=c(8,9,10),mike=c(2,3,4),sandr=c(5,6,7))
as.data.frame(data)
bob mike sandr
1 8 2 5
2 9 3 6
3 10 4 7
stack(data)
values ind
1 8 bob
2 9 bob
3 10 bob
4 2 mike
5 3 mike
6 4 mike
7 5 sandr
8 6 sandr
9 7 sandr
df<-stack(data)
contrasts(df$ind,contrasts=FALSE)[df$ind,df$ind]
bob bob bob mike mike mike sandr sandr sandr
bob 1 1 1 0 0 0 0 0 0
bob 1 1 1 0 0 0 0 0 0
bob 1 1 1 0 0 0 0 0 0
mike 0 0 0 1 1 1 0 0 0
mike 0 0 0 1 1 1 0 0 0
mike 0 0 0 1 1 1 0 0 0
sandr 0 0 0 0 0 0 1 1 1
sandr 0 0 0 0 0 0 1 1 1
sandr 0 0 0 0 0 0 1 1 1
You can assign row names and column names and sort if desired
# Full indicator coding of the grouping factor (no level dropped) ...
indicator <- contrasts(df$ind, contrasts = FALSE)
# ... expanded so that there is one row and one column per observation.
im <- indicator[df$ind, df$ind]
# Label rows and columns with the observed values instead of the group names.
dimnames(im) <- list(df$values, df$values)
# strip.white removes the padding around the "|" separators so names and
# values compare cleanly; stringsAsFactors = FALSE makes the result the same
# on every R version.
res <- read.table(text="Name | Value
Bob | 8,9,10
Mike | 2,3,4
Sandr| 5,6,7", header = TRUE, sep = "|", strip.white = TRUE,
                  stringsAsFactors = FALSE)
# One list element per person, holding that person's individual values.
value_list <- strsplit(res$Value, ",", fixed = TRUE)
dres <- data.frame(
  Value = as.numeric(unlist(value_list)),
  # repeat each name once per value rather than assuming exactly three
  Name = rep(res$Name, times = lengths(value_list))
)
dres <- dres[order(dres$Value), ]
dres
# TRUE wherever the row value and the column value belong to the same person.
# Comparing the names directly avoids indexing dres by the values themselves,
# which only worked when Value happened to be a factor.
outer(dres$Name, dres$Name, FUN = "==")
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[3,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
[7,] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
[8,] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
[9,] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
I have two factors. Factor A has 2 levels and factor B has 3 levels.
How to create the following design matrix?
factorA1 factorA2 factorB1 factorB2 factorB3
[1,] 1 0 1 0 0
[2,] 1 0 0 1 0
[3,] 1 0 0 0 1
[4,] 0 1 1 0 0
[5,] 0 1 0 1 0
[6,] 0 1 0 0 1
You have a couple of options:
Use base and piece it together yourself:
# One indicator column per Species level; "- 1" removes the intercept so
# that no level is left out.
iris.dummy <- model.matrix(~ Species - 1, data = iris)
print(iris.dummy)
# Append the indicator columns to the original data.
IRIS <- data.frame(iris, iris.dummy)
print(IRIS)
Or use the ade4 package as follows:
#' Replace the factor columns of a data frame with 0/1 indicator columns
#'
#' @param df A data frame. Columns for which `is.factor()` is TRUE are
#'   expanded with `ade4::acm.disjonctif()`; all other columns are kept as is.
#' @return A data frame with the non-factor columns first, followed by the
#'   indicator columns generated for the factor columns. When `df` has no
#'   factor columns, only the non-factor columns are returned.
dummy <- function(df) {
  # vapply() always yields a logical vector, even for a zero-column input.
  ISFACT <- vapply(df, is.factor, logical(1))
  NONFACTS <- df[, !ISFACT, drop = FALSE]
  # Nothing to encode: return early, ade4 is not needed at all.
  if (!any(ISFACT)) {
    return(data.frame(NONFACTS))
  }
  # require() only warns and returns FALSE when the package is missing, which
  # would surface later as a confusing "could not find function" error.
  if (!requireNamespace("ade4", quietly = TRUE)) {
    stop("Package 'ade4' is required to encode factor columns.", call. = FALSE)
  }
  FACTS <- ade4::acm.disjonctif(df[, ISFACT, drop = FALSE])
  data.frame(NONFACTS, FACTS)
}
# stringsAsFactors = TRUE is spelled out because dummy() only expands factor
# columns, and R >= 4.0.0 no longer turns strings into factors by default.
dat <- data.frame(eggs = c("foo", "foo", "bar", "bar"),
                  ham = c("red", "blue", "green", "red"), x = rnorm(4),
                  stringsAsFactors = TRUE)
dummy(dat)
## x eggs.bar eggs.foo ham.blue ham.green ham.red
## 1 0.3365302 0 1 0 0 1
## 2 1.1341354 0 1 1 0 0
## 3 2.0489741 1 0 0 1 0
## 4 1.1019108 1 0 0 0 1
Assuming your data is in a data.frame called dat, let's say the two factors are given as in this example:
> # f1/f2 must be factors (levels() is used on them); R >= 4.0.0 needs this requested explicitly
> dat <- data.frame(f1=sample(LETTERS[1:3],20,TRUE),f2=sample(LETTERS[4:5],20,TRUE),id=1:20,stringsAsFactors=TRUE)
> dat
f1 f2 id
1 C D 1
2 B E 2
3 B E 3
4 A D 4
5 C E 5
6 C E 6
7 C D 7
8 B E 8
9 C D 9
10 A D 10
11 B E 11
12 C E 12
13 B D 13
14 B E 14
15 A D 15
16 C E 16
17 C D 17
18 C D 18
19 B D 19
20 C D 20
> dat$f1
[1] C B B A C C C B C A B C B B A C C C B C
Levels: A B C
> dat$f2
[1] D E E D E E D E D D E E D E D E D D D D
Levels: D E
You can use outer to get a matrix as you showed, for each factor:
> F1 <- with(dat, outer(f1, levels(f1), `==`)*1)
> colnames(F1) <- paste("f1",sep="=",levels(dat$f1))
> F1
f1=A f1=B f1=C
[1,] 0 0 1
[2,] 0 1 0
[3,] 0 1 0
[4,] 1 0 0
[5,] 0 0 1
[6,] 0 0 1
[7,] 0 0 1
[8,] 0 1 0
[9,] 0 0 1
[10,] 1 0 0
[11,] 0 1 0
[12,] 0 0 1
[13,] 0 1 0
[14,] 0 1 0
[15,] 1 0 0
[16,] 0 0 1
[17,] 0 0 1
[18,] 0 0 1
[19,] 0 1 0
[20,] 0 0 1
Now do the same for the second factor:
> F2 <- with(dat, outer(f2, levels(f2), `==`)*1)
> colnames(F2) <- paste("f2",sep="=",levels(dat$f2))
And cbind them to get the final result:
> cbind(F1,F2)
model.matrix is the process that lm and others use in the background to convert for you.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
dat <- data.frame(f1 = sample(LETTERS[1:3], 20, TRUE), f2 = sample(LETTERS[4:5], 20, TRUE), id = 1:20)
dat
# Design matrix for both factors; columns are named after the terms as
# written in the formula (dat$f1..., dat$f2...).
model.matrix(~dat$f1 + dat$f2)
It creates the INTERCEPT variable as a column of 1's, but you can easily remove that if you need.
model.matrix(~dat$f1 + dat$f2)[,-1]
Edit: Now i see that this is essentially the same as one of the other comments, but more concise.
Expanding and generalizing @Ferdinand.kraft's answer:
dat <- data.frame(
  f1 = sample(LETTERS[1:3], 20, TRUE),
  f2 = sample(LETTERS[4:5], 20, TRUE),
  row.names = paste0("id_", 1:20))
# columns to encode; anything else in dat stays out of the design matrix
covariates <- c("f1", "f2")
# Indicator block for one covariate: one column per distinct value, in order
# of first appearance, holding integer 0/1.
encode_covariate <- function(covariate) {
  observed <- dat[[covariate]]
  apply(outer(observed, unique(observed), FUN = "=="), 2, as.integer)
}
design <- do.call(cbind, lapply(covariates, encode_covariate))
rownames(design) <- rownames(dat)
# Column labels are the distinct values themselves, covariate by covariate.
level_labels <- unlist(sapply(covariates, function(covariate) unique(dat[[covariate]])))
colnames(design) <- level_labels
# the same label can occur for more than one covariate; keep the first column
design <- design[, !duplicated(colnames(design))]
design
# C A B D E
# id_1 1 0 0 1 0
# id_2 0 1 0 1 0
# id_3 0 0 1 1 0
# id_4 1 0 0 1 0
# id_5 0 1 0 1 0
# id_6 0 1 0 0 1
# id_7 0 0 1 0 1
Model matrix only allows what it calls "dummy" coding for the first factor in a formula.
If the intercept is present, it plays that role. To get the desired effect of a redundant index matrix (where you have a 1 in every column for the corresponding factor level and 0 elsewhere), you can lie to model.matrix() and pretend there's an extra level. Then trim off the intercept column.
> a=rep(1:2,3)
> b=rep(1:3,2)
> df=data.frame(A=a,B=b)
> # Lie and pretend there's a level 0 in each factor.
> df$A=factor(a,as.character(0:2))
> df$B=factor(b,as.character(0:3))
> mm=model.matrix (~A+B,df)
> mm
(Intercept) A1 A2 B1 B2 B3
1 1 1 0 1 0 0
2 1 0 1 0 1 0
3 1 1 0 0 0 1
4 1 0 1 1 0 0
5 1 1 0 0 1 0
6 1 0 1 0 0 1
attr(,"assign")
[1] 0 1 1 2 2 2
attr(,"contrasts")
attr(,"contrasts")$A
[1] "contr.treatment"
attr(,"contrasts")$B
[1] "contr.treatment"
> # mm has an intercept column not requested, so kill it
> dm=as.matrix(mm[,-1])
> dm
A1 A2 B1 B2 B3
1 1 0 1 0 0
2 0 1 0 1 0
3 1 0 0 0 1
4 0 1 1 0 0
5 1 0 0 1 0
6 0 1 0 0 1
> # You can also add interactions
> mm2=model.matrix (~A*B,df)
> dm2=as.matrix(mm2[,-1])
> dm2
A1 A2 B1 B2 B3 A1:B1 A2:B1 A1:B2 A2:B2 A1:B3 A2:B3
1 1 0 1 0 0 1 0 0 0 0 0
2 0 1 0 1 0 0 0 0 1 0 0
3 1 0 0 0 1 0 0 0 0 1 0
4 0 1 1 0 0 0 1 0 0 0 0
5 1 0 0 1 0 0 0 1 0 0 0
6 0 1 0 0 1 0 0 0 0 0 1
Things get complicated with model.matrix() again if we add a covariate x and interactions of x with factors.
a <- rep(1:2, times = 3)
b <- rep(1:3, times = 2)
x <- 1:6
df <- data.frame(A = a, B = b, x = x)
# Lie and pretend there's a level 0 in each factor.
df$A <- factor(a, levels = as.character(0:2))
df$B <- factor(b, levels = as.character(0:3))
# Factor main effects plus an x-by-factor interaction for each factor.
mm <- model.matrix(~ A + B + A:x + B:x, data = df)
print(mm)
(Intercept) A1 A2 B1 B2 B3 A0:x A1:x A2:x B1:x B2:x B3:x
1 1 1 0 1 0 0 0 1 0 1 0 0
2 1 0 1 0 1 0 0 0 2 0 2 0
3 1 1 0 0 0 1 0 3 0 0 0 3
4 1 0 1 1 0 0 0 0 4 4 0 0
5 1 1 0 0 1 0 0 5 0 0 5 0
6 1 0 1 0 0 1 0 0 6 0 0 6
So mm has an intercept, but now A:x interaction terms have an unwanted level A0:x
If we reintroduce x as a separate term, we will cancel that unwanted level
# Same model with x added as a term of its own, listed first.
mm2 <- model.matrix(~ x + A + B + A:x + B:x, data = df)
print(mm2)
(Intercept) x A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1 1 1 1 0 1 0 0 1 0 1 0 0
2 1 2 0 1 0 1 0 0 2 0 2 0
3 1 3 1 0 0 0 1 3 0 0 0 3
4 1 4 0 1 1 0 0 0 4 4 0 0
5 1 5 1 0 0 1 0 5 0 0 5 0
6 1 6 0 1 0 0 1 0 6 0 0 6
We can get rid of the unwanted intercept and the unwanted bare x term
# Drop the first two columns of mm2 (the intercept and the bare x term).
dm2 <- as.matrix(mm2[, -(1:2)])
print(dm2)
A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1 1 0 1 0 0 1 0 1 0 0
2 0 1 0 1 0 0 2 0 2 0
3 1 0 0 0 1 3 0 0 0 3
4 0 1 1 0 0 0 4 4 0 0
5 1 0 0 1 0 5 0 0 5 0
6 0 1 0 0 1 0 6 0 0 6