How to efficiently swap elements in a dataframe? - r

Suppose that we have the following dataframe:
set.seed(1)
(tmp <- data.frame(x = 1:10, R1 = sample(LETTERS[1:5], 10, replace =
TRUE), R2 = sample(LETTERS[1:5], 10, replace = TRUE)))
x R1 R2
1 1 B B
2 2 B A
3 3 C D
4 4 E B
5 5 B D
6 6 E C
7 7 E D
8 8 D E
9 9 D B
10 10 A D
I want to do the following: if the difference between the level index
of factor R1 and that of factor R2 is an odd number, the levels of the
two factors need to be switched between them, which can be performed
through the following code:
for(ii in 1:dim(tmp)[1]) {
kk <- which(levels(tmp$R2) %in% tmp[ii,'R2'], arr.ind = TRUE) -
which(levels(tmp$R1) %in% tmp[ii,'R1'], arr.ind = TRUE)
if(kk%%2!=0) { # swap the their levels between the two factors
qq <- tmp[ii,]$R1
tmp[ii,]$R1 <- tmp[ii,]$R2
tmp[ii,]$R2 <- qq
}
}
More concise and efficient ways to achieve this?
P.S. A slightly different situation is the following.
set.seed(1)
(tmp <- data.frame(x = 1:10, R1 = sample(LETTERS[1:5], 10, replace =
TRUE), R2 = sample(LETTERS[2:6], 10, replace = TRUE)))
x R1 R2
1 C B
2 B B
3 C E
4 E C
5 E B
6 D E
7 E E
8 D F
9 C D
10 A E
Notice that the factor levels between the two factors, R1 and R2, slide by one level; that is, factor R1 does not have level F while factor R2 does not have level A. I want to swap the factor levels based on the combined levels of the two factors as shown below:
tl <- unique(c(levels(tmp$R1), levels(tmp$R2)))
for(ii in 1:dim(tmp)[1]) {
kk <- which(tl %in% tmp[ii,'R2'], arr.ind = TRUE) - which(tl %in%
tmp[ii,'R1'], arr.ind = TRUE)
if(kk%%2!=0) { # swap the their levels between the two factors
qq <- tmp[ii,]$R1
tmp[ii,]$R1 <- tmp[ii,]$R2
tmp[ii,]$R2 <- qq
}
}
How to go about this case? Thanks!

#Find out the indices where difference is odd
inds = abs(as.numeric(tmp$R1) - as.numeric(tmp$R2)) %% 2 != 0
#create new columns where values for the appropriate inds are from relevant columns
tmp$R1_new = replace(tmp$R1, inds, tmp$R2[inds])
tmp$R2_new = replace(tmp$R2, inds, tmp$R1[inds])
tmp
# x R1 R2 R1_new R2_new
#1 1 B B B B
#2 2 B A A B
#3 3 C D D C
#4 4 E B B E
#5 5 B D B D
#6 6 E C E C
#7 7 E D D E
#8 8 D E E D
#9 9 D B D B
#10 10 A D D A
Delete the old R1 and R2 if necessary

A solution using dplyr. dt is the final output. Notice that we need to use if_else from dplyr here, not the common ifelse from base R.
library(dplyr)
dt <- tmp %>%
mutate(R1_new = if_else((as.numeric(R2) - as.numeric(R1)) %% 2 != 0, R2, R1),
R2_new = if_else((as.numeric(R2) - as.numeric(R1)) %% 2 != 0, R1, R2)) %>%
select(x, R1 = R1_new, R2 = R2_new)
Update
For the updated case, add one mutate call to redefine the factor level of R1 and R2. The rest is the same.
tl <- unique(c(levels(tmp$R1), levels(tmp$R2)))
dt <- tmp %>%
mutate(R1 = factor(R1, levels = tl), R2 = factor(R2, levels = tl)) %>%
mutate(R1_new = if_else((as.numeric(R2) - as.numeric(R1)) %% 2 != 0, R2, R1),
R2_new = if_else((as.numeric(R2) - as.numeric(R1)) %% 2 != 0, R1, R2)) %>%
select(x, R1 = R1_new, R2 = R2_new)

Here is an option using data.table
library(data.table)
setDT(tmp)[(as.integer(R1) - as.integer(R2))%%2 != 0, c('R2', 'R1') := .(R1, R2)]
tmp
# x R1 R2
#1: 1 B B
#2: 2 A B
#3: 3 D C
#4: 4 B E
#5: 5 B D
#6: 6 E C
#7: 7 D E
#8: 8 E D
#9: 9 D B
#10:10 D A

Related

Copy values of a column between data frames depending on values of another column

I got these two data frames:
a <- c('A','B','C','D','E','F','G','H')
b <- c(1,2,1,3,1,3,1,6)
c <- c('K','K','H','H','K','K','H','H')
frame1 <- data.frame(a,b,c)
a <- c('A','A','B','B','C','C','D','D','E','E','F','F','G','H','H')
d <- c(5,5,6,3,1,9,1,0,2,3,6,5,5,5,4)
e <- c('W','W','D','D','D','D','W','W','D','D','W','W','D','W','W')
frame2<- data.frame(a,d,e)
And now I want to include the column 'e' from 'frame2' into 'frame1' depending on the matching value in column 'a' of both data frames. Note: 'e' is the same for all rows with the same value in 'a'.
The result should look like this:
a b c e
1 A 1 K W
2 B 2 K D
3 C 1 H D
4 D 3 H W
5 E 1 K D
6 F 3 K W
7 G 1 H D
8 H 6 H W
Any sugestions?
You can use match to matching value in column 'a' of both data frames:
frame1$e <- frame2$e[match(frame1$a, frame2$a)]
frame1
# a b c e
#1 A 1 K W
#2 B 2 K D
#3 C 1 H D
#4 D 3 H W
#5 E 1 K D
#6 F 3 K W
#7 G 1 H D
#8 H 6 H W
or using merge:
merge(frame1, frame2[!duplicated(frame2$a), c("a", "e")], all.x=TRUE)
you can perform join operation on 'a' column of both dataframes and take those values only which are matched. you can do left join , and after that remove 'a' column from 2nd dataframe and also remove rest of the columns, which are'nt needed from 2nd dataframe.
Using dplyr :
library(dplyr)
frame2 %>%
distinct(a, e, .keep_all = TRUE) %>%
right_join(frame1, by = 'a') %>%
select(-d) %>%
arrange(a)
# a e b c
#1 A W 1 K
#2 B D 2 K
#3 C D 1 H
#4 D W 3 H
#5 E D 1 K
#6 F W 3 K
#7 G D 1 H
#8 H W 6 H

avoiding nested sapply when collapsing variable in data.frame with multiple factors

I have a dataframe with multiple factors and multiple numeric vars. I would like to collapse one of the factors (say by mean).
In my attempts I could only think of nested sapply or for loops to isolate the numerical elements to be averaged.
var <- data.frame(A = c(rep('a',8),rep('b',8)), B =
c(rep(c(rep('c',2),rep('d',2)),4)), C = c(rep(c('e','f'),8)),
D = rnorm(16), E = rnorm(16))
> var
A B C D E
1 a c e 1.1601720731 -0.57092435
2 a c f -0.0120178626 1.05003748
3 a d e 0.5311032778 1.67867806
4 a d f -0.3399901000 0.01459940
5 a c e -0.2887561691 -0.03847519
6 a c f 0.0004299922 -0.36695879
7 a d e 0.8124655890 0.05444033
8 a d f -0.3777058654 1.34074427
9 b c e 0.7380720821 0.37708543
10 b c f -0.3163496271 0.10921373
11 b d e -0.5543252191 0.35020193
12 b d f -0.5753686426 0.54642790
13 b c e -1.9973216646 0.63597405
14 b c f -0.3728926714 -3.07669300
15 b d e -0.6461596329 -0.61659041
16 b d f -1.7902722068 -1.06761729
sapply(4:ncol(var), function(i){
sapply(1:length(levels(var$A)), function(j){
sapply(1:length(levels(var$B)), function(t){
sapply(1:length(levels(var$C)), function(z){
mean(var[var$A == levels(var$A)[j] &
var$B == levels(var$B)[t] &
var$C == levels(var$C)[z],i])
})
})
})
})
[,1] [,2]
[1,] 0.435707952 -0.3046998
[2,] -0.005793935 0.3415393
[3,] 0.671784433 0.8665592
[4,] -0.358847983 0.6776718
[5,] -0.629624791 0.5065297
[6,] -0.344621149 -1.4837396
[7,] -0.600242426 -0.1331942
[8,] -1.182820425 -0.2605947
Is there a way to do this without this many sapply? maybe with mapply or outer
Maybe just,
var <- data.frame(A = c(rep('a',8),rep('b',8)), B =
c(rep(c(rep('c',2),rep('d',2)),4)), C = c(rep(c('e','f'),8)),
D = rnorm(16), E = rnorm(16))
library(dplyr)
var %>%
group_by(A,B,C) %>%
summarise_if(is.numeric,mean)
(Note that the output you show isn't what I get when I run your sapply code, but the above is identical to what I get when I run your sapply's.)
For inline aggregation (keeping same number of rows of data frame), consider ave:
var$D_mean <- with(var, ave(D, A, B, C, FUN=mean))
var$E_mean <- with(var, ave(E, A, B, C, FUN=mean))
For full aggregation (collapsed to factor groups), consider aggregate:
aggregate(. ~ A + B + C, var, mean)
I will complete the holy trinity with a data.table solution. Here .SD is a data.table of all the columns not listed in the by portion. This is a near-dupe of this question (only difference is >1 column being summarized), so click that if you want more solutions.
library(data.table)
setDT(var)
var[, lapply(.SD, mean), by = .(A, B, C)]
# A B C D E
# 1: a c e 0.07465822 0.032976115
# 2: a c f 0.40789460 -0.944631574
# 3: a d e 0.72054938 0.039781185
# 4: a d f -0.12463910 0.003363382
# 5: b c e -1.64343115 0.806838905
# 6: b c f -1.08122890 -0.707975411
# 7: b d e 0.03937829 0.048136471
# 8: b d f -0.43447899 0.028266455

Apply List of functions on List of columns based on different combinations

I have a dataframe df with three categorical variables cat1,cat2,cat3 and two continuous variables con1,con2. I would like to compute list of functions sd,mean on list of columns con1,con2 based on different combinations of list of columns cat1,cat2,cat3. I have done them explicitly subsetting all different combinations.
# Random generation of values for categorical data
set.seed(33)
df <- data.frame(cat1 = sample( LETTERS[1:2], 100, replace=TRUE ),
cat2 = sample( LETTERS[3:5], 100, replace=TRUE ),
cat3 = sample( LETTERS[2:4], 100, replace=TRUE ),
con1 = runif(100,0,100),
con2 = runif(100,23,45))
# Introducing null values
df$con1[c(23,53,92)] <- NA
df$con2[c(33,46)] <- NA
results <- data.frame()
funs <- list(sd=sd, mean=mean)
# calculation of mean and sd on total observations
sapply(funs, function(x) sapply(df[,c(4,5)], x, na.rm=T))
# calculation of mean and sd on different levels of cat1
sapply(funs, function(x) sapply(df[df$cat1=='A',c(4,5)], x, na.rm=T))
sapply(funs, function(x) sapply(df[df$cat1=='B',c(4,5)], x, na.rm=T))
# calculation of mean and sd on different levels of cat1 and cat2
sapply(funs, function(x) sapply(df[df$cat1=='A' & df$cat2=='C' ,c(4,5)], x, na.rm=T))
.
.
.
sapply(funs, function(x) sapply(df[df$cat1=='B' & df$cat2=='E' ,c(4,5)], x, na.rm=T))
# Similarly for the combinations of three cat variables cat1, cat2, cat3
I would like to write a function on dynamically computing the list of functions for list of columns based on different combinations. Could you please give some suggestions. Thanks !
Edit:
I have already got some smart suggestions using dplyr. It would be great if someone provides suggestions using the apply family functions as it will help in using them(dataframes) in the further requirements.
This is a simple one-line base solution:
> do.call(cbind, lapply(funs, function(x) aggregate(cbind(con1, con2) ~ cat1 + cat2 + cat3, data = df, FUN = x, na.rm = TRUE)))
sd.cat1 sd.cat2 sd.cat3 sd.con1 sd.con2 mean.cat1 mean.cat2 mean.cat3 mean.con1 mean.con2
1 A C B NA NA A C B 25.52641 37.40603
2 B C B 32.67192 6.966547 B C B 46.70387 34.85437
3 A D B 31.05224 6.530313 A D B 37.91553 37.13142
4 B D B 23.80335 6.001468 B D B 59.75107 30.29681
5 A E B 22.79285 1.526472 A E B 38.54742 25.23007
6 B E B 32.92139 2.621067 B E B 51.56253 29.52367
7 A C C 26.98661 5.710335 A C C 36.32045 36.42465
8 B C C 20.22217 8.117184 B C C 60.60036 34.98460
9 A D C 33.39273 7.367412 A D C 40.77786 35.03747
10 B D C 12.95351 8.829061 B D C 49.77160 33.21836
11 A E C 33.73433 4.689548 A E C 55.53135 32.38279
12 B E C 25.38637 9.172137 B E C 46.69063 31.56733
13 A C D 36.12545 6.323929 A C D 48.34187 32.36789
14 B C D 30.01992 7.130869 B C D 53.87571 33.12760
15 A D D 15.94151 11.756115 A D D 35.89909 31.76871
16 B D D 10.89030 6.829829 B D D 22.86577 32.53725
17 A E D 24.88410 6.108631 A E D 47.32549 35.22782
18 B E D 12.73711 8.151424 B E D 33.95569 36.70167

change data.frame column into rows in R

A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
I would like to have this:
df
1 2
A 1 6
B 2 7
C 3 8
D 4 9
E 5 0
If your dataframe is truly in that format, then all of your vectors will be character vectors. Or, you basically have a character matrix and you could do this:
data.frame(t(df))
It would be better, though, to just define it the way you want it from the get-go
df <- data.frame(c('A','B','C','D','E'),
c(1, 2, 3, 4, 5),
c(6, 7, 8, 9, 0))
You could also do this
df <- data.frame(LETTERS[1:5], 1:5, c(6:9, 0))
If you wanted to give the columns names, you could do this
df <- data.frame(L = LETTERS[1:5], N1 = 1:5, N2 = c(6:9, 0))
Sometimes, if I use read.DIF of Excel data the data gets transposed. Is that how you got the original data in? If so, you can call
read.DIF(filename, transpose = T)
to get the data in the correct orientation.
I really recommend data.table approach without manual steps becauce they are error-prone
A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
library('data.table')
dat.m <- melt(as.data.table(df, keep.rownames = "Vars"), id.vars = "Vars") # https://stackoverflow.com/a/44128640/54964
dat.m
Output
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
Vars variable value
1: 1 A 1
2: 2 A 6
3: 1 B 2
4: 2 B 7
5: 1 C 3
6: 2 C 8
7: 1 D 4
8: 2 D 9
9: 1 E 5
10: 2 E 0
R: 3.4.0 (backports)
OS: Debian 8.7

merge two dataframe based on matching two exchangable columns in each dataframe

I have two dataframe in R.
dataframe 1
A B C D E F G
1 2 a a a a a
2 3 b b b c c
4 1 e e f f e
dataframe 2
X Y Z
1 2 g
2 1 h
3 4 i
1 4 j
I want to match dataframe1's column A and B with dataframe2's column X and Y. It is NOT a pairwise comparsions, i.e. row 1 (A=1 B=2) are considered to be same as row 1 (X=1, Y=2) and row 2 (X=2, Y=1) of dataframe 2.
When matching can be found, I would like to add columns C, D, E, F of dataframe1 back to the matched row of dataframe2, as follows: with no matching as na.
Final dataframe
X Y Z C D E F G
1 2 g a a a a a
2 1 h a a a a a
3 4 i na na na na na
1 4 j e e f f e
I can only know how to do matching for single column, however, how to do matching for two exchangable columns and merging two dataframes based on the matching results is difficult for me. Pls kindly help to offer smart way of doing this.
For the ease of discussion (thanks for the comments by Vincent and DWin (my previous quesiton) that I should test the quote.) There are the quota for loading dataframe 1 and 2 to R.
df1 <- data.frame(A = c(1,2,4), B=c(2,3,1), C=c('a','b','e'),
D=c('a','b','e'), E=c('a','b','f'),
F=c('a','c','f'), G=c('a','c', 'e'))
df2 <- data.frame(X = c(1,2,3,1), Y=c(2,1,4,4), Z=letters[7:10])
The following works, but no doubt can be improved.
I first create a little helper function that performs a row-wise sort on A and B (and renames it to V1 and V2).
replace_index <- function(dat){
x <- as.data.frame(t(sapply(seq_len(nrow(dat)),
function(i)sort(unlist(dat[i, 1:2])))))
names(x) <- paste("V", seq_len(ncol(x)), sep="")
data.frame(x, dat[, -(1:2), drop=FALSE])
}
replace_index(df1)
V1 V2 C D E F G
1 1 2 a a a a a
2 2 3 b b b c c
3 1 4 e e f f e
This means you can use a straight-forward merge to combine the data.
merge(replace_index(df1), replace_index(df2), all.y=TRUE)
V1 V2 C D E F G Z
1 1 2 a a a a a g
2 1 2 a a a a a h
3 1 4 e e f f e j
4 3 4 <NA> <NA> <NA> <NA> <NA> i
This is slightly clunky, and has some potential collision and order issues but works with your example
df1a <- df1; df1a$A <- df1$B; df1a$B <- df1$A #reverse A and B
merge(df2, rbind(df1,df1a), by.x=c("X","Y"), by.y=c("A","B"), all.x=TRUE)
to produce
X Y Z C D E F G
1 1 2 g a a a a a
2 1 4 j e e f f e
3 2 1 h a a a a a
4 3 4 i <NA> <NA> <NA> <NA> <NA>
One approach would be to create an id key for matching that is order invariant.
# create id key to match
require(plyr)
df1 = adply(df1, 1, transform, id = paste(min(A, B), "-", max(A, B)))
df2 = adply(df2, 1, transform, id = paste(min(X, Y), "-", max(X, Y)))
# combine data frames using `match`
cbind(df2, df1[match(df2$id, df1$id),3:7])
This produces the output
X Y Z id C D E F G
1 1 2 g 1 - 2 a a a a a
1.1 2 1 h 1 - 2 a a a a a
NA 3 4 i 3 - 4 <NA> <NA> <NA> <NA> <NA>
3 1 4 j 1 - 4 e e f f e
You could also join the tables both ways (X == A and Y == B, then X == B and Y == A) and rbind them. This will produce duplicate pairs where one way yielded a match and the other yielded NA, so you would then reduce duplicates by slicing only a single row for each X-Y combination, the one without NA if one exists.
library(dplyr)
m <- left_join(df2,df1,by = c("X" = "A","Y" = "B"))
n <- left_join(df2,df1,by = c("Y" = "A","X" = "B"))
rbind(m,n) %>%
group_by(X,Y) %>%
arrange(C,D,E,F,G) %>% # sort to put NA rows on bottom of pairs
slice(1) # take top row from combination
Produces:
Source: local data frame [4 x 8]
Groups: X, Y
X Y Z C D E F G
1 1 2 g a a a a a
2 1 4 j e e f f e
3 2 1 h a a a a a
4 3 4 i NA NA NA NA NA
Here's another possible solution in base R. This solution cbind()s new key columns (K1 and K2) to both data.frames using the vectorized pmin() and pmax() functions to derive the canonical order of the key columns, and merges on those:
merge(cbind(df2,K1=pmin(df2$X,df2$Y),K2=pmax(df2$X,df2$Y)),cbind(df1,K1=pmin(df1$A,df1$B),K2=pmax(df1$A,df1$B)),all.x=T)[,-c(1:2,6:7)];
## X Y Z C D E F G
## 1 1 2 g a a a a a
## 2 2 1 h a a a a a
## 3 1 4 j e e f f e
## 4 3 4 i <NA> <NA> <NA> <NA> <NA>
Note that the use of pmin() and pmax() is only possible for this problem because you only have two key columns; if you had more, then you'd have to use some kind of apply+sort solution to achieve the canonical key order for merging, similar to what #Andrie does in his helper function, which would work for any number of key columns, but would be less performant.

Resources