How to keep and remove columns with certain condition simultaneously - r

I have 8 columns of variables which I must keep column 1 to 3. For column 4 to 8 I need to keep those with only 3 levels and drop which does not qualify that condition.
I tried the following command
data3 <- data2[,sapply(data2,function(col)length(unique(col)))==3]
It managed to retain the variables with 3 levels, but deleted my first 3 columns.

You could do a two step process:
data4 <- data2[1:3]
#Your answer for the second part here:
data3 <- data2[,sapply(data2,function(col)length(unique(col)))==3]
merge(data3,data4)
Depending on what you would like your expected output to be, could try with the option all =TRUE inside the merge().

I would suggest another approach:
x = 1:3
cbind(data2[x], Filter(function(i) length(unique(i))==3, data2[-x]))
# 1 2 3 5
#1 a 1 3 b
#2 b 2 4 b
#3 c 3 5 b
#4 d 4 6 a
#5 e 5 7 c
#6 f 6 8 c
#7 g 7 9 c
#8 h 8 10 a
#9 i 9 11 c
#10 j 10 12 b
Data:
data2 = setNames(
data.frame(letters[1:10],
1:10,
3:12,
sample(letters[1:10],10, replace=T),
sample(letters[1:3],10, replace=T)),
1:5)

Assuming that the columns 4:8 are factor class, we can also use nlevels to filter the columns. We create 'toKeep' as the numeric index of columns to keep, and 'toFilter' as numeric index of columns to filter. We subset the dataset into two: 1) using the 'toKeep' as the index (data2[toKeep]), 2) using the 'toFilter', we further subset the dataset by looping with sapply to find the number of levels (nlevels), create logical index (==3) to filter the columns and cbind with the first subset.
toKeep <- 1:3
toFilter <- setdiff(seq_len(ncol(data2)), n)
cbind(data2[toKeep], data2[toFilter][sapply(data2[toFilter], nlevels)==3])
# V1 V2 V3 V4 V6
#1 B B D C B
#2 B D D A B
#3 D E B A B
#4 C B E C A
#5 D D A D E
#6 E B A A B
data
set.seed(24)
data2 <- as.data.frame(matrix(sample(LETTERS[1:5], 8*6, replace=TRUE), ncol=8))

Related

remove cases following certain other cases

I have a dataframe, say
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
I want to remove only those rows in which one or multiple ts are directly in between a d and a c, in all other cases I want to retain the cases. So for this example, I would like to remove the ts on row 8, 18 and 19, but keep the others. I have over thousands of cases so doing this manually would be a true horror. Any help is very much appreciated.
One option would be to use rle to get runs of the same string and then you can use an sapply to check forward/backward and return all the positions you want to drop:
rle_vals <- rle(as.character(df$x))
drop <- unlist(sapply(2:length(rle_vals$values), #loop over values
function(i, vals, lengths) {
if(vals[i] == "t" & vals[i-1] == "d" & vals[i+1] == "c"){#Check if value is "t", previous is "d" and next is "c"
(sum(lengths[1:i-1]) + 1):sum(lengths[1:i]) #Get row #s
}
},vals = rle_vals$values, lengths = rle_vals$lengths))
drop
#[1] 8 18 19
df[-drop,]
# x y
#1 a 2
#2 a 4
#3 b 5
#4 b 2
#5 b 6
#6 c 2
#7 d 4
#9 c 2
#10 b 6
#11 t 2
#12 c 4
#13 t 5
#14 a 2
#15 a 6
#16 b 2
#17 d 4
#20 c 6
This also works, by collapsing to a string, identifying groups of t's between d and c (or c and d - not sure whether you wanted this option as well), then working out where they are and removing the rows as appropriate.
df = data.frame(x=c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y=c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6),stringsAsFactors = FALSE)
dfs <- paste0(df$x,collapse="") #collapse to a string
dfs2 <- do.call(rbind,lapply(list(gregexpr("dt+c",dfs),gregexpr("ct+d",dfs)),
function(L) data.frame(x=L[[1]],y=attr(L[[1]],"match.length"))))
dfs2 <- dfs2[dfs2$x>0,] #remove any -1 values (if string not found)
drop <- unlist(mapply(function(a,b) (a+1):(a+b-2),dfs2$x,dfs2$y))
df2 <- df[-drop,]
Here is another solution with base R:
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
#
s <- paste0(df$x, collapse="")
L <- c(NA, NA)
while (TRUE) {
r <- regexec("dt+c", s)[[1]]
if (r[1]==-1) break
L <- rbind(L, c(pos=r[1]+1, length=attr(r, "match.length")-2))
s <- sub("d(t+)c", "x\\1x", s)
}
L <- L[-1,]
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]
# > drop
# 8 18 19
# > df[-drop, ]
# x y
# 1 a 2
# 2 a 4
# 3 b 5
# 4 b 2
# 5 b 6
# 6 c 2
# 7 d 4
# 9 c 2
# 10 b 6
# 11 t 2
# 12 c 4
# 13 t 5
# 14 a 2
# 15 a 6
# 16 b 2
# 17 d 4
# 20 c 6
With gregexpr() it is shorter:
s <- paste0(df$x, collapse="")
g <- gregexpr("dt+c", s)[[1]]
L <- data.frame(pos=g+1, length=attr(g, "match.length")-2)
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]

Set value of data frame new field equal to another field based on condition on a third field in R

If I want to add a field to a given data frame and setting it equal to an existing field in the same data frame based on a condition on a different (existing) field.
I know this works:
is.even <- function(x) x %% 2 == 0
df <- data.frame(a = c(1,2,3,4,5,6),
b = c("A","B","C","D","E","F"))
df$test[is.even(df$a)] <- as.character(df[is.even(df$a), "b"])
> df
a b test
1 1 A NA
2 2 B B
3 3 C NA
4 4 D D
5 5 E NA
6 6 F F
But I have this feeling it can be done a lot better than this.
Using data.table it's quite easy
library(data.table)
dt = data.table(a = c(1,2,3,4,5,6),
b = c("A","B","C","D","E","F"))
dt[is.even(a), test := b]
> dt
a b test
1: 1 A NA
2: 2 B B
3: 3 C NA
4: 4 D D
5: 5 E NA
6: 6 F F

R: Reshape count matrix to long format with multiple entries

I have a matrix. The entries of the matrix are counts for the combination of the dimension levels. For example:
(m0 <- matrix(1:4, nrow=2, dimnames=list(c("A","B"),c("A","B"))))
A B
A 1 3
B 2 4
I can change it to a long format:
library("reshape")
(m1 <- melt(m0))
X1 X2 value
1 A A 1
2 B A 2
3 A B 3
4 B B 4
But I would like to have multipe entries according to value:
m2 <- m1
for (i in 1:nrow(m1)) {
j <- m1[i,"value"]
k <- 2
while ( k <= j) {
m2 <- rbind(m2,m1[i,])
k = k+1
}
}
> m2 <- subset(m2,select = - value)
> m2[order(m2$X1),]
X1 X2
1 A A
3 A B
31 A B
32 A B
2 B A
4 B B
21 B A
41 B B
42 B B
43 B B
Is there a parameter in melt which considers to multiply the entries according to value? Or any other library which can perform this issue?
We could do this with base R. We convert the dimnames of 'm0' to a 'data.frame' with two columns using expand.grid, then replicate the rows of the dataset with the values in 'm0', order the rows and change the row names to NULL (if necessary).
d1 <- expand.grid(dimnames(m0))
d2 <- d1[rep(1:nrow(d1), c(m0)),]
res <- d2[order(d2$Var1),]
row.names(res) <- NULL
res
# Var1 Var2
#1 A A
#2 A B
#3 A B
#4 A B
#5 B A
#6 B A
#7 B B
#8 B B
#9 B B
#10 B B
Or with melt, we convert the 'm0' to 'long' format and then replicate the rows as before.
library(reshape2)
dM <- melt(m0)
dM[rep(1:nrow(dM), dM$value),1:2]
As #Frank mentioned, we can also use table with as.data.frame to create 'dM'
dM <- as.data.frame(as.table(m0))

Reshape a correlation matrix, including each pair of variables only once

I have a table like this:
A B C D E
7 1 6 8 7
9 3 9 5 9
4 6 2 1 10
10 5 3 4 1
1 3 5 9 3
6 4 8 7 6
I am in the process of finding the correlation of each variable with every other variable in the table. This is the R code I use:
test <- read.csv("D:/AB/test.csv")
iterations <- ncol(test)
correlation <- matrix(ncol = 3 , nrow = iterations * iterations)
for (k in 1:iterations) {
for (l in 1:iterations){
corr <- cor(test[,k], test[,l])
corr_string_A <- names(test[k])
corr_string_B <- names(test[l])
correlation[l + ((k-1) * iterations),] <- rbind(corr_string_A, corr_string_B, corr)
}
}
The following is the output that I received:
Var1 Var2 value
1 A A 1.00000000
2 B A 0.50018605
3 C A -0.35747393
4 D A -0.25670054
5 E A -0.02974821
6 A B 0.50018605
7 B B 1.00000000
8 C B 0.56070716
9 D B 0.46164928
10 E B 0.16813991
11 A C -0.35747393
12 B C 0.56070716
13 C C 1.00000000
14 D C 0.52094589
15 E C 0.23190036
16 A D -0.25670054
17 B D 0.46164928
18 C D 0.52094589
19 D D 1.00000000
20 E D -0.39223227
21 A E -0.02974821
22 B E 0.16813991
23 C E 0.23190036
24 D E -0.39223227
25 E E 1.00000000
However, I don't want the values from the upper triangle; i.e., no diagonal values should occur, and each unique combination should appear only once. The final output should look like:
Var1 Var2 value
1 B A 0.50018605
2 C A -0.35747393
3 D A -0.25670054
4 E A -0.02974821
5 C B 0.56070716
6 D B 0.46164928
7 E B 0.16813991
8 D C 0.52094589
9 E C 0.23190036
10 E D -0.39223227
I understand that there are a few techniques like reshape using which the above output can be achieved, but I want to make the above R code to suit and produce the above mentioned results.
I believe the "n" in the second for loop should be made to change dynamically which can help achieving this. However I am not sure how to make this work.
You can convert your correlation matrix to the 3-column format with as.data.frame and as.table, and then limiting to values above or below the diagonal can be done with subset.
subset(as.data.frame(as.table(cor(dat))),
match(Var1, names(dat)) > match(Var2, names(dat)))
# Var1 Var2 Freq
# 2 B A -0.02299154
# 3 C A 0.23155350
# 4 D A -0.28036851
# 5 E A -0.05230260
# 8 C B -0.58384036
# 9 D B -0.80175393
# 10 E B 0.00000000
# 14 D C 0.52094589
# 15 E C 0.23190036
# 20 E D -0.39223227
Note that for larger datasets this should be much more efficient than separately calling cor on pairs of variables because cor is vectorized, and further it's clearly a lot less typing.
If you really must keep the looping code, then you can achieve your desired result with small changes to the pair of for loops and some book keeping about the row of correlation that you are computing:
iterations <- ncol(test)
correlation <- matrix(ncol = 3 , nrow = choose(iterations, 2))
pos <- 1
for (k in 2:iterations) {
for (l in 1:(k-1)){
corr <- cor(test[,k], test[,l])
corr_string_A <- names(test[k])
corr_string_B <- names(test[l])
correlation[pos,] <- rbind(corr_string_A, corr_string_B, corr)
pos <- pos+1
}
}
However I really wouldn't suggest this looping solution; it would be better to use the one-liner I provided and then to handle all generated NA values afterward.
From the OP's loop output, we can subset the rows,
df1[!duplicated(t(apply(df1[1:2], 1, sort))) & df1[,1]!=df1[,2],]
# Var1 Var2 value
#2 B A 0.50018605
#3 C A -0.35747393
#4 D A -0.25670054
#5 E A -0.02974821
#8 C B 0.56070716
#9 D B 0.46164928
#10 E B 0.16813991
#14 D C 0.52094589
#15 E C 0.23190036
#20 E D -0.39223227
Or as I mentioned (first) in the comments, just use
cor(test)

merge two dataframe based on matching two exchangable columns in each dataframe

I have two dataframe in R.
dataframe 1
A B C D E F G
1 2 a a a a a
2 3 b b b c c
4 1 e e f f e
dataframe 2
X Y Z
1 2 g
2 1 h
3 4 i
1 4 j
I want to match dataframe1's column A and B with dataframe2's column X and Y. It is NOT a pairwise comparsions, i.e. row 1 (A=1 B=2) are considered to be same as row 1 (X=1, Y=2) and row 2 (X=2, Y=1) of dataframe 2.
When matching can be found, I would like to add columns C, D, E, F of dataframe1 back to the matched row of dataframe2, as follows: with no matching as na.
Final dataframe
X Y Z C D E F G
1 2 g a a a a a
2 1 h a a a a a
3 4 i na na na na na
1 4 j e e f f e
I can only know how to do matching for single column, however, how to do matching for two exchangable columns and merging two dataframes based on the matching results is difficult for me. Pls kindly help to offer smart way of doing this.
For the ease of discussion (thanks for the comments by Vincent and DWin (my previous quesiton) that I should test the quote.) There are the quota for loading dataframe 1 and 2 to R.
df1 <- data.frame(A = c(1,2,4), B=c(2,3,1), C=c('a','b','e'),
D=c('a','b','e'), E=c('a','b','f'),
F=c('a','c','f'), G=c('a','c', 'e'))
df2 <- data.frame(X = c(1,2,3,1), Y=c(2,1,4,4), Z=letters[7:10])
The following works, but no doubt can be improved.
I first create a little helper function that performs a row-wise sort on A and B (and renames it to V1 and V2).
replace_index <- function(dat){
x <- as.data.frame(t(sapply(seq_len(nrow(dat)),
function(i)sort(unlist(dat[i, 1:2])))))
names(x) <- paste("V", seq_len(ncol(x)), sep="")
data.frame(x, dat[, -(1:2), drop=FALSE])
}
replace_index(df1)
V1 V2 C D E F G
1 1 2 a a a a a
2 2 3 b b b c c
3 1 4 e e f f e
This means you can use a straight-forward merge to combine the data.
merge(replace_index(df1), replace_index(df2), all.y=TRUE)
V1 V2 C D E F G Z
1 1 2 a a a a a g
2 1 2 a a a a a h
3 1 4 e e f f e j
4 3 4 <NA> <NA> <NA> <NA> <NA> i
This is slightly clunky, and has some potential collision and order issues but works with your example
df1a <- df1; df1a$A <- df1$B; df1a$B <- df1$A #reverse A and B
merge(df2, rbind(df1,df1a), by.x=c("X","Y"), by.y=c("A","B"), all.x=TRUE)
to produce
X Y Z C D E F G
1 1 2 g a a a a a
2 1 4 j e e f f e
3 2 1 h a a a a a
4 3 4 i <NA> <NA> <NA> <NA> <NA>
One approach would be to create an id key for matching that is order invariant.
# create id key to match
require(plyr)
df1 = adply(df1, 1, transform, id = paste(min(A, B), "-", max(A, B)))
df2 = adply(df2, 1, transform, id = paste(min(X, Y), "-", max(X, Y)))
# combine data frames using `match`
cbind(df2, df1[match(df2$id, df1$id),3:7])
This produces the output
X Y Z id C D E F G
1 1 2 g 1 - 2 a a a a a
1.1 2 1 h 1 - 2 a a a a a
NA 3 4 i 3 - 4 <NA> <NA> <NA> <NA> <NA>
3 1 4 j 1 - 4 e e f f e
You could also join the tables both ways (X == A and Y == B, then X == B and Y == A) and rbind them. This will produce duplicate pairs where one way yielded a match and the other yielded NA, so you would then reduce duplicates by slicing only a single row for each X-Y combination, the one without NA if one exists.
library(dplyr)
m <- left_join(df2,df1,by = c("X" = "A","Y" = "B"))
n <- left_join(df2,df1,by = c("Y" = "A","X" = "B"))
rbind(m,n) %>%
group_by(X,Y) %>%
arrange(C,D,E,F,G) %>% # sort to put NA rows on bottom of pairs
slice(1) # take top row from combination
Produces:
Source: local data frame [4 x 8]
Groups: X, Y
X Y Z C D E F G
1 1 2 g a a a a a
2 1 4 j e e f f e
3 2 1 h a a a a a
4 3 4 i NA NA NA NA NA
Here's another possible solution in base R. This solution cbind()s new key columns (K1 and K2) to both data.frames using the vectorized pmin() and pmax() functions to derive the canonical order of the key columns, and merges on those:
merge(cbind(df2,K1=pmin(df2$X,df2$Y),K2=pmax(df2$X,df2$Y)),cbind(df1,K1=pmin(df1$A,df1$B),K2=pmax(df1$A,df1$B)),all.x=T)[,-c(1:2,6:7)];
## X Y Z C D E F G
## 1 1 2 g a a a a a
## 2 2 1 h a a a a a
## 3 1 4 j e e f f e
## 4 3 4 i <NA> <NA> <NA> <NA> <NA>
Note that the use of pmin() and pmax() is only possible for this problem because you only have two key columns; if you had more, then you'd have to use some kind of apply+sort solution to achieve the canonical key order for merging, similar to what #Andrie does in his helper function, which would work for any number of key columns, but would be less performant.

Resources