Exchange data.table columns with most prevalent value of columns - r

I have data
test = data.table(
a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6)
)
I wish to take the unique values of column a, store it in another data.table, and afterwards fill in the remaining columns with the most prevalent values of those remaining columns, such that my resulting data.table would be:
test2 = data.table(a = c(1,3,4,5,6), b = "a", c = 1)
Column be has equal amounts of "a" and "c", but it doesn't matter which is chosen in those cases.
Attempt so far:
test2 = unique(test, by = "a")
test2[, c("b", "c") := lapply(.SD, FUN = function(x){test2[, .N, by = x][order(-N)][1,1]}), .SDcols = c("b", "c")]
EDIT: I would preferrably like a generic solution that is compatible with a function where I specify the column to be "uniqued", and the rest of the columns are with the single most prevalent value. Hence my use of lapply and .SD =)
EDIT2: as #MichaelChirico points out, how do we keep the class. With the following data.table some of the solutions does not work, although solution of #chinsoon12 does work:
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))

Another option:
dtmode <- function(x) x[which.max(rowid(x))]
test[, .(A=unique(A), B=dtmode(B), C=dtmode(C))]
data:
test = data.table(
A = c(1,1,3,4,5,6),
B = c("a", "be", "a", "c", "d", "c"),
C = rep(1, 6)
)

Not a clean way to do this but it works.
test = data.frame(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
a = unique(test$a)
b = tail(names(sort(table(test$b))), 1)
c = tail(names(sort(table(test$c))), 1)
test2 = cbind(a,b,c)
Output is like this:
> test2
a b c
[1,] "1" "c" "1"
[2,] "3" "c" "1"
[3,] "4" "c" "1"
[4,] "5" "c" "1"
[5,] "6" "c" "1"
>

#EmreKiratli is very close to what I would do:
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) as(tail(names(sort(table(x))), 1L), class(x)))
), .SDcols = !'a']
The as(., class(x)) part is because names in R are always character, so we have to convert back to the original class of x.
You might like this better in magrittr form since it's many nested functions:
library(magrittr)
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) {
table(x) %>% sort %>% names %>% tail(1L) %>% as(class(x))
})
), .SDcols = !'a']

I was able to make an OK solution, but if somebody can do it more elegantly, for example not going through the step of storting a list in refLevel below, please let me know! I'm very interested in learning data.table properly!
#solution:
test = data.table(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
test2 = unique(test, by="a")
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(test[, c("b", "c")], funPrev)
test2[, c("b", "c") := refLevel]
...and using a function (if anybody see any un-necessary step, please let me know):
genData = function(dt, var_unique, vars_prev){
data = copy(dt)
data = unique(data, by = var_unique)
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(dt[, .SD, .SDcols = vars_prev], funPrev)
data[, (vars_prev) := refLevel]
return(data)
}
test2 = genData(test, "a", c("b", "c"))

Here's another variant which one might find less sophisticated, yet more readable. It's essentially chinsoon12's rowid approach generalized for any number of columns. Also the classes are kept.
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
test2 = unique(test, by = "a")
for (col in setdiff(names(test2), "a")) test2[[col]] = test2[[col]][which.max(rowid(test2[[col]]))]

Related

Overriding data.table key order causes incorrect merge results

In the following example I use a dplyr::arrange on a data.table with a key. This overrides the sort on that column:
x <- data.table(a = sample(1000:1100), b = sample(c("A", NA, "B", "C", "D"), replace = TRUE), c = letters)
setkey(x, "a")
# lose order on datatable key
x <- dplyr::arrange(x, b)
y <- data.table(a = sample(1000:1100), f = c(letters, NA), g = c("AA", "BB", NA, NA, NA, NA))
setkey(y, "a")
res <- merge(x, y, by = c("a"), all.x = TRUE)
# try merge with key removed
res2 <- merge(x %>% as.data.frame() %>% as.data.table(), y, by = c("a"), all.x = TRUE)
# merge results are inconsistent
identical(res, res2)
I can see that if I ordered with x <- x[order(b)], I would maintain the sort on the key and the results would be consistent.
I am not sure why I cannot use dplyr::arrange and what relationship the sort key has with the merge. Any insight would be appreciated.
The problem is that with dplyr::arrange(x, b) you do not remove the sorted attribute from your data.table contrary to using x <- x[order(b)] or setorder(x, "b").
The data.table way would be to use setorder in the first place e.g.
library(data.table)
x <- data.table(a = sample(1000:1100), b = sample(c("A", NA, "B", "C", "D"), replace = TRUE), c = letters)
setorder(x, "b", "a", na.last=TRUE)
The wrong results of joins on data.tables which have a key although they are not sorted by it, is a known bug (see also #5361 in data.table bug tracker).

wilcox test inside rows

I have a data frame:
structure(list(groups = c("A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C", "D", "D", "D", "D"), weight = c(50.34869444,
49.20443342, 50.62727386, 50.12316397, 49.84571613, 50.88337532,
48.23188285, 51.13725686, 51.19946209, 49.02212935, 50.00188434,
49.70067628, 50.50444172, 48.88528478, 49.2378029, 49.11125589
), height = c(149.5389985, 150.7241218, 149.6922257, 149.6660622,
150.2770344, 149.6382699, 150.1900336, 151.264749, 151.3418096,
149.9407582, 150.2397936, 149.3163071, 148.079746, 149.1675788,
147.5201934, 150.8203477), age = c(10.18377395, 8.388813147,
9.858806212, 9.859746016, 9.584814407, 9.081315423, 10.67367302,
10.26713746, 10.96606861, 11.58603799, 10.34936347, 9.93621052,
9.584046986, 8.413787028, 10.39826156, 9.977231496), month_birth = c(3.627272074,
1.989467718, 2.175805989, 1.095100584, 2.16437856, 1.215151355,
2.63897628, 0.942159155, 1.155299136, 0.404000756, 1.695590789,
2.739378326, 1.950649717, 1.312775225, 1.904828579, 1.325257624
)), class = "data.frame", row.names = c(NA, -16L))
I want to use wilcox test to compare columns within each group individually
What was I trying to do:
wilcox.fun <- function(dat, col,group.labels) {
c1 <- combn(unique(group.labels),2)
sigs <- list()
for(i in 1:ncol(c1)) {
sigs[[i]] <- wilcox.test(
dat[c1[i,],col],
dat[c1[i,],col]
)
}
names(sigs) <- paste("Group",c1[1,],"by Group",c1[2,])
tests <- data.frame(Test=names(sigs),
W=unlist(lapply(sigs,function(x) x$statistic)),
p=unlist(lapply(sigs,function(x) x$p.value)),row.names=NULL)
return(tests)
}
debug(test.fun)
tests <- lapply(colnames(data[,c(2:6)]),function(x) wilcox.fun(data,group.labels=c(2:6),x))
names(tests) <- colnames(data[,c(2:6)])
I want to use the wilcox test to compare not between groups, but within the same group between the selected columns.
You can try this code to apply wilcox.test for every combination of variables within each group.
wilcox.fun <- function(dat) {
do.call(rbind, combn(names(dat)[-1], 2, function(x) {
test <- wilcox.test(dat[[x[1]]], dat[[x[2]]])
data.frame(Test = sprintf('Group %s by Group %s', x[1], x[2]),
W = test$statistic,
p = test$p.value)
}, simplify = FALSE))
}
result <- purrr::map_df(split(data, data$groups), wilcox.fun, .id = 'Group')

Replace values in vector where not %in% vector

Short question:
I can substitute certain variable values like this:
values <- c("a", "b", "a", "b", "c", "a", "b")
df <- data.frame(values)
What's the easiest way to replace all the values of df$values by "x" (where the value is neither "a" or "b")?
Output should be:
c("a", "b", "a", "b", "x", "a", "b")
Your example is a bit unclear and not reproducible.
However, based on guessing what you actually want, I could suggest trying this option using the data.table package:
df[values %in% c("a", "b"), values := "x"]
or the dplyr package:
df %>% mutate(values = ifelse(values %in% c("a","b"), x, values))
What about:
df[!df[, 1] %in% c("a", "b"), ] <- "x"
values
1 a
2 b
3 a
4 b
5 x
6 a
7 b

Avoid the use of nested loop in multiple column comparison

I have a dataframe like this:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
Ref = c("A", "C", "G", "T", "A", "T"),
Tum1 = c("A", "A", "T", "T", "A", "T"),
Tum2 = c("A", "C", "G", "G", "C", "C"))
What I would like to do is determine the change that is occurring between the Ref or either Tum column. In other words, if Tum1 is different from Tum2 take the character string which is different to the Ref column and store that in a separate column as the change so the dataframe above would become:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
Ref = c("A", "C", "G", "T", "A", "T"),
Tum1 = c("A", "A", "T", "T", "A", "T"),
Tum2 = c("A", "C", "G", "G", "C", "C"),
BaseChange = c("NoCh", "C.A", "G.T", "T.G", "A.C", "T.C"))
I'm aware I could use a nested ifelse() statement like below (but extended) to solve this, but my actual dataframe has many more combinations and I figure there has to be a "safer" method of doing so.
df$BaseChange <- as.factor(ifelse(df$Ref == "C" & df$Tum1 == "A" | df$Ref== "C" & df$Tum2 == "A", "C.A",
ifelse((df$Ref == "G" & df$Tum1 == "T" | df$Ref == "G" & df$Tum2 == "T"), "G.T",...)))
Any help would be greatly appreciated.
It's not pretty, but it works:
df <- df %>%
mutate(BaseChange2 = ifelse( (as.character(Ref)==as.character(Tum1) & as.character(Ref) == as.character(Tum2)), "NoCh",
ifelse(as.character(Ref)==as.character(Tum1),paste(Ref,Tum2, sep="."),paste(Ref,Tum1, sep="."))))
It seems tha you need to paste unique Tums together, i.e.
apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
#[1] "A" "C.A" "G.T" "T.G" "A.C" "T.C"
To replace the first A,
v2 <- apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
replace(v2, nchar(v2) == 1, 'NoChange')
#[1] "NoChange" "C.A" "G.T" "T.G" "A.C" "T.C"

lookup using two columns with unique rows in R data.table

I am wondering if it is possible to use two columns to do a lookup in R data.table. Here is a little experiment that failed:
x <- data.table(A = c("a", "a", "b", "b", "c", "c"),
D = c( 1, 2, 1, 2, 4, 5))
DT <- data.table(A = c("a", "a", "b", "b"),
D = c( 1, 2, 1, 2))
setkey(DT, A, D)
DT[J(x$A, x$D), ] # Same as below, thanks to ilir, I thought it returns an error previously
DT[J(x$A, x$D), , allow.cartesian=TRUE]
# Return:
# A D
# 1: a 1
# 2: a 2
# 3: b 1
# 4: b 2
# 5: c 4 # <- ideally (NA NA) or (c NA)
# 6: c 5 # <- ideally (NA NA) or (c NA)
In this experiment, rows in DT are unique, however, both columns have duplicated entries. When calling DT[J(x$A, x$D), ], what I want to do is to lookup table DT, thus I would expect the result only has entries in DT, however, this is clearly not the case.
Is there an effective way to do a lookup with two columns as keys?
Thanks to ilir, the following code works:
x <- data.table(A = c("a", "a", "b", "b", "c", "c"),
D = c( 1, 2, 1, 2, 4, 5))
DT <- data.table(A = c("a", "a", "b", "b"),
D = c( 1, 2, 1, 2))
DT[, aux := 1L]
setkey(DT, A, D)
DT[J(x$A, x$D), ]
inx <- !is.na(DT[J(x$A, x$D), ]$aux)

Resources