Calling setdiff() on multiple vectors - r

How can I use setdiff() in R to get the elements that are in one vector but not in the others My example is as follows:
dat1 <- c("osa", "bli", "usd", "mnl")
dat2 <- c("mnu", "erd", "usd", "mnl")
dat3 <- c("ssu", "erd", "usd", "mnl")
The following code only returns what is diffrent in dat1 compared to dat2 and dat3:
diffs <- Reduce(setdiff,
list(A = dat1,
B = dat2,
C = dat3
)
How can I modify this code to be able to get all the elements that are uniquely present in on vector compared to the other? Thanks

another solution using setdiff :
myl <- list(A = dat1,
B = dat2,
C = dat3)
lapply(1:length(myl), function(n) setdiff(myl[[n]], unlist(myl[-n])))
[[1]]
[1] "osa" "bli"
[[2]]
[1] "mnu"
[[3]]
[1] "ssu"

a second possibility :
f <- function (...)
{
aux <- list(...)
ind <- rep(1:length(aux), sapply(aux, length))
x <- unlist(aux)
boo <- !(duplicated(x) | duplicated(x, fromLast = T))
split(x[boo], ind[boo])
}
f(dat1, dat2, dat3)
$`1`
[1] "osa" "bli"
$`2`
[1] "mnu"
$`3`
[1] "ssu"

Try this:
all.dat <- list(dat1, dat2, dat3)
from.dat <- rep(seq_along(all.dat), sapply(all.dat, length))
in.dat <- split(from.dat, unlist(all.dat))
in.one.dat <- in.dat[sapply(in.dat, length) == 1]
in.one.dat
# $bli
# [1] 1
# $mnu
# [1] 2
# $osa
# [1] 1
# $ssu
# [1] 3
which tells you what items are found in only one of the dat objects, and which one. If you only care for the names, then finish with: names(in.one.dat).

Related

mean for replicate lists in R?

I have simulation and data structures as follows (just a toy example):
foo = function(mu=0,lambda=1){
x1 = rnorm(1,mu) #X~N(μ,1)
y1 = rexp(1,lambda) #Y~Exp(λ)
list(x=x1,y=y1)
}
mu = 1; lambda = 2 #true values: E(X)=μ=1; E(Y)=1/λ=0.5
set.seed(0); out = replicate(1000, foo(mu,lambda), simplify=FALSE)
# str(out)
Then we get a list out of length(out)=1000, with each list having out$x and out$y.
I want to compute the means for 1000 out$xs and out$ys, respectively.
Of course, I can reach my goal through a not-clever way as
m = c() #for storing simulated values
for(i in 1:2){
s = sapply( 1:1000, function(j)out[[j]][i] )
m[i] = mean( as.numeric(s) )
}
m
# [1] 0.9736922 0.4999028
Can we use a more simple and efficient way to compute the means? I also try lapply(out, mean)
and Reduce("+",out)/1000, but failed...
This is another option if the sublists are always the same length:
> rowMeans(matrix(unlist(out),2))
[1] 0.9736922 0.4999028
Or:
> rowMeans(replicate(1000,unlist(foo(mu,lambda))))
x y
0.9736922 0.4999028
An option is to use purrr::transpose
library(purrr)
out %>% transpose() %>% map(~ mean(unlist(.x)[1:1000]))
# Or: out[1:1000] %>% transpose() %>% map(~ mean(unlist(.x)))
#$x
#[1] 0.9736922
#
#$y
#[1] 0.4999028
Or a base R solution using lapply (which is essentially the same as your explicit for loop):
lapply(c("x", "y"), function(var) mean(sapply(out[1:1000], "[[", var)))
#[[1]]
#[1] 0.9736922
#
#[[2]]
#[1] 0.4999028

Function to apply mean on a list of vectors without need to list the vectors (changes to a function call)

I have multiple objects and I need to apply some function to them, in my example mean. But the function call shouldn't include list, it must look like this: my_function(a, b, d).
Advise how to do it please, probably I need quote or substitute, but I'm not sure how to use them.
a <- c(1:15)
b <- c(1:17)
d <- c(1:19)
my_function <- function(objects) {
lapply(objects, mean)
}
my_function(list(a, b, d))
A possible solution:
a <- c(1:15)
b <- c(1:17)
d <- c(1:19)
my_function <- function(...) {
lapply(list(...), mean)
}
my_function(a, b, d)
#> [[1]]
#> [1] 8
#>
#> [[2]]
#> [1] 9
#>
#> [[3]]
#> [1] 10
To still be able to benefit from the other arguments of mean such as na.rm= and trim=, i.e. to generalize, we may match the formalArgs with the dots and split the call accordingly.
my_function <- function(...) {
cl <- match.call()
m <- match(formalArgs(base:::mean.default), names(cl), 0L)
vapply(as.list(cl)[-c(1L, m)], function(x) {
eval(as.call(c(quote(base:::mean.default), list(x), as.list(cl[m]))))
}, numeric(1L))
}
## OP's example
my_function(a, b, d)
# [1] 8 9 10
## generalization:
set.seed(42)
my_function(rnorm(12), rnorm(5), c(NA, rnorm(3)))
# [1] 0.7553736 -0.2898547 NA
set.seed(42)
my_function(rnorm(12), rnorm(5), c(NA, rnorm(3)), na.rm=TRUE)
# 0.7553736 -0.2898547 -1.2589363
set.seed(42)
my_function(rnorm(12), rnorm(5), c(NA, rnorm(3)), na.rm=TRUE, trim=.5)
# 0.5185655 -0.2787888 -2.4404669
Data:
a <- 1:15; b <- 1:17; d <- 1:19

Find variables that occur only in one cluster in data.frame in R

Using BASE R, I wonder how to answer the following question:
Are there any value on X or Y (i.e., variables of interest names) that occurs only in one element in m (as a cluster) but not others? If yes, produce my desired output below.
For example:
Here we see X == 3 only occurs in element m[[3]] but not m[[1]] and m[[2]].
Here we also see Y == 99 only occur in m[[1]] but not others.
Note: the following is a toy example, a functional answer is appreciated. AND X & Y may or may not be numeric (e.g., be string).
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,1,1,1,1,1,3,3),
Y = c(99,99,99,99,6,6,6,6))
m <- split(f, f$id) # Here is `m`
mods <- names(f)[-1] # variables of interest names
Desired output:
list(AA = c(Y = 99), CC = c(X = 3))
# $AA
# Y
# 99
# $CC
# X
# 3
This is a solution based on rapply() and table().
ux <- rapply(m, unique)
tb <- table(uxm <- ux[gsub(rx <- "^.*\\.(.*)$", "\\1", names(ux)) %in% mods])
r <- Map(setNames, n <- uxm[uxm %in% names(tb)[tb == 1]], gsub(rx, "\\1", names(n)))
setNames(r, gsub("^(.*)\\..*$", "\\1", names(r)))
# $AA
# Y
# 99
#
# $CC
# X
# 3
tmp = do.call(rbind, lapply(names(f)[-1], function(x){
d = unique(f[c("id", x)])
names(d) = c("id", "val")
transform(d, nm = x)
}))
tmp = tmp[ave(as.numeric(as.factor(tmp$val)), tmp$val, FUN = length) == 1,]
lapply(split(tmp, tmp$id), function(a){
setNames(a$val, a$nm)
})
#$AA
# Y
#99
#$BB
#named numeric(0)
#$CC
#X
#3
This utilizes #jay.sf's idea of rapply() with an idea from a previous answer:
vec <- rapply(lapply(m, '[', , mods), unique)
unique_vec <- vec[!duplicated(vec) & !duplicated(vec, fromLast = T)]
vec_names <- do.call(rbind, strsplit(names(unique_vec), '.', fixed = T))
names(unique_vec) <- vec_names[, 2]
split(unique_vec, vec_names[, 1])
$AA
Y
99
$CC
X
3

Create a function converting every character column of a data frame into a factor one

I created the following function to convert every character column of a data frame (x) into a factor one, but I got an error message as "Error in if (e[i]) { : argument is not interpretable as logical." Any help would be appreciated.
f<-function(x){
e<-lapply(x, is.character)
i <- 1
while (i >= 1) {
if(e[i]) {as.factor(x[[i]])}
else {x[i]}
}
x
}
You can use :
char2factor <- function(df) {
data.frame(lapply(df, function (v) {
if (is.character(v)) factor(v)
else v
}))
}
For example, if you had the following data :
df <- data.frame(v1=LETTERS[1:5],v2=1:5,stringsAsFactors=FALSE)
df
# v1 v2
# 1 A 1
# 2 B 2
# 3 C 3
# 4 D 4
# 5 E 5
lapply(df, class)
# $v1
# [1] "character"
#
# $v2
# [1] "integer"
You would get :
char2factor(df)
# v1 v2
# 1 A 1
# 2 B 2
# 3 C 3
# 4 D 4
# 5 E 5
lapply(char2factor(df), class)
# $v1
# [1] "factor"
#
# $v2
# [1] "integer"
EDIT:
Per joran's comment (this can be done in one succinct line):
Use:
data.frame(lapply(dat, "["), stringsAsFactors = TRUE)
In context:
#make fake data
dat <- data.frame(w = state.abb [1:10], x=LETTERS[1:10], y=rnorm(10),
z =1:10, stringsAsFactors = FALSE)
str(dat)
dat2 <- data.frame(lapply(dat, "["), stringsAsFactors = TRUE)
str(dat2)
This is the approach I think I would take (EDIT-not anymore):
FUN <- function(x) {
if (is.character(x)) {
x <- as.factor(x)
}
x
}
for(i in seq_along(inds)) {
dat[, i] <- FUN(dat[, i])
}
str(dat)
Using colwise from plyr, you can do
dat <- colwise(function(x) {
if(is.character(x)) as.factor(x) else x
})(dat)

Function argument as a part of the output name

Perhaps a silly question, but I can't find any answers to it anywhere (that I've looked :P ). I am trying to create a function with two arguments, these will be vectors (e.g.x=c(a,b,c) and y=c(50,75,100)). I will write a function which calculates all the combinations of these and have the argument used as a part of the output name. E.g.
function(x,y)
df$output_a_50 = a*2+50^2
df$output_a_75 = a*2+75^2
.....
Any suggestions will be appreciated :)
As #Spacedman and others discussed, your problem is that if you pass c(a, b, c) to your function, the names will be lost. The best alternative in my opinion, is to pass a list:
foo <- function(x, y) {
df <- list()
for (xx in names(x)) {
for (yy in y) {
varname <- paste("output", xx, yy, sep = "_")
df[[varname]] <- x[[xx]]*2 + yy^2
}
}
df
}
foo(x = list(a = NA, b = 1, c = 2:3),
y = c(50, 75, 100))
# $output_a_50
# [1] NA
#
# $output_a_75
# [1] NA
#
# $output_a_100
# [1] NA
#
# $output_b_50
# [1] 2502
#
# $output_b_75
# [1] 5627
#
# $output_b_100
# [1] 10002
#
# $output_c_50
# [1] 2504 2506
#
# $output_c_75
# [1] 5629 5631
#
# $output_c_100
# [1] 10004 10006

Resources