Vectorise a function with a supplied variable - r

I have a function I've been trying to vectorise going from if(){} to ifelse(). It works fine when all the arguments to the function are contained within the data set it is working on, but if I supply an argument as a string, then the vectorisation stops and the first result is used for the whole data set.
Here's an example
# data
dat <- data.frame(var1 = rep(c(0,1), 4),
var2 = c(rep("a", 4), rep("b", 4))
)
# function
my_fun <- function(x, y){
z <- ifelse(y == "a", fun_a(x), fun_b(x))
return(z)
}
fun_a <- function(x){
z <- ifelse(x == 0, "zero", x)
return(z)
}
fun_b <- function(x){
z <- ifelse(x == 1, "ONE", x)
return(z)
}
dat$var3 <- my_fun(dat$var1, dat$var2)
This returns what I expect, a vector with a row-wise value based on var1 and var2
> dat
var1 var2 var3
1 0 a zero
2 1 a 1
3 0 a zero
4 1 a 1
5 0 b 0
6 1 b ONE
7 0 b 0
8 1 b ONE
However, I want to use this functions on different data sets where var2 is not included. I realise that an easy way around would be to add var2 as an extra column in the data set, but I don't really want to do that.
This is what happens when I supply var2 as a string:
other_dat <- data.frame(var1 = rep(c(0,1), 4))
other_dat$var3 <- my_fun(other_dat$var1, y = "a")
other_dat
var1 var3
1 0 zero
2 1 zero
3 0 zero
4 1 zero
5 0 zero
6 1 zero
7 0 zero
8 1 zero
How can I vectorise this function so that it accepts a string argument and returns the result I desire?

You can vectorise the y i.e. make y of similar length as x and then the ifelse will apply the function my_func on all the values. Revised code:
# data
dat <- data.frame(var1 = rep(c(0,1), 4),
var2 = c(rep("a", 4), rep("b", 4))
)
# function
my_fun <- function(x, y){
if(length(y) == 1) {
y <- rep(y, length(x))
}
z <- ifelse(y == "a", fun_a(x), fun_b(x))
return(z)
}
fun_a <- function(x){
z <- ifelse(x == 0, "zero", x)
return(z)
}
fun_b <- function(x){
z <- ifelse(x == 1, "ONE", x)
return(z)
}
dat$var3 <- my_fun(dat$var1, "a")
other_dat <- data.frame(var1 = rep(c(0,1), 4))
other_dat$var3 <- my_fun(other_dat$var1, y = "a")
other_dat
Hope this helps.

Related

What solves my problem: Map, reduce or a recursion?

I really need some help to write a recursion in R.
The function that I want changes a certain observation according to a set of comparisons between different rows in a data frame, which I shall call g. One of these comparisons depends on the previous value of this same observation.
Suppose first that I want to update the value of column index, row i in my data df in the following way:
j <- 1:4
g <- (df$dom[i] > 0 &
abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
df$index[i] <- ifelse(any(g), which(g)[[1]], df$index[[i]])
The thing is, the object w is actually a list:
w = list(0, 1, 2, df$age[i])
So, as you can see, I want to create a function foo() that updates df$index iteratively. It changes it by looping through w and comparisons depend on updated values.
Here is some data:
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
I am not sure if a recursive function is actually needed or if something like reduce or map would do it.
Thank you!
The following function uses a double for loop to change the values of column index according to the condition defining g. It accepts a data.frame as input and returns the updated data.frame.
foo <- function(x){
change_index <- function(x, i, w){
j <- seq_len(nrow(x))
(x$dom[i] > 0 & abs(x$V2009[i] - x$V2009[j]) <= w) |
x$index[i] == x$index[j]
}
for(i in seq_len(nrow(x))){
W <- list(0, 1, 2, x$age[i])
for(w in W){
g <- change_index(x, i, w)
if(any(g)) x$index[i] <- which(g)[1]
}
}
x
}
foo(df)
# dom V2009 index age
#1 0 9 1 2
#2 0 11 2 2
#3 6 9 1 2
#4 6 11 1 2
One can define w inside a function and use lexical scoping (closure).
Using your instructions, the function index_value calculates for any given i the index value.
correct_index_col returns the corrected df.
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
index_value <- function(df, i) {
j <- nrow(df)
w <- c(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
ifelse(any(g), which(g)[[1]], df$index[[i]])
}
correct_index_col <- function(df) {
indexes <- Vectorize(function(i) {
index_value(df, i)
})
df$index <- indexes(1:nrow(df))
df
}
# > correct_index_col(df)
# dom V2009 index age
# 1 0 9 1 2
# 2 0 11 1 2
# 3 6 9 3 2
# 4 6 11 1 2
#
If you want to really update (mutate) your df, then you have to do
df <- correct_index_col(df).
Here is an attempt of my own. I guess I figured out a way to use recursion over mutate:
test <- function(i, df, k){
j <- 1:nrow(df)
w <- list(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w[k]) |
df$index[i] == df$index[j]
l <- ifelse(any(g), which(g)[1], df$index[i])
return(l)
}
loop <- function(data,
k = 1) {
data <- data %>%
mutate(index = map_dbl(seq(n()),
~ test(.x, df = cur_data(), k)))
if (k == 4) {
return(data)
} else {
return(loop(data, k + 1))
}
}
df %>% loop()
I welcome any comments in case this is inefficient considering large datasets

How can I add more rows to my dataframe every several rows

I have this dataframe looks like this:
ID A B C
1 0 O 1
1 0 P 2
1 0 Q 3
2 0 R 1
2 0 S 2
2 0 T 3
3 0 U 1
3 0 V 2
3 0 W 3
I need to add 48 new rows for each ID(I have more than 3 IDs).
For column A: values in the 48 new cells equal to 1
For column B: values in the 48 new cells equal to the value in the last row (Q for 1, T for 2, W for 3)
For column C: values need to increase by 0.5 for 48 times starting the value in the last row.
If you dataframe is that simple, you can use the method below. If you have a lot more ID numbers, it would be better to write a function (more details below the first solution)
ID <- rep(1:3, each = 3)
C <- rep(1:3, 3)
B <- LETTERS[15:23]
DF <- data.frame(ID, "A" = 0, B, C)
# create 3 separate dataframes
DF1 <- DF [ DF$ID == 1,]
DF2 <- DF [ DF$ID == 2,]
DF3 <- DF [ DF$ID == 3,]
# apply the same routine on each dataframe
DF1[4:51, 'A'] <- 1
DF1[4:51, 'ID'] <- 1
DF1[4:51, 'B'] <- DF1[3, 'B']
DF1[4:51, 'C'] <- DF1[3, 'C'] + seq(from = 0.5, by = 0.5,
length.out = 48)
DF2[4:51, 'A'] <- 1
DF2[4:51, 'ID'] <- 2
DF2[4:51, 'B'] <- DF2[3, 'B']
DF2[4:51, 'C'] <- DF2[3, 'C'] + seq(from = 0.5, by = 0.5,
length.out = 48)
DF3[4:51, 'A'] <- 1
DF3[4:51, 'ID'] <- 3
DF3[4:51, 'B'] <- DF3[3, 'B']
DF3[4:51, 'C'] <- DF3[3, 'C'] + seq(from = 0.5, by = 0.5,
length.out = 48)
# combine the 3 dataframe into 1
DFfinal <- rbind(DF1, DF2, DF3)
# reset the row numbers
rownames(DFfinal) <- NULL
If you have much more IDs that 3, it is better to write a function to split and create a DF'n' and then run that function for each ID to build the full dataframe.
extendIDdf <- function(df, i) {
dfi <- df[ID == i,]
dfi[4:51, 'A'] <- 1
dfi[4:51, 'ID'] <- 1
dfi[4:51, 'B'] <- dfi[3, 'B']
dfi[4:51, 'C'] <- dfi[3, 'C'] + seq(from = 0.5, by = 0.5,
length.out = 48)
return (dfi)
}
uniqueIDs <- unique(DF$ID)
DFF <- DF[ID == "",] # create an empty dataframe with correct columns
for ( i in uniqueIDs) {
DFF <- rbind(DFF, extendIDdf(DF, i)) # create the extended dataframe for the ID == i and then amends the running dataframe with it.
}
rownames(DFF) <- NULL

R: looping through list of variable names in data.frame to create new variables

I am trying to write a function that will take a data.frame, a list (or a character vector) of variable names of the data.frame and create some new variables with names derived from the corresponding variable names in the list and values from the variables named in the list.
For example, if data.frame d has variable x, y, z, w, the list of names is c('x', 'z') the output maybe vectors with names x.cat, z.cat and values based on values of d$x and d$z.
I can do this with a loop
df <- data.frame(x = c(1 : 10), y = c(11 : 20), z = c(21 : 30), w = c(41: 50))
vnames <- c("x", "w")
loopfunc <- function(dat, vlst){
s <- paste(vlst, "cat", sep = ".")
for (i in 1:length(vlst)){
dat[s[i]] <- NA
dat[s[i]][dat[vlst[i]] %% 4 == 0 ] <- 0
dat[s[i]][dat[vlst[i]] %% 4 == 1 | dat[vlst[i]] %%4 == 3] <- 1
dat[s[i]][dat[vlst[i]] %% 4 == 2 ] <- 2
}
dat[s]
}
dout <- loopfunc(df, vnames)
This would output a 10x2 data.frame with columns x.cat and w.cat, the values of these are 0, 1, or 2 depending on the remainder of the corresponding values of df$x and df$w mod 4.
I would like to find a way to something like this without loop, maybe using the apply functions?
Here is a failed attempt
noloopfunc <- function(dat, l){
assign(l[2], NA)
assign(l[2][d[l[1]] %% 4 == 0], 0)
assign(l[2][d[l[1]] %% 4 == 2], 2)
assign(l[2][(d[l[1]] %% 4 == 1) | (d[l[1]] %% 4 == 3)], 1)
as.name(l[2])
}
newvnames <- sapply(vnames, function(x){paste(x, "cat", sep = ".")})
vpairs <- mapply(c, vnames, newvnames, SIMPLIFY = F)
lapply(vpairs, noloopfunc, d <- df)
Here the formal argument l is supposed to represent vpairs[[1]] or vpairs[[2]], both string vectors of length 2.
I found several threads on Stackoverflow on converting strings to variable names but I couldn't find anything where it is used in this way where the variables have to be referred to subsequently and assigned values in a non interactive way.
Thanks for any help.
You can replace your loop with an apply variant
dout <- as.data.frame(sapply(vnames, function(x) {
out <- rep(NA, nrow(df))
out[df[,x] %% 4 == 0] <- 0
out[df[,x] %% 4 == 1 | df[,x] %% 4 == 3] <- 1
out[df[,x] %% 4 == 2] <- 2
out
}))
names(dout) <- paste(vnames, "cat", sep=".")

R, pass column name as argument to function using dplyr::filter() and %in%

How can I pass a column name in a function similar to the question here but using dplyr chaining and filter() together with %in%.
require(dplyr)
set.seed(8)
df <- data.frame(
A=sample(c(1:3), 10, replace=T),
B=sample(c(1:3), 10, replace=T))
If want to get rows where column A is 1 or 2 I can do:
df %>% filter(A %in% c(1,2))
I get:
A B
1 2 3
2 1 2
3 1 3
4 2 1
5 1 1
6 1 3
Now, how can I put this in a function, where one can specify the column, this does not work:
fun1 <- function(x, column, n){
res <-
x %>% filter(column %in% n)
return(res)
}
fun1(df, A, c(1,2))
You could try
fun1 <- function(x, column, n){
x %>%
filter_(lazyeval::interp(quote(x %in% y), x=as.name(column), y=n))
}
fun1(df, 'A', 1:2)
Or
fun2 <- function(x, column, n){
args <- as.list(match.call())
x %>%
filter(eval(args$column, x) %in% n)
}
fun2(df, A, 1:2)
If you want to keep your function, try:
fun1 <- function(x, column, n){
res <- x %>% filter_(paste(column,"%in%",n))
return(res)
}
fun1(df, "A", "c(1,2)")
Try changing your function to
fun1 <- function(x, column, n){
require(lazyeval)
filter_(x,
interp(quote(col %in% n),
col = lazy(column), n = n))
}
all(fun1(df, A, c(1, 2)) == filter(df, A %in% c(1,2)))
# TRUE

Recode dataframe based on one column - in reverse

I asked this question a while ago (Recode dataframe based on one column) and the answer worked perfectly. Now however, i almost want to do the reverse. Namely, I have a (700k * 2000) of 0/1/2 or NA. In a separate dataframe I have two columns (Ref and Obs). The 0 corresponds to two instances of Ref, 1 is one instance of Ref and one instance of Obs and 2 is two Obs. To clarify, data snippet:
Genotype File ---
Ref Obs
A G
T C
G C
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
Current Data---
Sample.1 Sample.2 .... Sample.2000
0 1 2
0 0 0
0 NA 1
mat <- matrix(nrow=3, ncol=3)
mat[,1] <- c(0,0,0)
mat[,2] <- c(1,0,NA)
mat[,3] <- c(2,0,1)
Desired Data format---
Sample.1 Sample.1 Sample.2 Sample.2 Sample.2000 Sample.2000
A A A G G G
T T T T T T
G G 0 0 G C
I think that's right. The desired data format has two columns (space separated) for each sample. 0 in this format (plink ped file for the bioinformaticians out there) is missing data.
MAJOR ASSUMPTION: your data is in 3 element frames, i.e. you want to apply your mapping to the first 3 rows, then the next 3, and so on, which I think makes sense given DNA frames. If you want a rolling 3 element window this will not work (but code can be modified to make it work). This will work for an arbitrary number of columns, and arbitrary number of 3 row groups:
# Make up a matrix with your properties (4 cols, 6 rows)
col <- 4L
frame <- 3L
mat <- matrix(sample(c(0:2, NA_integer_), 2 * frame * col, replace=T), ncol=col)
# Mapping data
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
map.base <- cbind(Ref, Obs)
num.to.let <- matrix(c(1, 1, 1, 2, 2, 2), byrow=T, ncol=2) # how many from each of ref obs
# Function to map 0,1,2,NA to Ref/Obs
re_map <- function(mat.small) { # 3 row matrices, with col columns
t(
mapply( # iterate through each row in matrix
function(vals, map, num.to.let) {
vals.2 <- unlist(lapply(vals, function(x) map[num.to.let[x + 1L, ]]))
ifelse(is.na(vals.2), 0, vals.2)
},
vals=split(mat.small, row(mat.small)), # a row
map=split(map.base, row(map.base)), # the mapping for that row
MoreArgs=list(num.to.let=num.to.let) # general conversion of number to Obs/Ref
) )
}
# Split input data frame into 3 row matrices (assumes frame size 3),
# and apply mapping function to each group
mat.split <- split.data.frame(mat, sort(rep(1:(nrow(mat) / frame), frame)))
mat.res <- do.call(rbind, lapply(mat.split, re_map))
colnames(mat.res) <- paste0("Sample.", rep(1:ncol(mat), each=2))
print(mat.res, quote=FALSE)
# Sample.1 Sample.1 Sample.2 Sample.2 Sample.3 Sample.3 Sample.4 Sample.4
# 1 G G A G G G G G
# 2 C C 0 0 T C T C
# 3 0 0 G C G G G G
# 1 A A A A A G A A
# 2 C C C C T C C C
# 3 C C G G 0 0 0 0
I am not sure but this could be what you need:
first same simple data
geno <- data.frame(Ref = c("A", "T", "G"), Obs = c("G", "C", "C"))
data <- data.frame(s1 = c(0,0,0),s2 = c(1, 0, NA))
then a couple of functions:
f <- function(i , x, geno){
x <- x[i]
if(!is.na(x)){
if (x == 0) {y <- geno[i , c(1,1)]}
if (x == 1) {y <- geno[i, c(1,2)]}
if (x == 2) {y <- geno[i, c(2,2)]}
}
else y <- c(0,0)
names(y) <- c("s1", "s2")
y
}
g <- function(x, geno){
Reduce(rbind, lapply(1:length(x), FUN = f , x = x, geno = geno))
}
The way f() is defined may not be the most elegant but it does the job
Then simply run it as a doble for loop in a lapply fashion
as.data.frame(Reduce(cbind, lapply(data , g , geno = geno )))
hope it helps
Here's one way based on the sample data in your answer:
# create index
idx <- lapply(data, function(x) cbind((x > 1) + 1, (x > 0) + 1))
# list of matrices
lst <- lapply(idx, function(x) {
tmp <- apply(x, 2, function(y) geno[cbind(seq_along(y), y)])
replace(tmp, is.na(tmp), 0)
})
# one data frame
as.data.frame(lst)
# s1.1 s1.2 s2.1 s2.2
# 1 A A A G
# 2 T T T T
# 3 G G 0 0

Resources