Referencing a dataframe recursively - r

Is there a way to have a dataframe refer to itself?
I find myself spending a lot of time writing things like y$Category1[is.na(y$Category1)] <- NULL, which is hard to read and feels like a lot of slow, repetitive typing. I wondered if there was something along the lines of:
y$Category1[is.na(self)] <- NULL that I could use instead.
Thanks

What a great question. Unfortunately, as @user295691 pointed out in the comments, the issue is with referencing a vector twice: once as the object being indexed and once as the subject of a condition. It does appear impossible to avoid the double reference.
numericVector[cond(numericVector)] <- newVal
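Even base helpers that wrap this pattern still take the vector twice. A minimal sketch using base R's replace() (the replacement value 0 is just for illustration):
# replace() hides the assignment arrow but still names the vector in two argument slots
y$Category1 <- replace(y$Category1, is.na(y$Category1), 0)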
What I think we can do is have a nice and neat function so that instead of
# this
y$Category1[is.na(y$Category1)] <- list(NULL)
# we can have this:
NAtoNULL(y$Category1)
For example, the following functions wrap selfAssign() (below):
NAtoNULL(obj) # Replaces NA values in obj with NULL.
NAtoVal(obj, val) # Replaces NA values in obj with val.
selfReplace(obj, toReplace, val) # Replaces toReplace values in obj with val
# and selfAssign can be called directly, but I'm not sure there would be a good reason to
selfAssign(obj, ind, val) # equivalent to obj[ind] <- val
Example:
# sample df
df <- structure(list(subj=c("A",NA,"C","D","E",NA,"G"),temp=c(111L,112L,NA,114L,115L,116L,NA),size=c(0.7133,NA,0.7457,NA,0.0487,NA,0.8481)),.Names=c("subj","temp","size"),row.names=c(NA,-7L),class="data.frame")
df
  subj temp   size
1    A  111 0.7133
2 <NA>  112     NA
3    C   NA 0.7457
4    D  114     NA
5    E  115 0.0487
6 <NA>  116     NA
7    G   NA 0.8481
# Make some replacements
NAtoNULL(df$size)   # Replace all NA's in df$size with NULL's
NAtoVal(df$temp, 0) # Replace all NA's in df$temp with 0's
NAtoVal(df$subj, c("B", "E")) # Replace all NA's in df$subj with alternating "B" and "E"
# the modified df is now:
df
  subj temp   size
1    A  111 0.7133
2    B  112   NULL
3    C    0 0.7457
4    D  114   NULL
5    E  115 0.0487
6    E  116   NULL
7    G    0 0.8481
# replace the 0's in temp with NA
selfReplace(df$temp, 0, NA)
# replace NULL's in size with 1's
selfReplace(df$size, NULL, 1)
# replace all "E"'s in subj with alternating c("E", "F")
selfReplace(df$subj, c("E"), c("E", "F"))
df
  subj temp   size
1    A  111 0.7133
2    B  112      1
3    C   NA 0.7457
4    D  114      1
5    E  115 0.0487
6    F  116      1
7    G   NA 0.8481
Right now this works for vectors, but it will fail with *apply. I would love to get it working fully, especially with plyr. The key would be to modify how the enclosing call is matched, as noted below.
FUNCTIONS
The code for the functions is below.
An important point: this does not (yet!) work with *apply / plyr.
I believe it can be made to by modifying the value of n and adjusting sys.parent(.) in match.call(), but it still needs some fiddling.
Any suggestions / modifications would be greatly appreciated.
selfAssign <- function(self, ind, val, n=1, silent=FALSE) {
  ## assigns val to self[ind] in environment parent.frame(n)
  ## self should be a vector. Currently will not work for matrices or data frames

  ## GRAB THE CORRECT MATCH CALL
  #--------------------------------------
  # if nested function, match.call appropriately
  if (class(match.call()) == "call") {
    mc <- match.call(call=sys.call(sys.parent(1)))
  } else {
    mc <- match.call()
  }
  # needed in case self is complex (i.e. df$name)
  mc2 <- paste(as.expression(mc[[2]]))

  ## CLEAN UP ARGUMENT VALUES
  #--------------------------------------
  # replace logical indices with numeric indices
  if (is.logical(ind))
    ind <- which(ind)
  # if no indices will be selected, stop here
  if (identical(ind, integer(0)) || is.null(ind)) {
    if (!silent) warning("No indices selected")
    return()
  }
  # if val is a string, we need to wrap it in quotes
  if (is.character(val))
    val <- paste('"', val, '"', sep="")
  # val cannot directly be NULL, must be list(NULL)
  if (is.null(val))
    val <- "list(NULL)"

  ## CREATE EXPRESSIONS AND EVAL THEM
  #--------------------------------------
  # create expressions to evaluate
  ret <- paste0("'[['(", mc2, ", ", ind, ") <- ", val)
  # evaluate in parent.frame(n)
  eval(parse(text=ret), envir=parent.frame(n))
}
NAtoNULL <- function(obj, n=1) {
  selfAssign(match.call()[[2]], is.na(obj), NULL, n=n+1)
}
NAtoVal <- function(obj, val, n=1) {
  selfAssign(match.call()[[2]], is.na(obj), val, n=n+1)
}
selfReplace <- function(obj, toReplace, val, n=1) {
  ## replaces occurrences of toReplace within obj with val
  # determine ind based on value & length of toReplace
  # TODO: this will not work properly for data frames, but neither will selfAssign, yet.
  if (is.null(toReplace)) {
    ind <- sapply(obj, function(x) is.null(x[[1]]))
  } else if (is.na(toReplace)) {
    ind <- is.na(obj)
  } else {
    if (length(obj) > 1) { # note, this won't work for data frames
      ind <- obj %in% toReplace
    } else {
      ind <- obj == toReplace
    }
  }
  selfAssign(match.call()[[2]], ind, val, n=n+1)
}
## THIS SHOULD GO INSIDE NAtoNULL, NAtoVal etc.
# todo: modify for use with *apply
if (substr(paste(as.expression(x1)), 1, 10) == "FUN(obj = ") {
  # PASS. This should identify when the call is coming from *apply.
  # In such a case, need to increase n by 1 for apply & lapply, and by 2 for sapply.
  # I'm not sure of the increase required for the plyr functions.
}
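To see why n must track the number of intervening call frames, here is a small standalone sketch (the helper names are hypothetical, purely for illustration):
whereAmI <- function(n = 1) environmentName(parent.frame(n))
f1 <- function() whereAmI(2)  # skip f1's own frame and report its caller's
whereAmI()  # "R_GlobalEnv" when called from the console
f1()        # also "R_GlobalEnv": the extra calling layer is offset by n = 2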

Related

R: fast (conditional) subsetting where feasible

I would like to subset rows of my data
library(data.table); set.seed(333); n <- 100
dat <- data.table(id=1:n, x=runif(n,100,120), y=runif(n,200,220), z=runif(n,300,320))
> head(dat)
id x y z
1: 1 109.3400 208.6732 308.7595
2: 2 101.6920 201.0989 310.1080
3: 3 119.4697 217.8550 313.9384
4: 4 111.4261 205.2945 317.3651
5: 5 100.4024 212.2826 305.1375
6: 6 114.4711 203.6988 319.4913
in several stages. I am aware that I could apply subset(.) sequentially to achieve this.
> s <- subset(dat, x>119)
> s <- subset(s, y>219)
> subset(s, z>315)
id x y z
1: 55 119.2634 219.0044 315.6556
My problem is that I need to automate this and it might happen that the subset is empty. In this case, I would want to skip the step(s) that result in an empty set. For example, if my data was
dat2 <- dat[1:50]
> s <-subset(dat2,x>119)
> s
id x y z
1: 3 119.4697 217.8550 313.9384
2: 50 119.2519 214.2517 318.8567
the second step subset(s, y>219) would come up empty but I would still want to apply the third step subset(s,z>315). Is there a way to apply a subset-command only if it results in a non-empty set? I imagine something like subset(s, y>219, nonzero=TRUE). I would want to avoid constructions like
s <- dat
if(nrow(subset(s, x>119))>0){s <- subset(s, x>119)}
if(nrow(subset(s, y>219))>0){s <- subset(s, y>219)}
if(nrow(subset(s, z>315))>0){s <- subset(s, z>315)}
because I fear the if-then jungle would be rather slow, especially since I need to apply all of this to different data.tables within a list using lapply(.). That's why I am hoping to find a solution optimized for speed.
PS. I only chose subset(.) for clarity, solutions with e.g. data.table would be just as welcome if not more so.
I agree with Konrad's answer that this should throw a warning or at least report what happens somehow. Here's a data.table way that will take advantage of indices (see package vignettes for details):
f = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond, verbose=v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)){
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  print(mon)
  return(x)
}
Usage
> f(dat, x > 119, y > 219, y > 1e6)
cond skip
1: x > 119 FALSE
2: y > 219 FALSE
3: y > 1e+06 TRUE
id x y z
1: 55 119.2634 219.0044 315.6556
The verbose option will print extra info provided by the data.table package, so you can see when indices are being used. For example, with f(dat, x == 119, verbose=TRUE), I can see the index being used.
because I fear the if-then jungle would be rather slow, especially since I need to apply all of this to different data.tables within a list using lapply(.).
If it's for non-interactive use, maybe better to have the function return list(mon = mon, x = x) to more easily keep track of what the query was and what happened. Also, the verbose console output could be captured and returned.
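A minimal sketch of that variant, reusing the body of f() above and changing only the return value:
f2 = function(x, ..., verbose = FALSE){
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond, verbose = v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)) x = d else mon[i, skip := TRUE]
  }
  list(mon = mon, x = x)  # return the query log together with the filtered result
}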
An interesting approach can be developed using a modified filter function offered in dplyr. When a condition is not met, the non_empty_filter function returns the original data set.
Notes
IMHO, this is fairly non-standard behaviour and should be reported via a warning. Of course, the warning can be removed and has no bearing on the function's results.
Function
library(tidyverse)
library(rlang) # enquo
non_empty_filter <- function(df, expr) {
  expr <- enquo(expr)
  res <- df %>% filter(!!expr)
  if (nrow(res) > 0) {
    return(res)
  } else {
    # Indicate that the filter was not applied
    warning("No rows meeting condition")
    return(df)
  }
}
Condition met
Behaviour: Returning one row for which the condition is met.
dat %>%
  non_empty_filter(x > 119 & y > 219)
Results
# id x y z
# 1 55 119.2634 219.0044 315.6556
Condition not met
Behaviour: Returning the full data set as the whole condition is not met due to y > 1e6.
dat %>%
  non_empty_filter(x > 119 & y > 219 & y > 1e6)
Results
# id x y z
# 1: 1 109.3400 208.6732 308.7595
# 2: 2 101.6920 201.0989 310.1080
# 3: 3 119.4697 217.8550 313.9384
# 4: 4 111.4261 205.2945 317.3651
# 5: 5 100.4024 212.2826 305.1375
# 6: 6 114.4711 203.6988 319.4913
# 7: 7 112.1879 209.5716 319.6732
# 8: 8 106.1344 202.2453 312.9427
# 9: 9 101.2702 210.5923 309.2864
# 10: 10 106.1071 211.8266 301.0645
Condition met/not met one-by-one
Behaviour: Skipping filter that would return an empty data set.
dat %>%
  non_empty_filter(y > 1e6) %>%
  non_empty_filter(x > 119) %>%
  non_empty_filter(y > 219)
Results
# id x y z
# 1 55 119.2634 219.0044 315.6556
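Since the question mentions applying this across several data.tables in a list, a usage sketch (the list and its names are hypothetical) could look like:
listOfDats <- list(full = dat, first50 = dat[1:50])
lapply(listOfDats, function(d) d %>%
  non_empty_filter(x > 119) %>%
  non_empty_filter(y > 219) %>%
  non_empty_filter(z > 315))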

Nested ifelse: improved syntax

Description
The ifelse() function lets you transform the values in a vector through a series of tests, each of them producing a different action in case of a positive result. For instance, let xx be a data.frame, as follows:
xx <- data.frame(a=c(1,2,1,3), b=1:4)
xx
  a b
1 1 1
2 2 2
3 1 3
4 3 4
Suppose that you want to create a new column, c, from column b, but depending on the values in column a in the following way:
For each row,
if the value in column a is 1, the value in column c is the same value as in column b;
if the value in column a is 2, the value in column c is 100 times the value in column b;
in any other case, the value in column c is the negative of the value in column b.
Using ifelse(), a solution could be:
xx$c <- ifelse(xx$a==1, xx$b,
ifelse(xx$a==2, xx$b*100,
-xx$b))
xx
  a b   c
1 1 1   1
2 2 2 200
3 1 3   3
4 3 4  -4
Problem 1
An aesthetic problem arises when the number of tests increases, say, four tests:
xx$c <- ifelse(xx$a==1, xx$b,
ifelse(xx$a==2, xx$b*100,
ifelse(xx$a==3, ...,
ifelse(xx$a==4, ...,
...))))
I found a partial solution to the problem on this page, which consists in the definition of the functions if.else_(), i_(), and e_(), as follows:
library(lazyeval)
i_ <- function(if_stat, then) {
  if_stat <- lazyeval::expr_text(if_stat)
  then <- lazyeval::expr_text(then)
  sprintf("ifelse(%s, %s, ", if_stat, then)
}
e_ <- function(else_ret) {
  else_ret <- lazyeval::expr_text(else_ret)
  else_ret
}
if.else_ <- function(...) {
  args <- list(...)
  for (i in 1:(length(args) - 1)) {
    if (substr(args[[i]], 1, 6) != "ifelse") {
      stop("All but the last argument need to be if.then_ functions.", call. = FALSE)
    }
  }
  if (substr(args[[length(args)]], 1, 6) == "ifelse") {
    stop("Last argument needs to be an else_ function.", call. = FALSE)
  }
  args$final <- paste(rep(')', length(args) - 1), collapse = '')
  eval_string <- do.call('paste', args)
  eval(parse(text = eval_string))
}
In this way, the problem given in the Description can be rewritten as follows:
xx <- data.frame(a=c(1,2,1,3), b=1:4)
xx$c <- if.else_(
i_(xx$a==1, xx$b),
i_(xx$a==2, xx$b*100),
e_(-xx$b)
)
xx
  a b   c
1 1 1   1
2 2 2 200
3 1 3   3
4 3 4  -4
And the code for the four tests will simply be:
xx$c <- if.else_(
i_(xx$a==1, xx$b),
i_(xx$a==2, xx$b*100),
i_(xx$a==3, ...), # dots meaning actions for xx$a==3
i_(xx$a==4, ...), # dots meaning actions for xx$a==4
e_(...) # dots meaning actions for any other case
)
Problem 2 & Question
The given code apparently solves the problem. Then, I wrote the following test function:
test.ie <- function() {
  dd <- data.frame(a=c(1,2,1,3), b=1:4)
  if.else_(
    i_(dd$a==1, dd$b),
    i_(dd$a==2, dd$b*100),
    e_(-dd$b)
  ) # it should give c(1, 200, 3, -4)
}
When I tried the test:
test.ie()
it spit the following error message:
Error in ifelse(dd$a == 1, dd$b, ifelse(dd$a == 2, dd$b * 100, -dd$b)) :
object 'dd' not found
Question
Since the if.else_() syntactic constructor is not supposed to run only from the console, is there a way for it to 'know' the variables from the function that calls it?
Note
In "Best way to replace a lengthy ifelse structure in R", a similar problem was posted. However, the given solution there focuses on building the table's new column with the given constant output values (the "then" or "else" slots of the ifelse() function), whereas my case addresses a syntactic problem in which the "then" or "else" slots can even be expressions in terms of other data.frame elements or variables.
I think you can use dplyr::case_when inside dplyr::mutate to achieve this.
library(dplyr)
df <- tibble(a=c(1,2,1,3), b=1:4)
df %>%
  mutate(
    foo = case_when(
      .$a == 1 ~ .$b,
      .$a == 2 ~ .$b * 100L,
      TRUE ~ .$b * -1L
    )
  )
#> # A tibble: 4 x 3
#> a b foo
#> <dbl> <int> <int>
#> 1 1 1 1
#> 2 2 2 200
#> 3 1 3 3
#> 4 3 4 -4
In the upcoming release of dplyr 0.6.0 you won't need the awkward work-around of .$, and you can just use:
df %>%
  mutate(
    foo = case_when(
      a == 1 ~ b,
      a == 2 ~ b * 100L,
      TRUE ~ b * -1L
    )
  )
Taking into account MrFlick's advice, I re-coded the if.else_() function as follows:
if.else_ <- function(...) {
  args <- list(...)
  for (i in 1:(length(args) - 1)) {
    if (substr(args[[i]], 1, 6) != "ifelse") {
      stop("All but the last argument need to be if.then_ functions.", call. = FALSE)
    }
  }
  if (substr(args[[length(args)]], 1, 6) == "ifelse") {
    stop("Last argument needs to be an else_ function.", call. = FALSE)
  }
  args$final <- paste(rep(')', length(args) - 1), collapse = '')
  eval_string <- do.call('paste', args)
  eval(parse(text = eval_string), envir = parent.frame())
}
Now the test.ie() function runs properly
test.ie()
[1] 1 200 3 -4
With full respect to the OP's remarkable effort to improve nested ifelse(), I prefer a different approach which I believe is easy to write, concise, maintainable and fast:
xx <- data.frame(a=c(1L,2L,1L,3L), b=1:4)
library(data.table)
# coerce to data.table, and set the default first
setDT(xx)[, c:= -b]
xx[a == 1L, c := b] # 1st special case
xx[a == 2L, c := 100L*b] # 2nd special case, note use of integer 100L
# xx[a == 3L, c := ...] # other cases
# xx[a == 4L, c := ...]
#...
xx
# a b c
#1: 1 1 1
#2: 2 2 200
#3: 1 3 3
#4: 3 4 -4
Note that for the 2nd special case b is multiplied by the integer constant 100L to make sure that the right hand sides are all of type integer in order to avoid type conversion to double.
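A quick console check makes the point (a sketch; class() just reports the storage type):
class(xx$b * 100L) # "integer": integer times integer stays integer
class(xx$b * 100)  # "numeric": a double constant promotes the result to double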
Edit 2: This can also be written in an even more concise (but still maintainable) way as a one-liner:
setDT(xx)[, c:= -b][a == 1L, c := b][a == 2L, c := 100L*b][]
data.table chaining works here, because c is updated in place so that subsequent expressions are acting on all rows of xx even if the previous expression was a selective update of a subset of rows.
Edit 1: This approach can be implemented with base R as well:
xx <- data.frame(a=c(1L,2L,1L,3L), b=1:4)
xx$c <- -xx$b
idx <- xx$a == 1L; xx$c[idx] <- xx$b[idx]
idx <- xx$a == 2L; xx$c[idx] <- 100 * xx$b[idx]
xx
# a b c
#1 1 1 1
#2 2 2 200
#3 1 3 3
#4 3 4 -4

Return multiple results of column-to-matrix operations within a data.table

I have a data.table with multiple categorical variables for which I would like to create contrast (or "dummy") variables along with many more numerical variables which I would like to simply pass by reference.
Example dataset:
library('data.table')
d <- data.table(1:3, # there are lots of numerics, so I want to avoid copying
letters[1:3], # convert these to factor then dummy variable
10:12,
LETTERS[24:26])
# >d
# V1 V2 V3 V4
# 1: 1 a 10 X
# 2: 2 b 11 Y
# 3: 3 c 12 Z
The desired result looks like:
>dummyDT(d)
V1 V3 V2.b V2.c V4.Y V4.Z
1: 1 10 0 0 0 0
2: 2 11 1 0 1 0
3: 3 12 0 1 0 1
which can be produced with:
# this does what I want but is slow and inelegant and not idiomatic data.table
categorToMatrix <- function(x, name_prefix='Var'){
  # set levels in order of appearance to avoid default re-sort by alpha
  m <- contrasts(factor(x, levels=unique(x)))
  dimnames(m) <- list(NULL, paste(name_prefix, colnames(m), sep='.'))
  m
}
dummyDT <- function(d){
  toDummy <- which(sapply(d, function(x) is.factor(x) | is.character(x)))
  if (length(toDummy) > 0) {
    dummyComponent <-
      data.table(
        do.call(cbind, lapply(toDummy, function(j) {
          categorToMatrix(d[[j]], name_prefix = names(d)[j])
        }))
      )
    asIs <- (1:ncol(d))[-toDummy]
    if (length(asIs) > 0) {
      allCols <- cbind(d[, asIs, with=FALSE], dummyComponent)
    } else allCols <- dummyComponent
  } else allCols <- d
  return(allCols)
}
(I do not care about maintaining original column ordering.)
In addition to the above, I have tried the approach of splitting each matrix into a list of columns, as in:
# split a matrix into list of columns and keep track of column names
# expanded from #Tommy's answer at: https://stackoverflow.com/a/6821395/2573061
splitMatrix <- function(m){
  setNames(lapply(seq_len(ncol(m)), function(j) m[, j]), colnames(m))
}
# Example:
splitMatrix(categorToMatrix(d$V2, name_prefix='V2'))
# $V2.b
# [1] 0 1 0
#
# $V2.c
# [1] 0 0 1
which works for an individual column, but then when I try to lapply to multiple columns, these lists get somehow coerced into string-rows and recycled, which is baffling me:
dummyDT2 <- function(d){
  stopifnot(inherits(d, 'data.table'))
  toDummy <- which(sapply(d, function(x) is.factor(x) | is.character(x)))
  if (length(toDummy) > 0) {
    dummyComponent <- d[, lapply(.SD, function(x) splitMatrix(categorToMatrix(x))),
                        .SDcols = toDummy]
    asIs <- (1:ncol(d))[-toDummy]
    if (length(asIs) > 0) {
      allCols <- cbind(d[, asIs, with=FALSE], dummyComponent)
    } else allCols <- dummyComponent
  } else allCols <- d
  return(allCols)
}
dummyDT2(d)
# V1 V3 V2
# 1: 1 10 0,1,0
# 2: 2 11 0,0,1
# 3: 3 12 0,1,0
# Warning message:
# In data.table::data.table(...) :
# Item 2 is of size 2 but maximum size is 3 (recycled leaving remainder of 1 items)
I then tried wrapping splitMatrix with data.table() and got an amusingly laconic error message.
I know that functions like caret::dummyVars exist for data.frame. I am trying to create a data.table optimized version.
Closely related question: How to one-hot-encode factor variables with data.table?
But there are two differences: I do not want full-rank dummy variables (because I'm using this for regression) but rather contrast variables (n-1 of these for n levels) and I have multiple numeric variables that I do not want to OHE.
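For a concrete illustration of the contrast-variable point, R's default treatment contrasts drop the first level, giving n-1 columns for n levels:
contrasts(factor(c("a","b","c")))
#   b c
# a 0 0
# b 1 0
# c 0 1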

Aggregate over categories that contain NAs with ddply and lapply?

I would like to aggregate a data.frame over 3 categories, with one of them varying. Unfortunately this one varying category contains NAs (actually it's the reason why it needs to vary). Thus I created a list of data.frames. Every data.frame within this list contains only complete cases with respect to three variables (with only one of them changing).
Let's reproduce this:
library(plyr)
mydata <- warpbreaks
names(mydata) <- c("someValue","group","size")
mydata$category <- c(1,2,3)
mydata$categoryA <- c("A","A","X","X","Z","Z")
# add some NA
mydata$category[c(8,10,19)] <- NA
mydata$categoryA[c(14,1,20)] <- NA
# create a list of TRUE/FALSE vectors
noNAList <- function(vec){
  res <- !is.na(vec)
  return(res)
}
testTF <- lapply(mydata[,c("category","categoryA")], noNAList)
# create a list of data.frames
selectDF <- function(TFvec){
  res <- mydata[TFvec,]
  return(res)
}
# check x and see that it may contain NAs as long
# as it's not in one of the 3 categories I want to aggregate over
x <- lapply(testTF, selectDF)
## let's ddply get to work
doddply <- function(df){
  ddply(df, .(group, size), summarize, sumTest = sum(someValue))
}
y <- lapply(x, doddply); y
y comes very close to what I want to get
$category
group size sumTest
1 A L 375
2 A M 198
3 A H 185
4 B L 254
5 B M 259
6 B H 169
$categoryA
group size sumTest
1 A L 375
2 A M 204
3 A H 200
4 B L 254
5 B M 259
6 B H 169
But I need to implement aggregation over a third, varying variable, which in this case is category or categoryA, just like:
group size category sumTest sumTestTotal
1 A H 1 46 221
2 A H 2 46 221
3 A H 3 93 221
and so forth. How can I add names(x) to lapply, or do I need a loop or environment here?
EDIT:
Note that I want EITHER category OR categoryA added to the mix. In reality I have about 15 mutually exclusive categorical vars.
I think you might be making this really hard on yourself, if I understand your question correctly.
If you want to aggregate the data.frame 'myData' by three (or four) variables, you would simply do this:
aggregate(someValue ~ group + size + category + categoryA, sum, data=mydata)
group size category categoryA someValue
1 A L 1 A 51
2 B L 1 A 19
3 A M 1 A 17
4 B M 1 A 63
aggregate will automatically remove rows that include NA in any of the categories. If someValue is sometimes NA, then you can add the parameter na.rm=T.
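For example (a sketch), if someValue itself contained NAs, you could pass na.rm through to sum:
aggregate(someValue ~ group + size + category + categoryA, FUN = sum, data = mydata, na.rm = TRUE)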
I also noted that you put a lot of unnecessary code into functions. For example:
# create a list of data.frames
selectDF <- function(TFvec){
  res <- mydata[TFvec,]
  return(res)
}
Can be written like:
selectDF <- function(TFvec) mydata[TFvec,]
Also, using lapply to create a list of two data frames without the NA is overkill. Try this code:
x = list(mydata[!is.na(mydata$category),],mydata[!is.na(mydata$categoryA),])
I know the question explicitly requests a ddply()/lapply() solution.
But ... if you are willing to come on over to the dark side, here is a data.table()-based function that should do the trick:
# Convert mydata to a data.table
library(data.table)
dt <- data.table(mydata, key = c("group", "size"))
# Define workhorse function
myfunction <- function(dt, VAR) {
  E <- as.name(substitute(VAR))
  dt[i = !is.na(eval(E)),
     j = {n <- sum(.SD[, someValue])
          .SD[, list(sumTest = sum(someValue),
                     sumTestTotal = n,
                     share = sum(someValue)/n),
              by = VAR]
         },
     by = key(dt)]
}
# Test it out
s1 <- myfunction(dt, "category")
s2 <- myfunction(dt, "categoryA")
ADDED ON EDIT
Here's how you could run this for a vector of different categorical variables:
catVars <- c("category", "categoryA")
ll <- lapply(catVars,
             FUN = function(X) {
               do.call(myfunction, list(dt, X))
             })
names(ll) <- catVars
lapply(ll, head, 3)
# $category
# group size category sumTest sumTestTotal share
# [1,] A H 2 46 185 0.2486486
# [2,] A H 3 93 185 0.5027027
# [3,] A H 1 46 185 0.2486486
#
# $categoryA
# group size categoryA sumTest sumTestTotal share
# [1,] A H A 79 200 0.395
# [2,] A H X 68 200 0.340
# [3,] A H Z 53 200 0.265
Finally, I found a solution that might not be as slick as Josh's, but it works without any dark forces (data.table). You may laugh: here's my reproducible example using the same sample data as in the question.
qual <- c("category","categoryA")
# get T / F vectors
noNAList <- function(vec){
  res <- !is.na(vec)
  return(res)
}
selectDF <- function(TFvec) mydata[TFvec,]
NAcheck <- lapply(mydata[,qual], noNAList)
# create a list of data.frames
listOfDf <- lapply(NAcheck, selectDF)
workhorse <- function(charVec, listOfDf){
  # an environment holding one data.frame per categorical variable
  dfs <- list2env(listOfDf, envir = new.env())
  # create expression list, one ddply() call per variable in charVec
  exlist <- list()
  for (i in 1:length(charVec)){
    exlist[[charVec[i]]] <- parse(text=paste("ddply(", charVec[i],
      ", .(group,size,", charVec[i], "), summarize, sumTest = sum(someValue))",
      sep=""))
  }
  res <- lapply(exlist, eval, envir=dfs)
  return(res)
}
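A usage sketch, assuming the objects built above:
y2 <- workhorse(qual, listOfDf)
names(y2)  # "category" "categoryA": one summary per categorical variable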
Is this more like what you mean? I find your example extremely difficult to understand. In the code below, the method can take any column and then aggregate by it. It can return multiple aggregation functions of someValue. I then find all the column names you would like to aggregate by and apply the function to that vector.
# Build a method to aggregate by column.
agg.by.col = function(column) {
  by.list = list(mydata$group, mydata$size, mydata[,column])
  names(by.list) = c('group', 'size', column)
  aggregate(mydata$someValue, by=by.list, function(x) c(sum=sum(x), mean=mean(x)))
}
# Find all the column names you want to aggregate by
cols = names(mydata)[!(names(mydata) %in% c('someValue','group','size'))]
# Apply the method to each column name.
lapply (cols, agg.by.col)

Creating a function to replace NAs from one data.frame with values from another

I regularly have situations where I need to replace missing values from a data.frame with values from some other data.frame that is at a different level of aggregation. So, for example, if I have a data.frame full of county data I might replace NA values with state values stored in another data.frame. After writing the same merge... ifelse(is.na()) yada yada a few dozen times I decided to break down and write a function to do this.
Here's what I cooked up, along with an example of how I use it:
fillNaDf <- function(naDf, fillDf, mergeCols, fillCols){
  mergedDf <- merge(naDf, fillDf, by=mergeCols)
  for (col in fillCols){
    colWithNas <- mergedDf[[paste(col, "x", sep=".")]]
    colWithOutNas <- mergedDf[[paste(col, "y", sep=".")]]
    k <- which(is.na(colWithNas))
    colWithNas[k] <- colWithOutNas[k]
    mergedDf[col] <- colWithNas
    mergedDf[[paste(col, "x", sep=".")]] <- NULL
    mergedDf[[paste(col, "y", sep=".")]] <- NULL
  }
  return(mergedDf)
}
## test case
fillDf <- data.frame(a = c(1,2,1,2), b = c(3,3,4,4) ,f = c(100,200, 300, 400), g = c(11, 12, 13, 14))
naDf <- data.frame( a = sample(c(1,2), 100, rep=TRUE), b = sample(c(3,4), 100, rep=TRUE), f = sample(c(0,NA), 100, rep=TRUE), g = sample(c(0,NA), 200, rep=TRUE) )
fillNaDf(naDf, fillDf, mergeCols=c("a","b"), fillCols=c("f","g") )
So after I got this running I had this odd feeling that someone has probably solved this problem before me, and in a much more elegant way. Is there a better/easier/faster solution to this problem? Also, is there a way to eliminate the loop in the middle of my function? That loop is there because I am often replacing NAs in more than one column. And, yes, the function assumes the columns we're filling from and the columns we're filling to are named the same, and the same applies to the merge columns.
Any guidance or refactoring would be helpful.
EDIT on Dec 2: I realized I had logic flaws in my example, which I fixed.
What a great question.
Here's a data.table solution:
# Convert data.frames to data.tables (i.e. data.frames with extra powers;)
library(data.table)
fillDT <- data.table(fillDf, key=c("a", "b"))
naDT <- data.table(naDf, key=c("a", "b"))
# Merge data.tables, based on their keys (columns a & b)
outDT <- naDT[fillDT]
# a b f g f.1 g.1
# [1,] 1 3 NA 0 100 11
# [2,] 1 3 NA NA 100 11
# [3,] 1 3 NA 0 100 11
# [4,] 1 3 0 0 100 11
# [5,] 1 3 0 NA 100 11
# First 5 rows of 200 printed.
# In outDT[i, j], on the following two lines:
# -- i is a Boolean vector indicating which rows will be operated on
# -- j is an expression saying "(sub)assign from the right column (e.g. f.1)
#    to the left column (e.g. f)"
outDT[is.na(f), f:=f.1]
outDT[is.na(g), g:=g.1]
# Just keep the four columns ultimately needed
outDT <- outDT[,list(a,b,g,f)]
# a b g f
# [1,] 1 3 0 0
# [2,] 1 3 11 0
# [3,] 1 3 0 0
# [4,] 1 3 11 0
# [5,] 1 3 11 0
# First 5 rows of 200 printed.
Here's a slightly more concise/robust version of your approach. You could replace the for-loop with a call to lapply, but I find the loop easier to read.
This function assumes any columns not in mergeCols are fair game to have their NAs filled. I'm not really sure this helps, but I'll take my chances with the voters.
fillNaDf.ju <- function(naDf, fillDf, mergeCols) {
  mergedDf <- merge(fillDf, naDf, by=mergeCols, suffixes=c(".fill",""))
  dataCols <- setdiff(names(naDf), mergeCols)
  # loop over all columns we didn't merge by
  for (col in dataCols) {
    rows <- is.na(mergedDf[,col])
    # skip this column if it doesn't contain any NAs
    if (!any(rows)) next
    rows <- which(rows)
    # replace NAs with values from fillDf
    mergedDf[rows,col] <- mergedDf[rows,paste(col,"fill",sep=".")]
  }
  # don't return ".fill" columns
  mergedDf[,names(naDf)]
}
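Usage mirrors the original function, minus the fillCols argument (a sketch with the question's test data):
filled <- fillNaDf.ju(naDf, fillDf, mergeCols=c("a","b"))
head(filled)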
My preference would be to pull out the code from merge that does the matching and do it myself, so that I could keep the ordering of the original data frame intact, both row-wise and column-wise. I also use matrix indexing to avoid any loops, though to do so I create a new data frame with the revised fillCols and replace the columns of the original with it; I thought I could fill it in directly, but apparently you can't use matrix indexing to replace parts of a data.frame, so I wouldn't be surprised if a loop over the names were faster in some situations.
With matrix indexing:
fillNaDf <- function(naDf, fillDf, mergeCols, fillCols) {
  fillB <- do.call(paste, c(fillDf[, mergeCols, drop = FALSE], sep="\r"))
  naB <- do.call(paste, c(naDf[, mergeCols, drop = FALSE], sep="\r"))
  na.ind <- is.na(naDf[,fillCols])
  fill.ind <- cbind(match(naB, fillB)[row(na.ind)[na.ind]], col(na.ind)[na.ind])
  naX <- naDf[,fillCols]
  fillX <- fillDf[,fillCols]
  naX[na.ind] <- fillX[fill.ind]
  naDf[,colnames(naX)] <- naX
  naDf
}
With a loop:
fillNaDf2 <- function(naDf, fillDf, mergeCols, fillCols) {
  fillB <- do.call(paste, c(fillDf[, mergeCols, drop = FALSE], sep="\r"))
  naB <- do.call(paste, c(naDf[, mergeCols, drop = FALSE], sep="\r"))
  m <- match(naB, fillB)
  for (col in fillCols) {
    fix <- which(is.na(naDf[,col]))
    naDf[fix, col] <- fillDf[m[fix], col]
  }
  naDf
}
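A quick usage sketch with the question's test data; both versions should fill the same cells while preserving naDf's row order (the identical() check is an expectation, not a guarantee):
out1 <- fillNaDf(naDf, fillDf, mergeCols=c("a","b"), fillCols=c("f","g"))
out2 <- fillNaDf2(naDf, fillDf, mergeCols=c("a","b"), fillCols=c("f","g"))
identical(out1, out2)  # expected TRUE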
