Delete consecutive empty rows in R

The data frame df presents possible name matches. Each pair of matches should be separated by a single empty row. However, in some cases my output includes several empty rows between the matching pairs:
df <- data.frame(id = c(1, 2, NA, 3, 4, NA, NA, NA, 5, 6, NA),
                 name = c("john jones", "john joners", NA, "clara prat", "klara prat",
                          NA, NA, NA, "alan turing", "allan turing", NA),
                 stringsAsFactors = FALSE)
> df
   id         name
1   1   john jones
2   2  john joners
3  NA         <NA>
4   3   clara prat
5   4   klara prat
6  NA         <NA>
7  NA         <NA>
8  NA         <NA>
9   5  alan turing
10  6 allan turing
11 NA         <NA>
The desired output is:
> df
  id         name
1  1   john jones
2  2  john joners
3 NA         <NA>
4  3   clara prat
5  4   klara prat
6 NA         <NA>
7  5  alan turing
8  6 allan turing
9 NA         <NA>
I can do this with a for loop, which I understand is less than optimal.

Perhaps this helps
v1 <- rowSums(!is.na(df))
df[unlist(lapply(split(seq_along(v1), cumsum(c(1, diff(!v1)) < 0)),
                 function(i) i[seq(which.max(v1[i] == 0))])), ]
#    id         name
# 1   1   john jones
# 2   2  john joners
# 3  NA         <NA>
# 4   3   clara prat
# 5   4   klara prat
# 6  NA         <NA>
# 9   5  alan turing
# 10  6 allan turing
# 11 NA         <NA>
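To see why this works, here are the intermediate values for this df (worked out by hand as an illustration; not part of the original answer):
v1                            # 2 2 0 2 2 0 0 0 2 2 0  (non-NA count per row)
cumsum(c(1, diff(!v1)) < 0)   # 0 0 0 1 1 1 1 1 2 2 2  (block id; each block ends with an NA run)
## within each block, rows are kept up to and including the first all-NA row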

Here is another approach, using rle to look for runs of missing values:
miss <- rowSums(is.na(df))
# get runs of missing
r <- rle(miss)
r$values <- seq_along(r$values)
# subset data, removing rows when all columns are missing
# and rows sequentially missing
df[!(miss == ncol(df) & duplicated(inverse.rle(r))), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
As mentioned by Akrun, you can use data.table::rleid to avoid some of the explicit rle calculations:
df[!(rowSums(is.na(df)) == ncol(df) & duplicated(data.table::rleid(is.na(df[[1]])))) , ]
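For this df the pieces evaluate as follows (worked out by hand as an illustration):
is.na(df[[1]])                     # F F T F F T T T F F T
data.table::rleid(is.na(df[[1]]))  # 1 1 2 3 3 4 4 4 5 5 6
## duplicated() flags the 2nd+ row of each run, so only rows that are both
## all-NA and not the first row of their NA run (rows 7 and 8) are dropped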

Using the IRanges package.
df <- data.frame(id = c(1, 2, NA, 3, 4, NA, NA, NA, 5, 6, NA),
                 name = c("john jones", "john joners", NA, "clara prat", "klara prat",
                          NA, NA, NA, "alan turing", "allan turing", NA),
                 stringsAsFactors = FALSE)
library(IRanges)
## row numbers where every column is NA
na.rs <- which(is.na(df$id) & is.na(df$name))
## collapse consecutive row numbers into ranges
na.rs.re <- reduce(IRanges(na.rs, na.rs))
## runs longer than one row contain redundant empty rows
na.rs.rm <- na.rs.re[width(na.rs.re) > 1]
## keep the first empty row of each run; drop the rest
start(na.rs.rm) <- start(na.rs.rm) + 1
df[-as.integer(na.rs.rm), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>

Surely not the best solution, but easy to follow:
## walk down the rows, deleting a row whenever it and the row above
## are both empty; the index only advances when nothing was deleted
i <- 2
while (i <= nrow(df)) {
  if (is.na(df$id[i - 1]) && is.na(df$id[i])) {
    df <- df[-i, ]
  } else {
    i <- i + 1
  }
}


Complex join of longitudinal tables in R

I have ~16 .txt files that I need to turn into one wide, flat file. For each new file, time has passed and some new variables have been added. What I would like to do is append those new columns to the right side of the first table, joining by an identification variable. This gets complicated quickly, so here is an MRE:
library(dplyr)
id <- as.character(1:6)
first <- c("jeff", "jimmy", "andrew", "taj", "karl-anthony", "jamal")
last <- c("teague", "butler", "wiggins", "gibson", "towns", "crawford")
set.seed(1839)
a <- c(1:4, NA, NA)
b <- c(1:4, NA, NA)
c <- c(11:13, NA, 14, NA)
d <- c(11:13, NA, 14, NA)
e <- c(21, 22, NA, 24, NA, 26)
f <- c(21, 22, NA, 24, NA, 26)
Simulating the three different files:
df_1 <- data.frame(
  id = id[c(1:3, 5)],
  first = first[c(1:3, 5)],
  last = last[c(1:3, 5)],
  a = a[c(1:3, 5)],
  b = b[c(1:3, 5)]
)
df_2 <- data.frame(
  id = id[c(1:3, 5)],
  first = first[c(1:3, 5)],
  last = last[c(1:3, 5)],
  c = c[c(1:3, 5)],
  d = d[c(1:3, 5)]
)
df_3 <- data.frame(
  id = id[c(1, 2, 4, 6)],
  first = first[c(1, 2, 4, 6)],
  last = last[c(1, 2, 4, 6)],
  e = e[c(1, 2, 4, 6)],
  f = f[c(1, 2, 4, 6)]
)
df_goal <- data.frame(id, first, last, a, b, c, d, e, f)
df_goal is what I want, and here is what it looks like:
> df_goal
  id        first     last  a  b  c  d  e  f
1  1         jeff   teague  1  1 11 11 21 21
2  2        jimmy   butler  2  2 12 12 22 22
3  3       andrew  wiggins  3  3 13 13 NA NA
4  4          taj   gibson  4  4 NA NA 24 24
5  5 karl-anthony    towns NA NA 14 14 NA NA
6  6        jamal crawford NA NA NA NA 26 26
Note that these are very big files, and the columns are not always in the right order, so I cannot just say to join by keeping the first three columns.
If I do a full_join on all, I get the names repeated every time:
df_all <- df_1 %>%
  full_join(df_2, by = "id") %>%
  full_join(df_3, by = "id")
> df_all
id first.x last.x a b first.y last.y c d first last e f
1 1 jeff teague 1 1 jeff teague 11 11 jeff teague 21 21
2 2 jimmy butler 2 2 jimmy butler 12 12 jimmy butler 22 22
3 3 andrew wiggins 3 3 andrew wiggins 13 13 <NA> <NA> NA NA
4 5 karl-anthony towns NA NA karl-anthony towns 14 14 <NA> <NA> NA NA
5 4 <NA> <NA> NA NA <NA> <NA> NA NA taj gibson 24 24
6 6 <NA> <NA> NA NA <NA> <NA> NA NA jamal crawford 26 26
What I tried next: I wrote a for loop that took each data frame, selected (a) the id column and (b) the columns whose names had not yet appeared in the accumulated data frame, and (c) did a full_join:
dfs <- c("df_2", "df_3")
df_all1 <- df_1
for (i in dfs) {
  df_all1 <- get(i)[!names(get(i)) %in% names(df_all1)[-1]] %>%
    full_join(df_all1, .)
}
> df_all1
id first last a b c d e f
1 1 jeff teague 1 1 11 11 21 21
2 2 jimmy butler 2 2 12 12 22 22
3 3 andrew wiggins 3 3 13 13 NA NA
4 5 karl-anthony towns NA NA 14 14 NA NA
5 4 <NA> <NA> NA NA NA NA 24 24
6 6 <NA> <NA> NA NA NA NA 26 26
Note that this means the cases that did not appear in the first file are missing the names (these represent key demographic variables in my data). I also tried going through row by row, doing a column join if the id was already present and a bind_rows if it was not. This code threw an error:
df_all2 <- df_1
for (i in dfs) {
  for (k in 1:nrow(get(i))) {
    if (get(i)[k, "id"] %in% df_all2$id) {
      df_all2 <- get(i)[k, !names(get(i)) %in% names(df_all2)[-1]] %>%
        left_join(df_all2, ., by = "id")
    } else {
      df_all2 <- bind_rows(
        df_all2,
        get(i)[k, !names(get(i)) %in% names(df_all2)[-1]]
      )
    }
  }
}
There has got to be a way to do a join with only select columns, but fill in missing information if necessary. Again, I am working with lots of files with lots of columns, so I cannot assume that I know the position of any columns; it has to be done by the column names.
I have also thought about just including a new variable that is the date of the file, stacking them all on top of one another ("long" format), and then using tidyr::spread and tidyr::gather, but I haven't found a solution yet.
I am not wedded to the tidyverse (base or data.table would be great, even some way to do a SQL join in R) or even R; I am open to a Python solution using pandas, as well.
Short version: How do I join new columns to an existing data set by identification number, while also filling in the values of the already-seen columns (here, the names) for cases that are new?
Possible solution, per Psidom:
df_all1 <- df_1
for (i in dfs) {
  df_all1 <- get(i) %>%
    full_join(
      df_all1, .,
      by = names(get(i))[names(get(i)) %in% names(df_all1)]
    )
}
df_all1
Is there a more efficient way to do this, though?
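If there are more files, the same idea can be written without a loop. A sketch along the lines of Psidom's suggestion (not code from the original thread), joining each pair of tables on whatever columns they share:
Reduce(function(x, y) full_join(x, y, by = intersect(names(x), names(y))),
       list(df_1, df_2, df_3))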
Using melt, once you have df_all from the full_join above:
library(data.table)
df <- melt(setDT(df_all),
           measure.vars = patterns("^first", "^last"))
df <- unique(df[, -c("id", "variable")])
df[!is.na(df$value1), ]
a b c d e f value1 value2
1: 1 1 11 11 21 21 jeff teague
2: 2 2 12 12 22 22 jimmy butler
3: 3 3 13 13 NA NA andrew wiggins
4: NA NA 14 14 NA NA karl-anthony towns
5: NA NA NA NA 24 24 taj gibson
6: NA NA NA NA 26 26 jamal crawford
The simplest solution using dplyr is to omit the by parameter in the calls to full_join().
library(dplyr)
df_1 %>%
  full_join(df_2) %>%
  full_join(df_3)
Joining, by = c("id", "first", "last")
Joining, by = c("id", "first", "last")
id first last a b c d e f
1 1 jeff teague 1 1 11 11 21 21
2 2 jimmy butler 2 2 12 12 22 22
3 3 andrew wiggins 3 3 13 13 NA NA
4 5 karl-anthony towns NA NA 14 14 NA NA
5 4 taj gibson NA NA NA NA 24 24
6 6 jamal crawford NA NA NA NA 26 26
Warning messages:
1: Column id joining factors with different levels, coercing to character vector
2: Column first joining factors with different levels, coercing to character vector
3: Column last joining factors with different levels, coercing to character vector
The documentation of the by parameter in ?full_join says: If NULL, the default, *_join() will do a natural join, using all variables with common names across the two tables.
So this is equivalent to explicitly passing by = c("id", "first", "last"), as proposed by Psidom.
If there are many data frames to join, the code below may save a lot of typing:
Reduce(full_join, list(df_1, df_2, df_3))
The result (including messages) is the same as above.
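The question also mentions base R and SQL-style joins: base merge() behaves the same way, defaulting to a natural join on all shared column names, with all = TRUE giving a full outer join (a sketch, not from the original answers):
Reduce(function(x, y) merge(x, y, all = TRUE), list(df_1, df_2, df_3))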

Merge messy data frames in R

I have two data frames:
df1=data.frame(Col1=c('2','4','CN','CANADA',NA),Col2=c('s1','s2','s3','s4','s5'))
> df1
Col1 Col2
1 2 s1
2 4 s2
3 CN s3
4 CANADA s4
5 <NA> s5
df2 = data.frame(index = 1:5, code = c('AB','CA','US','CN','UK'),
                 name = c('ALBERTA','CANADA','USA','CHINA','UK'),
                 REGION = c('NA','NA','NA','FE','EU'))
> df2
index code name REGION
1 1 AB ALBERTA NA
2 2 CA CANADA NA
3 3 US USA NA
4 4 CN CHINA FE
5 5 UK UK EU
I want:
df3 = data.frame(df1, code = c('CA','CN','CN','CA',NA),
                 name = c('CANADA','CHINA','CHINA','CANADA',NA),
                 REGION = c('NA','FE','FE','NA',NA))
Col1 Col2 code name REGION
1 2 s1 CA CANADA NA
2 4 s2 CN CHINA FE
3 CN s3 CN CHINA FE
4 CANADA s4 CA CANADA NA
5 <NA> s5 <NA> <NA> <NA>
I have tried calling it by values:
df1$code = df2[df2$index[df1$Col1], 2]
which fills it in incorrectly, and merging twice:
m1 = merge(df1, df2, by.x = 'Col1', by.y = 'index', all.x = TRUE)
m2 = merge(m1, df2, by.x = 'Col1', by.y = 'name', all.x = TRUE)
I am sure I am missing something here. Thanks for your help.
Maybe not a very nice solution, but it works for this example:
ind <- sapply(df1$Col1, function(x)
  which(df2[, c("index", "code", "name")] == as.character(x), arr.ind = TRUE)[1])
cbind(df1, df2[ind, ])
Col1 Col2 index code name REGION
2 2 s1 2 CA CANADA NA
4 4 s2 4 CN CHINA FE
4.1 CN s3 4 CN CHINA FE
2.1 CANADA s4 2 CA CANADA NA
NA <NA> s5 NA <NA> <NA> <NA>
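For this data the lookup resolves as follows (worked out by hand as an illustration); which(..., arr.ind = TRUE)[1] takes the first matching row, scanning the index, code, and name columns in that order:
ind
# [1]  2  4  4  2 NA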
As far as I understand the problem, Col1 of df1 contains mixed information. So my approach would be to separate the different data types. Then it should be easy to merge correctly.
chr <- as.character(df1$Col1)
## digit-only strings are index values into df2
index_df1 <- chr
index_df1[!grepl("^[0-9]*$", chr)] <- NA
index_df1 <- as.numeric(index_df1)
## exactly two capital letters are country codes
code_df1 <- chr
code_df1[!grepl("^[A-Z]{2}$", chr)] <- NA
## three or more capital letters are country names
name_df1 <- chr
name_df1[!grepl("^[A-Z]{3,}$", chr)] <- NA
df1 <- data.frame(df1, index_df1, code_df1, name_df1)
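The answer stops short of the actual merge. One possible continuation (an untested sketch, not part of the original answer) is to look up each separated key with match() and take the first non-missing hit per row:
## look up each key type against the matching df2 column
by_index <- df2[match(df1$index_df1, df2$index), c("code", "name", "REGION")]
by_code  <- df2[match(df1$code_df1,  df2$code),  c("code", "name", "REGION")]
by_name  <- df2[match(df1$name_df1,  df2$name),  c("code", "name", "REGION")]
## take the first lookup that produced a match, row by row
matched <- by_index
matched[is.na(matched$code), ] <- by_code[is.na(matched$code), ]
matched[is.na(matched$code), ] <- by_name[is.na(matched$code), ]
df3 <- data.frame(df1[c("Col1", "Col2")], matched, row.names = NULL)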

Reshape data frame with different column lengths into two columns replicating column ID

I have the following data frame, with different row lengths:
myvar <- as.data.frame(rbind(c("Walter","NA","NA","NA","NA"),
                             c("Walter","NA","NA","NA","NA"),
                             c("Walter","Jesse","NA","NA","NA"),
                             c("Gus","Tuco","Mike","NA","NA"),
                             c("Gus","Mike","Hank","Saul","Flynn")))
ID <- as.factor(c(1:5))
data.frame(ID, myvar)
ID V1 V2 V3 V4 V5
1 Walter NA NA NA NA
2 Walter NA NA NA NA
3 Walter Jesse NA NA NA
4 Gus Tuco Mike NA NA
5 Gus Mike Hank Saul Flynn
My goal is to reshape this data frame into a two-column data frame. The first column would be the ID and the other one would be the character name. Note that each ID must correspond to the row where the character was originally placed. I'm expecting the following result:
ID V
1 Walter
2 Walter
3 Walter
3 Jesse
4 Gus
4 Tuco
4 Mike
5 Gus
5 Mike
5 Hank
5 Saul
5 Flynn
I've tried dcast from reshape2, but it doesn't return what I need. It is noteworthy that my original data frame is quite big. Any tips? Cheers.
You could use unlist:
res <- subset(data.frame(ID, value = unlist(myvar[-1], use.names = FALSE)),
              value != 'NA')
res
# ID value
#1 1 Walter
#2 2 Walter
#3 3 Walter
#4 4 Gus
#5 5 Gus
#6 3 Jesse
#7 4 Tuco
#8 5 Mike
#9 4 Mike
#10 5 Hank
#11 5 Saul
#12 5 Flynn
NOTE: The NAs are 'character' elements in this dataset. It is better to create the data without quotes so that they are real NAs, which can then be removed with na.omit, is.na, complete.cases, etc.
data
## 'ID' and 'myvar' as created in the question, combined into one data frame
myvar <- data.frame(ID, myvar)

Using base reshape. (I'm converting your "NA" character strings to real NA, which you may not have to do; this is just due to how this example was created.)
df <- myvar  ## the combined data frame from above
df[df == 'NA'] <- NA
na.omit(reshape(df, direction = 'long', varying = list(2:6))[, c('ID','V1')])
# ID V1
# 1.1 1 Walter
# 2.1 2 Walter
# 3.1 3 Walter
# 4.1 4 Gus
# 5.1 5 Gus
# 3.2 3 Jesse
# 4.2 4 Tuco
# 5.2 5 Mike
# 4.3 4 Mike
# 5.3 5 Hank
# 5.4 5 Saul
# 5.5 5 Flynn
Or using reshape2:
library(reshape2)
## na.omit(melt(df, id.vars = 'ID')[, c('ID','value')])
## or better yet, as Ananda suggests:
melt(df, id.vars = 'ID', na.rm = TRUE)[, c('ID','value')]
# ID value
# 1 1 Walter
# 2 2 Walter
# 3 3 Walter
# 4 4 Gus
# 5 5 Gus
# 8 3 Jesse
# 9 4 Tuco
# 10 5 Mike
# 14 4 Mike
# 15 5 Hank
# 20 5 Saul
# 25 5 Flynn
You get warnings that the factor levels are not the same across the columns, but that's fine.
Fix your "NA" so that they are actually NA first:
mydf[mydf == "NA"] <- NA
Using some subsetting to do it all in one fell swoop:
data.frame(ID=mydf$ID[row(mydf[-1])[!is.na(mydf[-1])]], V=mydf[-1][!is.na(mydf[-1])])
# ID V
#1 1 Walter
#2 2 Walter
#3 3 Walter
#4 4 Gus
#5 5 Gus
#6 3 Jesse
#7 4 Tuco
#8 5 Mike
#9 4 Mike
#10 5 Hank
#11 5 Saul
#12 5 Flynn
Or much more readable in base R:
sel <- which(!is.na(mydf[-1]), arr.ind=TRUE)
data.frame(ID=mydf$ID[sel[,1]], V=mydf[-1][sel])
Using tidyr (with myvar <- data.frame(ID, myvar) built as in the question):
library(tidyr)
library(dplyr)
myvar %>%
  gather(variable, value, V1:V5) %>%
  select(ID, value) %>%
  filter(value != "NA")
If your NAs are coded as NA instead of "NA", then we can in fact use the na.rm = TRUE option in gather. E.g.:
myvar[myvar == "NA"] <- NA
myvar %>%
  gather(variable, value, V1:V5, na.rm = TRUE) %>%
  select(ID, value)
gives
ID value
1 1 Walter
2 2 Walter
3 3 Walter
4 4 Gus
5 5 Gus
6 3 Jesse
7 4 Tuco
8 5 Mike
9 4 Mike
10 5 Hank
11 5 Saul
12 5 Flynn
Since you are thinking of huge data, time performance matters; even sorting afterwards could take forever. Here's my solution. You'd be better off using data.table, but here I'll use reshape2.
First solution:
myvar <- as.data.frame(rbind(c("Walter","NA","NA","NA","NA"),
                             c("Walter","NA","NA","NA","NA"),
                             c("Walter","Jesse","NA","NA","NA"),
                             c("Gus","Tuco","Mike","NA","NA"),
                             c("Gus","Mike","Hank","Saul","Flynn")))
ID <- as.factor(c(1:5))
dat <- data.frame(ID, myvar)
## replace the "NA" strings with real NA in every column
dat[] <- lapply(dat, function(x) {x[x == "NA"] <- NA; x})
str(dat$V5)
library(dplyr)
library(reshape2)
dat2 <- melt(dat, id.vars = "ID", measure.vars = paste0("V", 1:5), na.rm = TRUE)
dat2
dat2[, c('ID', 'value')]
The second solution needs some preprocessing; for huge data, I would recommend data.table.
datB <- t(dat)
datB
colnames(datB) <- datB["ID", ]
datB <- datB[-1,]
melt(datB, measure.vars = 1:5, na.rm=TRUE)[, c('Var2', 'value')]
You do not need sorting afterwards.
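For completeness, the same reshape written with data.table itself, which the answer recommends for huge data (a sketch; dat is the combined data frame with real NAs from the first solution):
library(data.table)
melt(as.data.table(dat), id.vars = "ID", na.rm = TRUE)[, .(ID, value)]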

Converting data to a 2 column format?

If I have a dataset like the following:
LA NY MA
 1  2  3
 4  5  6
 3  5
    4
(In other words, each column has a different length: LA has 3 values, NY has 4 values, etc.)
I am trying to use lm to perform an ANOVA test (to decide whether the mean is the same in each state), and it keeps showing "an error occurred" because the rows do not match. One idea I had was to convert the data to a 2-column format. Which command/package should I use to perform that task?
Edit: the data comes from a .txt file.
Another option, after reading the file, to convert to a 2-column format would be:
df <- read.table("Betty.txt", header=TRUE, fill=TRUE, sep="\t")
## (as #Richard Scriven mentioned in the comment)
na.omit(stack(df))
# values ind
#1 1 LA
#2 4 LA
#3 3 LA
#5 2 NY
#6 5 NY
#7 5 NY
#8 4 NY
#9 3 MA
#10 6 MA
Update
I got the above by transforming the data to be tab-delimited. But if the file is copied/pasted directly from the OP's post without any change (making sure that there are spaces after the 2nd column in the 3rd and 4th rows):
lines <- readLines('Betty1.txt')
lines2 <- gsub("(?<=[^ ]) +|^[ ]+(?<=[ ])(?=[^ ])", ",", lines, perl=TRUE)
lines2
#[1] "LA,NY,MA" "1,2,3" "4,5,6" "3,5," ",4,"
df1 <- read.table(text=lines2, sep=',', header=TRUE)
df1
# LA NY MA
#1 1 2 3
#2 4 5 6
#3 3 5 NA
#4 NA 4 NA
and then do
na.omit(stack(df1))
Update 2
Another option, if you have fixed-width columns, is to use read.fwf:
df <- read.fwf('Betty1.txt', widths=c(3,3,3), skip=1)
colnames(df) <- scan('Betty1.txt', nlines=1, what="", quiet=TRUE)
df
# LA NY MA
#1 1 2 3
#2 4 5 6
#3 3 5 NA
#4 NA 4 NA
library(tidyr)
gather(df, Var, Val, LA:MA, na.rm=TRUE)
# Var Val
#1 LA 1
#2 LA 4
#3 LA 3
#4 NY 2
#5 NY 5
#6 NY 5
#7 NY 4
#8 MA 3
#9 MA 6
Just add an 'NA' to the 4th line of your text and try:
> ddf = read.table(text="
+ LA NY MA
+ 1 2 3
+ 4 5 6
+ 3 5
+ NA 4
+ ", header=T, fill=T)
>
> ddf
LA NY MA
1 1 2 3
2 4 5 6
3 3 5 NA
4 NA 4 NA
>
> dput(ddf)
structure(list(LA = c(1L, 4L, 3L, NA), NY = c(2L, 5L, 5L, 4L),
MA = c(3L, 6L, NA, NA)), .Names = c("LA", "NY", "MA"), class = "data.frame", row.names = c(NA,
-4L))
>
> library(reshape2)
> mm = melt(ddf)
No id variables; using all as measure variables
>
> mm
variable value
1 LA 1
2 LA 4
3 LA 3
4 LA NA
5 NY 2
6 NY 5
7 NY 5
8 NY 4
9 MA 3
10 MA 6
11 MA NA
12 MA NA
>
> with(mm, aov(value~variable))
Call:
aov(formula = value ~ variable)
Terms:
variable Residuals
Sum of Squares 4.833333 15.166667
Deg. of Freedom 2 6
Residual standard error: 1.589899
Estimated effects may be unbalanced
3 observations deleted due to missingness
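The printed aov object shows only the sums of squares; to actually decide whether the means differ, you would normally look at the F test, e.g. (a sketch, not part of the original answer):
summary(with(mm, aov(value ~ variable)))
## or, since the question mentions lm:
anova(lm(value ~ variable, data = mm))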

Apply a function to dataframe subsetted by all possible combinations of categorical variables

An example data frame with categorical variables catA, catB, and catC; obs is some observed value.
catA <- rep(factor(c("a","b","c")), length.out=100)
catB <- rep(factor(1:4), length.out=100)
catC <- rep(factor(c("d","e","f")), length.out=100)
obs <- runif(100,0,100)
dat <- data.frame(catA, catB, catC, obs)
All possible subsets of data by categorical variables.
allsubs <- expand.grid(catA = c(NA,levels(catA)), catB = c(NA,levels(catB)),
catC = c(NA,levels(catC)))
> head(allsubs, n=10)
catA catB catC
1 <NA> <NA> <NA>
2 a <NA> <NA>
3 b <NA> <NA>
4 c <NA> <NA>
5 <NA> 1 <NA>
6 a 1 <NA>
7 b 1 <NA>
8 c 1 <NA>
9 <NA> 2 <NA>
10 a 2 <NA>
Now, what is the easiest way to create an output data frame with a results column containing the results of a function applied to the corresponding subset of dat (the subset defined in each row by the combination of cat variables)? The output should look like the data frame 'whatiwant' below, where the results column contains the results of a function applied to each subset.
> whatiwant
catA catB catC results
1 <NA> <NA> <NA> *
2 a <NA> <NA> *
3 b <NA> <NA> *
4 c <NA> <NA> *
5 <NA> 1 <NA> *
6 a 1 <NA> *
7 b 1 <NA> *
8 c 1 <NA> *
9 <NA> 2 <NA> *
10 a 2 <NA> *
So, if the function applied was 'mean', the results should be:
whatiwant$results[1] = mean(subset(dat, )$obs)
whatiwant$results[2] = mean(subset(dat, catA == "a")$obs)
etc., etc.
ans <- with(dat, tapply(obs, list(catA, catB, catC), mean))
ans <- data.frame(expand.grid(dimnames(ans)), results=c(ans))
names(ans)[1:3] <- names(dat)[1:3]
str(ans)
# 'data.frame': 36 obs. of 4 variables:
# $ catA : Factor w/ 3 levels "a","b","c": 1 2 3 1 2 3 1 2 3 1 ...
# $ catB : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 2 2 3 3 3 4 ...
# $ catC : Factor w/ 3 levels "d","e","f": 1 1 1 1 1 1 1 1 1 1 ...
# $ results: num 69.7 NA NA 55.3 NA ...
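Note that this covers only the complete catA x catB x catC combinations (36 rows), not the marginal rows of 'whatiwant' where some variables are NA. Those could be produced the same way from smaller sets of grouping variables, e.g. (a sketch, not part of the original answer):
m <- with(dat, tapply(obs, list(catA), mean))
data.frame(catA = names(m), catB = NA, catC = NA, results = c(m))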
An alternative approach: one function to get all combinations of the variables and another to apply a function over all subsets. The combinations function was borrowed from another post.
## return all combinations of vector up to maximum length n
multicombn <- function(dat, n) {
unlist(lapply(1:n, function(x) combn(dat, x, simplify=F)), recursive=F)
}
For allsubs, vars is of the form c("catA","catB","catC") and out.name = "mean".
func needs to be written in a form that ddply would accept:
func <- function(x) mean(x$obs, na.rm = TRUE)
library(plyr)
allsubs <- function(indat, vars, func = NULL, out.name = NULL) {
  results <- data.frame()
  nvars <- rev(multicombn(vars, length(vars)))
  for (i in 1:length(nvars)) {
    results <-
      rbind.fill(results, ddply(indat, unlist(nvars[i]), func))
  }
  if (!missing(out.name)) names(results)[length(vars) + 1] <- out.name
  results
}
One difference between this answer and shwaund's: this does not return rows for empty subsets, so there are no NAs in the results column.
allsubs(dat, c("catA", "catB", "catC"), func, out.name = "mean")
> head(allsubs(dat, vars, func, out.name = "mean"),20)
catA catB catC mean
1 a 1 d 56.65909
2 a 2 d 54.98116
3 a 3 d 37.52655
4 a 4 d 58.29034
5 b 1 e 52.88945
6 b 2 e 50.43122
7 b 3 e 52.57115
8 b 4 e 59.45348
9 c 1 f 52.41637
10 c 2 f 34.58122
11 c 3 f 46.80256
12 c 4 f 51.58668
13 <NA> 1 d 56.65909
14 <NA> 1 e 52.88945
15 <NA> 1 f 52.41637
16 <NA> 2 d 54.98116
17 <NA> 2 e 50.43122
18 <NA> 2 f 34.58122
19 <NA> 3 d 37.52655
20 <NA> 3 e 52.57115
This isn't the cleanest solution, but I think it gets close to what you want.
getAllSubs <- function(df, lookup, fun) {
  out <- lapply(1:nrow(lookup), function(i) {
    df_new <- df
    if (length(na.omit(unlist(lookup[i, ]))) > 0) {
      for (j in colnames(lookup)[which(!is.na(unlist(lookup[i, ])))]) {
        df_new <- df_new[df_new[, j] == lookup[i, j], ]
      }
    }
    fun(df_new)
  })
  if (mean(sapply(out, length) == 1) == 1) {
    out <- unlist(out)
  } else {
    out <- do.call("rbind", out)
  }
  final <- cbind(lookup, out)
  final[is.na(final)] <- NA
  final
}
As it is currently written, you have to construct the lookup table beforehand, but you could just as easily move that construction into the function itself. I added a few lines at the end to make sure it could accommodate outputs of different lengths and to turn NaNs into NAs, just because that seemed to create a cleaner output. As currently written, it applies the function to the entire original data frame in cases where all columns are NA.
dat_out <- getAllSubs(dat, allsubs, function(x) mean(x$obs, na.rm = TRUE))
head(dat_out,20)
catA catB catC out
1 <NA> <NA> <NA> 47.25446
2 a <NA> <NA> 51.54226
3 b <NA> <NA> 46.45352
4 c <NA> <NA> 43.63767
5 <NA> 1 <NA> 47.23872
6 a 1 <NA> 66.59281
7 b 1 <NA> 32.03513
8 c 1 <NA> 40.66896
9 <NA> 2 <NA> 45.16588
10 a 2 <NA> 50.59323
11 b 2 <NA> 51.02013
12 c 2 <NA> 33.15251
13 <NA> 3 <NA> 51.67809
14 a 3 <NA> 48.13645
15 b 3 <NA> 57.92084
16 c 3 <NA> 49.27710
17 <NA> 4 <NA> 44.93515
18 a 4 <NA> 40.36266
19 b 4 <NA> 44.26717
20 c 4 <NA> 50.74718
Using only vectorized functions and base R
# Find all possible subsets of your data
combVars <- c("catA", "catB", "catC")
subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
subsets <- do.call(c, subsets)
# Calculate means by each subset
meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
# Pull them all into one dataframe
Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)
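aggregate names its computed column x, so after the final merge the value column is called x. An optional cleanup (a sketch, not part of the original answer):
## give the computed column a clearer name
allMeans <- Reduce(function(x, y) merge(x, y, all = TRUE), meanValues)
names(allMeans)[names(allMeans) == "x"] <- "results"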
