Apply a function to a dataframe subsetted by all possible combinations of categorical variables

An example dataframe with categorical variables catA, catB, and catC; obs is some observed value.
catA <- rep(factor(c("a","b","c")), length.out=100)
catB <- rep(factor(1:4), length.out=100)
catC <- rep(factor(c("d","e","f")), length.out=100)
obs <- runif(100,0,100)
dat <- data.frame(catA, catB, catC, obs)
All possible subsets of the data by the categorical variables, where NA means a variable is not used for subsetting:
allsubs <- expand.grid(catA = c(NA, levels(catA)),
                       catB = c(NA, levels(catB)),
                       catC = c(NA, levels(catC)))
> head(allsubs, n=10)
catA catB catC
1 <NA> <NA> <NA>
2 a <NA> <NA>
3 b <NA> <NA>
4 c <NA> <NA>
5 <NA> 1 <NA>
6 a 1 <NA>
7 b 1 <NA>
8 c 1 <NA>
9 <NA> 2 <NA>
10 a 2 <NA>
Now, what is the easiest way to create an output dataframe with a results column containing the results of a function applied to the corresponding subset of dat (defined in each row by the combination of cat variables)? The output should look like the dataframe 'whatiwant' below, where the results column contains the result of the function applied to each subset.
> whatiwant
catA catB catC results
1 <NA> <NA> <NA> *
2 a <NA> <NA> *
3 b <NA> <NA> *
4 c <NA> <NA> *
5 <NA> 1 <NA> *
6 a 1 <NA> *
7 b 1 <NA> *
8 c 1 <NA> *
9 <NA> 2 <NA> *
10 a 2 <NA> *
So, if the function applied was 'mean', the results should be:
dat$results[1] = mean(subset(dat,)$obs)
dat$results[2] = mean(subset(dat, catA=="a")$obs)
and so on.

# mean of obs for every full combination of the three factors
ans <- with(dat, tapply(obs, list(catA, catB, catC), mean))
# flatten the 3-d array into a dataframe of combinations plus results
ans <- data.frame(expand.grid(dimnames(ans)), results = c(ans))
names(ans)[1:3] <- names(dat)[1:3]
str(ans)
# 'data.frame': 36 obs. of 4 variables:
# $ catA : Factor w/ 3 levels "a","b","c": 1 2 3 1 2 3 1 2 3 1 ...
# $ catB : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 2 2 3 3 3 4 ...
# $ catC : Factor w/ 3 levels "d","e","f": 1 1 1 1 1 1 1 1 1 1 ...
# $ results: num 69.7 NA NA 55.3 NA ...
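The question also asks for the marginal rows, where an NA in the grid means a variable is not used for subsetting. A minimal base R sketch over the allsubs grid built in the question (the whatiwant name is taken from there, and mean stands in for any summary function) could be:
whatiwant <- allsubs
whatiwant$results <- sapply(seq_len(nrow(allsubs)), function(i) {
  keep <- rep(TRUE, nrow(dat))                 # start with every observation
  for (v in names(allsubs)) {
    if (!is.na(allsubs[i, v]))                 # NA level: variable not used in this subset
      keep <- keep & dat[[v]] == as.character(allsubs[i, v])
  }
  mean(dat$obs[keep])                          # summary function applied to the subset
})
Combinations with no matching rows come back as NaN rather than being dropped.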

An alternative approach: one function to get all combinations of the variables, and another to apply a function over all subsets. The combinations function was stolen from another post...
## return all combinations of a vector up to maximum length n
multicombn <- function(dat, n) {
  unlist(lapply(1:n, function(x) combn(dat, x, simplify = FALSE)), recursive = FALSE)
}
For allsubs below, vars is of the form c("catA","catB","catC") and out.name = "mean". func needs to be written in a form that ddply would accept, e.g.
func <- function(x) mean(x$obs, na.rm = TRUE)
library(plyr)
allsubs <- function(indat, vars, func = NULL, out.name = NULL) {
  results <- data.frame()
  # all combinations of the grouping variables, largest combination first
  nvars <- rev(multicombn(vars, length(vars)))
  for (i in 1:length(nvars)) {
    # apply func to the data split by the current combination of variables
    results <- rbind.fill(results, ddply(indat, unlist(nvars[i]), func))
  }
  if (!missing(out.name)) names(results)[length(vars) + 1] <- out.name
  results
}
One difference between this answer and shwaund's: this one does not return rows for empty subsets, so there are no NAs in the results column.
allsubs(dat, c("catA","catB","catc"), func, out.name="mean")
> head(allsubs(dat, vars, func, out.name = "mean"),20)
catA catB catC mean
1 a 1 d 56.65909
2 a 2 d 54.98116
3 a 3 d 37.52655
4 a 4 d 58.29034
5 b 1 e 52.88945
6 b 2 e 50.43122
7 b 3 e 52.57115
8 b 4 e 59.45348
9 c 1 f 52.41637
10 c 2 f 34.58122
11 c 3 f 46.80256
12 c 4 f 51.58668
13 <NA> 1 d 56.65909
14 <NA> 1 e 52.88945
15 <NA> 1 f 52.41637
16 <NA> 2 d 54.98116
17 <NA> 2 e 50.43122
18 <NA> 2 f 34.58122
19 <NA> 3 d 37.52655
20 <NA> 3 e 52.57115

This isn't the cleanest solution, but I think it gets close to what you want.
getAllSubs <- function(df, lookup, fun) {
  out <- lapply(1:nrow(lookup), function(i) {
    df_new <- df
    # subset on every variable that is non-NA in this row of the lookup table
    if (length(na.omit(unlist(lookup[i, ]))) > 0) {
      for (j in colnames(lookup)[which(!is.na(unlist(lookup[i, ])))]) {
        df_new <- df_new[df_new[, j] == lookup[i, j], ]
      }
    }
    fun(df_new)
  })
  # collapse to a vector if every result has length 1, otherwise row-bind
  if (mean(sapply(out, length) == 1) == 1) {
    out <- unlist(out)
  } else {
    out <- do.call("rbind", out)
  }
  final <- cbind(lookup, out)
  final[is.na(final)] <- NA   # e.g. turns NaN from empty subsets into NA
  final
}
As it is currently written, you have to construct the lookup table beforehand, but you could just as easily move that construction into the function itself. I added a few lines at the end to make sure it could accommodate outputs of different lengths and so that NaNs are turned into NAs, just because that seemed to give a cleaner output. As currently written, it applies the function to the entire original data frame in cases where all columns are NA.
dat_out <- getAllSubs(dat, allsubs, function(x) mean(x$obs, na.rm = TRUE))
head(dat_out,20)
catA catB catC out
1 <NA> <NA> <NA> 47.25446
2 a <NA> <NA> 51.54226
3 b <NA> <NA> 46.45352
4 c <NA> <NA> 43.63767
5 <NA> 1 <NA> 47.23872
6 a 1 <NA> 66.59281
7 b 1 <NA> 32.03513
8 c 1 <NA> 40.66896
9 <NA> 2 <NA> 45.16588
10 a 2 <NA> 50.59323
11 b 2 <NA> 51.02013
12 c 2 <NA> 33.15251
13 <NA> 3 <NA> 51.67809
14 a 3 <NA> 48.13645
15 b 3 <NA> 57.92084
16 c 3 <NA> 49.27710
17 <NA> 4 <NA> 44.93515
18 a 4 <NA> 40.36266
19 b 4 <NA> 44.26717
20 c 4 <NA> 50.74718
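As the answer notes, the lookup construction could be moved inside the function. A hypothetical wrapper (my own sketch; the getAllSubsAuto name is not from the answer) that builds the NA-padded grid from the factor columns and then delegates to getAllSubs:
getAllSubsAuto <- function(df, vars, fun) {
  # one NA entry per variable means "this variable is not used for subsetting"
  lookup <- expand.grid(lapply(df[vars], function(v) c(NA, levels(v))))
  getAllSubs(df, lookup, fun)
}
# e.g. getAllSubsAuto(dat, c("catA", "catB", "catC"), function(x) mean(x$obs, na.rm = TRUE))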

Using only vectorized functions and base R:
# Find all possible subsets of your data
combVars <- c("catA", "catB", "catC")
subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
subsets <- do.call(c, subsets)
# Calculate means by each subset
meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
# Pull them all into one dataframe
Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)
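Assuming the meanValues list above, the merged result can be assigned and the value column (named x by aggregate when it is given a bare vector) renamed to match the question's results column; a small follow-up sketch:
allMeans <- Reduce(function(x, y) merge(x, y, all = TRUE), meanValues)
names(allMeans)[names(allMeans) == "x"] <- "results"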

Related

Using mapply to set values based on values in other columns

Based on my previous question, I need help with using the mapply function correctly.
x <- data.frame(a = seq(1,3), b = seq(2,4), c = seq(3,5), d = seq(4,6), b2 = seq(5,7), c2 = seq(6,8), d2 = seq(7,9))
# a b c d b2 c2 d2
# 1 2 3 4 5 6 7
# 2 3 4 5 6 7 8
# 3 4 5 6 7 8 9
My goal is to look at the columns b2 to d2 and, based on their values, change the values in columns b to d respectively. I can do this for a single column quite easily:
x[which(x$b2 == 7), ]["b"] <- NA_real_
My problem is that I want this applied across all my columns but I don't know how to convert this single column formula to work on multiple columns. I tried:
onez <- c(2:4)
twoz <- c(5:7)
f <- function(df, ones, twos) {
  df[which(df[, twos] == 7), ][ones] <- NA_real_
}
mapply(f, df = x, ones = onez, twos = twoz)
But I'm getting error messages (incorrect dimensions, etc.), and I see that my function is messy, but I lack the knowledge to fix it.
One way to do it is to:
1. Get the subset of the data frame with columns 5, 6, 7: x[5:7]
2. Check from that subset which values satisfy your condition: x[5:7] == 7
3. Replace those values with NA: ... <- NA
This gives the following:
x[5:7][x[5:7] == 7] <- NA
x
# a b c d b2 c2 d2
#1 1 2 3 4 5 6 NA
#2 2 3 4 5 6 NA 8
#3 3 4 5 6 NA 8 9
If you instead want the NAs to be placed at x[2:4], then you can do:
x[2:4][x[5:7] == 7] <- NA
x
# a b c d b2 c2 d2
#1 1 2 3 NA 5 6 7
#2 2 3 NA 5 6 7 8
#3 3 NA 5 6 7 8 9
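The same replacement can also be written with column names instead of positions, which may be safer if the column order changes; a sketch assuming each of b, c, d has a matching *2 partner column:
ones <- c("b", "c", "d")
twos <- c("b2", "c2", "d2")
x[ones][x[twos] == 7] <- NA   # logical matrix from the *2 columns indexes the target columns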

Combine Strings with Missing Value

This is my sample data.
index <- c(1,2,3,4,5,6,7,8,9,10)
a <- c('a','b','c',NA,'D','e',NA,'g','h','i')
data <- data.frame(index,a)
What I would like to do is create a new column, name, where only 'a' and 'b' stay. All others, like 'c','d','e'..., will be tagged as 'others', while NA stays as NA.
I tried to use the grepl function, but it seems it does not work with data containing missing values:
data$name = ifelse(!grepl('(a|b)', data$a), 'others', data$name)
In base R:
data$res <- as.character(data$a)
data$res[! data$a %in% c("a","b") & !is.na(data$a)] <- "Other"
data
# index a res
# 1 1 a a
# 2 2 b b
# 3 3 c Other
# 4 4 <NA> <NA>
# 5 5 D Other
# 6 6 e Other
# 7 7 <NA> <NA>
# 8 8 g Other
# 9 9 h Other
# 10 10 i Other
Note that the new column is of type character here.
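If a factor is preferred, the character column can be converted afterwards (a one-line sketch):
data$res <- factor(data$res)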
Using dplyr and its recode function, you could do
library(dplyr)
data %>% mutate(name = recode(a, a = "a", b = "b", .default = "other"))
# index a name
# 1 1 a a
# 2 2 b b
# 3 3 c other
# 4 4 <NA> <NA>
# 5 5 D other
# 6 6 e other
# 7 7 <NA> <NA>
# 8 8 g other
# 9 9 h other
# 10 10 i other
With a more complicated match, you might use case_when instead:
data %>% mutate(name = case_when(
  is.na(a) ~ NA_character_,
  a %in% c("a", "b") ~ as.character(a),
  TRUE ~ "other"))

Deriving number of unique ID counts from a data frame and creating new columns for counts and values

I have a dataframe reshape/counting problem. Consider the following data frame with a column of non-unique IDs and a value column (could be uniques but doesn't have to be):
id<-c(1,1,1,2,2,3,4,4,4,4)
value_df<-c("A","B","C","D","E","F","G","H","I","J")
df<-data.frame(id,value_df)
df
id value_df
1 1 A
2 1 B
3 1 C
4 2 D
5 2 E
6 3 F
7 4 G
8 4 H
9 4 I
10 4 J
What I am after is to create a dataframe where the first column contains the unique ids, the second the counts of these, and the remaining n columns the values for each id, like this:
df_counts_reshape
  id number_id value_df_1 value_df_2 value_df_3 value_df_4
1  1         3          A          B          C       <NA>
2  2         2          D          E       <NA>       <NA>
3  3         1          F       <NA>       <NA>       <NA>
4  4         4          G          H          I          J
Using the plyr package, deriving the counts is easy, like this:
count(df, "id")
but then my problem starts. To get the rest of the dataframe I have tried melt and dcast from the reshape2 package.
df_melted<-melt(df,id.vars =c("id"), measure.vars = c("value_df"))
df_cast<-dcast(df_melted,id~value)
This, however, results in the following output:
df_cast
id A B C D E F G H I J
1 1 A B C <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 2 <NA> <NA> <NA> D E <NA> <NA> <NA> <NA> <NA>
3 3 <NA> <NA> <NA> <NA> <NA> F <NA> <NA> <NA> <NA>
4 4 <NA> <NA> <NA> <NA> <NA> <NA> G H I J
This output has many more "value" columns than I wished for, but I cannot find an easy way to simplify it to the output I want above. I was also thinking that the last step, getting the number_id in, would be done with rbind.
Needless to say, my actual dataframes contain many thousands of rows, which makes the current melt/dcast output very clumsy, with thousands of columns.
Use this:
df<-data.frame(id,value_df)
df$num <- ave(as.character(df$value_df), df$id, FUN = seq_along)
df = reshape(df,idvar = "id",direction = "wide",timevar = "num" )
Result:
> df
id value_df.1 value_df.2 value_df.3 value_df.4
1 1 A B C <NA>
4 2 D E <NA> <NA>
6 3 F <NA> <NA> <NA>
7 4 G H I J
What this does is add a column that sequences 1:n within each group of ids; reshape() then uses those values to form the new column names.
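If the number_id count column from the desired output is also wanted, a variant of the same approach (my sketch, not part of the original answer) adds the per-id count first and carries it through reshape() via idvar:
df <- data.frame(id, value_df)
df$number_id <- ave(df$id, df$id, FUN = length)                   # rows per id
df$num <- ave(as.character(df$value_df), df$id, FUN = seq_along)  # 1..n within each id
reshape(df, idvar = c("id", "number_id"), direction = "wide", timevar = "num")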

Need help in data manipulation in R [duplicate]

This question already has answers here:
Split data frame string column into multiple columns
(16 answers)
Closed 6 years ago.
I have a dataframe with 2 columns, id and cat_list:
id cat_list
1 A
2 A|B
3 E|F|G
4 I
5 P|R|T|Z
I want to achieve the below using R code:
id cat_list1 cat_list2 cat_list3 cat_list4
1 A
2 A B
3 E F G
4 I
5 P R T Z
tidyr::separate is handy:
library(tidyr)
df %>% separate(cat_list, into = paste0('cat_list', 1:4), fill = 'right')
## id cat_list1 cat_list2 cat_list3 cat_list4
## 1 1 A <NA> <NA> <NA>
## 2 2 A B <NA> <NA>
## 3 3 E F G <NA>
## 4 4 I <NA> <NA> <NA>
## 5 5 P R T Z
We can use cSplit. Here, we don't need to worry about the number of splits, as it will detect them automatically.
library(splitstackshape)
cSplit(df1, "cat_list", "|")
# id cat_list_1 cat_list_2 cat_list_3 cat_list_4
#1: 1 A NA NA NA
#2: 2 A B NA NA
#3: 3 E F G NA
#4: 4 I NA NA NA
#5: 5 P R T Z
NOTE: It may be better to fill with NA rather than ''.

Splitting data in a cell

I have a data set that looks like this
Code Product
1 A|B
2 A|B|C
3 A|B|C|D|E
When I split the column Product using the colsplit function, duplication occurs. The output of colsplit looks like this:
Code Product.1 Product.2 Product.3 Product.4 Product.5
1 A B A B A
2 A B C A B
3 A B C D E
This happens because one of the cells had five elements. Is there any way to avoid this duplication?
Update (21 Oct 2013)
The concepts below have been rolled into a family of functions called concat.split.* in my "splitstackshape" package. Here is a very straightforward solution using concat.split.multiple:
library(splitstackshape)
concat.split.multiple(temp, "Product", "|", "long")
# Code time Product
# 1 1 1 A
# 2 2 1 A
# 3 3 1 A
# 4 1 2 B
# 5 2 2 B
# 6 3 2 B
# 7 1 3 <NA>
# 8 2 3 C
# 9 3 3 C
# 10 1 4 <NA>
# 11 2 4 <NA>
# 12 3 4 D
# 13 1 5 <NA>
# 14 2 5 <NA>
# 15 3 5 E
Remove the "long" argument if you want the wide format, but your comments indicated that ultimately you wanted a long format for your output.
Original answer (17 Dec 2012)
You can do this with strsplit and sapply as follows:
# Your data
temp <- structure(list(Code = 1:3, Product = c("A|B", "A|B|C", "A|B|C|D|E")),
                  .Names = c("Code", "Product"), class = "data.frame",
                  row.names = c(NA, -3L))
temp1 <- strsplit(temp$Product, "\\|")   # split each Product cell on "|"
temp1 <- data.frame(Code = temp$Code,
                    t(sapply(temp1, function(x) {
                      # pad each split vector with NA up to the longest length
                      temp <- matrix(NA, nrow = max(sapply(temp1, length)))
                      temp[1:length(x)] <- x
                      temp
                    })))
temp1
# Code X1 X2 X3 X4 X5
# 1 1 A B <NA> <NA> <NA>
# 2 2 A B C <NA> <NA>
# 3 3 A B C D E
Or... use rbind.fill from the "plyr" package, after making each of your split rows into a one-row data.frame:
temp1 <- strsplit(temp$Product, "\\|")
library(plyr)
data.frame(Code = temp$Code,
           rbind.fill(lapply(temp1, function(x) data.frame(t(x)))))
# Code X1 X2 X3 X4 X5
# 1 1 A B <NA> <NA> <NA>
# 2 2 A B C <NA> <NA>
# 3 3 A B C D E
Or... inspired by #DWin's great answer here, re-read the second column as a data.frame in itself.
newcols <- max(sapply(strsplit(temp$Product, "\\|"), length))
temp2 <- data.frame(Code = temp$Code,
                    read.table(text = as.character(temp$Product),
                               sep = "|", fill = TRUE,
                               col.names = paste("Product", seq(newcols))))
temp2
# Code Product.1 Product.2 Product.3 Product.4 Product.5
# 1 1 A B
# 2 2 A B C
# 3 3 A B C D E
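If NA is preferred over the empty strings that fill = TRUE leaves in temp2, the blanks can be converted afterwards (a small sketch):
temp2[temp2 == ""] <- NA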
