simplify multiple one to one paired aggregation - r

I'd like to calculate mean for multiple one to one paired aggregation. For example, I have a data.frame below. I'd like to calculate the mean for for column b1 by sym & a1, and b2 by sym & a2 simultaneously.
sym a1 a2 b1 b2
1 a 1 2 1 1
2 a 2 2 2 2
3 a 1 2 3 3
4 a 2 2 4 4
5 b 1 1 5 5
6 b 2 1 6 6
7 b 1 1 7 7
8 b 2 1 8 8
Here is my code which uses lapply to iterate over each pair. Is there any more efficient way than this?
df <- data.frame(sym=c(rep('a', 4), rep('b', 4)), a1=rep(1:2, 4),
a2=rep(2:1, each=4), b1=rep(1:8), b2=rep(1:8))
tmp <- ddply(df, "sym", function(x) {
temp.ls <- lapply(1:2, function(i) {
t2 <- aggregate(x = x[3+i], by=x[1+i], FUN=function(.){mean(., na.rm = T)})
colnames(t2) <- c("a", "b")
t2
})
temp.all <- Reduce(function(x, y) merge(x, y, by=c("a"), all=T, sort=T),
temp.ls)
})

dplyr makes this pretty straightforward:
library(dplyr)
inner_join(df %>% group_by(sym, a1) %>% summarise(b1.mean=mean(b1)),
df %>% group_by(sym, a2) %>% summarise(b2.mean=mean(b2)))
# Joining by: "sym"
# Source: local data frame [4 x 5]
# Groups: sym
#
# sym a1 b1.mean a2 b2.mean
# 1 a 1 2 2 2.5
# 2 a 2 3 2 2.5
# 3 b 1 6 1 6.5
# 4 b 2 7 1 6.5
If you want a single column for a, and want to fill non-appearing combinations with NA as in your example solution, then left_join is an option:
left_join(df %>% group_by(sym, a=a1) %>% summarise(b1.mean=mean(b1)),
df %>% group_by(sym, a=a2) %>% summarise(b2.mean=mean(b2)),
by=c('sym', 'a'))
# Source: local data frame [4 x 4]
# Groups: sym
#
# sym a b1.mean b2.mean
# 1 a 1 2 NA
# 2 a 2 3 2.5
# 3 b 1 6 6.5
# 4 b 2 7 NA
Hat-tip to #beginnerR for reminding me about dplyr join operations.
EDIT
In response to the comments, if you have more than two groupings, and want to join all the resulting tables together, then here's one way to do this:
# Example data
set.seed(1)
(d <- data.frame(sym=sample(letters[1:4], 10, replace=T),
a1=sample(5, 10, replace=TRUE),
a2=sample(5, 10, replace=TRUE),
a3=sample(5, 10, replace=TRUE),
b1=runif(10), b2=runif(10), b3=runif(10)))
# sym a1 a2 a3 b1 b2 b3
# 1 b 2 5 3 0.8209463 0.47761962 0.91287592
# 2 b 1 2 3 0.6470602 0.86120948 0.29360337
# 3 c 4 4 3 0.7829328 0.43809711 0.45906573
# 4 d 2 1 1 0.5530363 0.24479728 0.33239467
# 5 a 4 2 5 0.5297196 0.07067905 0.65087047
# 6 d 3 2 4 0.7893562 0.09946616 0.25801678
# 7 d 4 1 4 0.0233312 0.31627171 0.47854525
# 8 c 5 2 1 0.4772301 0.51863426 0.76631067
# 9 c 2 5 4 0.7323137 0.66200508 0.08424691
# 10 a 4 2 3 0.6927316 0.40683019 0.87532133
L <- mapply(function(x, y) {
grpd <- eval(substitute(group_by(d, sym, a=x), list(x=as.name(x))))
eval(substitute(summarise(grpd, mean(y)), list(y=as.name(y))))
}, paste0('a', 1:3), paste0('b', 1:3), SIMPLIFY=FALSE)
Reduce(function(...) left_join(..., all=T), L)
# Source: local data frame [9 x 5]
# Groups: sym
#
# sym a mean(b1) mean(b2) mean(b3)
# 1 a 4 0.6112256 NA NA
# 2 b 1 0.6470602 NA NA
# 3 b 2 0.8209463 0.86120948 NA
# 4 c 2 0.7323137 0.51863426 NA
# 5 c 4 0.7829328 0.43809711 0.08424691
# 6 c 5 0.4772301 0.66200508 NA
# 7 d 2 0.5530363 0.09946616 NA
# 8 d 3 0.7893562 NA NA
# 9 d 4 0.0233312 NA 0.36828101

Related

Assign NA to duplicate values within dataframe prior to using reshape

I want to assign NA to a duplicate value prior to using reshape in order to avoid duplicates in my wide dataset after reshaping. In the example data frame below, I would like to assign NA to all duplicate values in X1 and X2, but not X3, for each ID in my dataset. This means for ID=3, NA should be assigned to X2 in row 4, and for ID=4 this applies to X1 for row 6 and 8, and to x2 also for row 6 and 8. Values of X3 should remain. I want to assign NA since all rows should remain in the data frame.
df <- read.table(header=TRUE,text =
"ID X1 X2 X3
1 A X 23
2 B Y 4
3 A X 32
3 B X 6
4 A Y 45
4 B Y 7
4 A Z 5
4 B Z 3
")
ID X1 X2 X3
1 1 A X 23
2 2 B Y 4
3 3 A X 32
4 3 B X 6
5 4 A Y 45
6 4 B Y 7
7 4 A Z 5
8 4 B Z 3
duplicated() is useful for identifying duplicates.
df[duplicated(df[c("ID", "X1")]), "X1"] = NA
df[duplicated(df[c("ID", "X2")]), "X2"] = NA
df
# ID X1 X2 X3
# 1 1 A X 23
# 2 2 B Y 4
# 3 3 A X 32
# 4 3 B <NA> 6
# 5 4 A Y 45
# 6 4 B <NA> 7
# 7 4 <NA> Z 5
# 8 4 <NA> <NA> 3
We can use dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate_each(funs(replace(., duplicated(.), NA)), X1:X2)
# ID X1 X2 X3
# <int> <fctr> <fctr> <int>
#1 1 A X 23
#2 2 B Y 4
#3 3 A X 32
#4 3 B NA 6
#5 4 A Y 45
#6 4 B NA 7
#7 4 NA Z 5
#8 4 NA NA 3
You could try:
library(data.table)
setDT(df)
df[, c("X1","X2") := .(ifelse(duplicated(X1), NA, X1), ifelse(duplicated(X2), NA, X2)), by = ID]
Result:
ID X1 X2 X3
1: 1 A X 23
2: 2 B Y 4
3: 3 A X 32
4: 3 B NA 6
5: 4 A Y 45
6: 4 B NA 7
7: 4 NA Z 5
8: 4 NA NA 3

Calculate differences on a variable between factor levels

I have a data.frame with exactly one value measured for each subject at multiple timepoints. It simplifies to this:
> set.seed(42)
> x = data.frame(subject=rep(c('a', 'b', 'c'), 3), time=rep(c(1,2,3), each=3), value=rnorm(3*3, 0, 1))
> x
subject time value
1 a 1 1.37095845
2 b 1 -0.56469817
3 c 1 0.36312841
4 a 2 0.63286260
5 b 2 0.40426832
6 c 2 -0.10612452
7 a 3 1.51152200
8 b 3 -0.09465904
9 c 3 2.01842371
I want to calculate the change in value for each timepoint and for each subject. For this simple example, my My current solution is this:
> x$diff[x$time==1] = x$value[x$time==2] - x$value[x$time==1]
> x$diff[x$time==2] = x$value[x$time==3] - x$value[x$time==2]
> x
subject time value diff
1 a 1 1.37095845 -0.7380958
2 b 1 -0.56469817 0.9689665
3 c 1 0.36312841 -0.4692529
4 a 2 0.63286260 0.8786594
5 b 2 0.40426832 -0.4989274
6 c 2 -0.10612452 2.1245482
7 a 3 1.51152200 NA
8 b 3 -0.09465904 NA
9 c 3 2.01842371 NA
... and then remove the last rows. However, in my actual data set, there's way more levels of time and I need to do this for several columns instead of just value. The code gets very ugly. Is there a neat way to do this? A solution which does not assume that rows are ordered within subjects according to time would be nice.
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(x)), grouped by 'subject', we take the difference of the next value (shift(value, type='lead')) with the current value and assign (:=) the output to create the 'Diff' column.
library(data.table)#v1.9.6+
setDT(x)[order(time),Diff := shift(value, type= 'lead') - value ,
by = subject]
# subject time value Diff
#1: a 1 1.37095845 -0.7380958
#2: b 1 -0.56469817 0.9689665
#3: c 1 0.36312841 -0.4692529
#4: a 2 0.63286260 0.8786594
#5: b 2 0.40426832 -0.4989274
#6: c 2 -0.10612452 2.1245482
#7: a 3 1.51152200 NA
#8: b 3 -0.09465904 NA
#9: c 3 2.01842371 NA
You can use dplyr for this:
library(dplyr)
x %>%
arrange(time, subject) %>%
group_by(subject) %>%
mutate(diff = c(diff(value), NA))
# Source: local data frame [9 x 4]
# Groups: subject [3]
#
# subject time value diff
# (fctr) (dbl) (dbl) (dbl)
# 1 a 1 1.30970525 -1.66596287
# 2 b 1 0.12556761 -0.06070412
# 3 c 1 -1.09423634 1.38590546
# 4 a 2 -0.35625763 0.91417329
# 5 b 2 0.06486349 0.06652424
# 6 c 2 0.29166912 -0.98495562
# 7 a 3 0.55791566 NA
# 8 b 3 0.13138773 NA
# 9 c 3 -0.69328649 NA
If you want to get rid of the NAs, add %>% na.omit.
You could try ave. ave applies a function to a subset of a values, for more details see ?ave, e.g.:
x$diff <- ave(x$value, x$subject, FUN=function(x)c(diff(x), NA))
x
# subject time value diff
# 1 a 1 1.37095845 -0.7380958
# 2 b 1 -0.56469817 0.9689665
# 3 c 1 0.36312841 -0.4692529
# 4 a 2 0.63286260 0.8786594
# 5 b 2 0.40426832 -0.4989274
# 6 c 2 -0.10612452 2.1245482
# 7 a 3 1.51152200 NA
# 8 b 3 -0.09465904 NA
# 9 c 3 2.01842371 NA
BTW the diff function requires that the time is ordered.
EDIT: Update with set.seed(42).

How to drop factors that have fewer than n members

Is there a way to drop factors that have fewer than N rows, like N = 5, from a data table?
Data:
DT = data.table(x=rep(c("a","b","c"),each=6), y=c(1,3,6), v=1:9,
id=c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,3,4,4,4))
Goal: remove rows when the number of id is less than 5. The variable "id" is the grouping variable, and the groups to delete when the number of rows in a group is less than 5. In DT, need to determine which groups have less than 5 members, (groups "1" and "4") and then remove those rows.
1: a 3 5 2
2: b 6 6 2
3: b 1 7 2
4: b 3 8 2
5: b 6 9 2
6: b 1 1 3
7: c 3 2 3
8: c 6 3 3
9: c 1 4 3
10: c 3 5 3
11: c 6 6 3
Here's an approach....
Get the length of the factors, and the factors to keep
nFactors<-tapply(DT$id,DT$id,length)
keepFactors <- nFactors >= 5
Then identify the ids to keep, and keep those rows. This generates the desired results, but is there a better way?
idsToKeep <- as.numeric(names(keepFactors[which(keepFactors)]))
DT[DT$id %in% idsToKeep,]
Since you begin with a data.table, this first part uses data.table syntax.
EDIT: Thanks to Arun (comment) for helping me improve this data table answer
DT[DT[, .(I=.I[.N>=5L]), by=id]$I]
# x y v id
# 1: a 3 5 2
# 2: a 6 6 2
# 3: b 1 7 2
# 4: b 3 8 2
# 5: b 6 9 2
# 6: b 1 1 3
# 7: b 3 2 3
# 8: b 6 3 3
# 9: c 1 4 3
# 10: c 3 5 3
# 11: c 6 6 3
In base R you could use
df <- data.frame(DT)
tab <- table(df$id)
df[df$id %in% names(tab[tab >= 5]), ]
# x y v id
# 5 a 3 5 2
# 6 a 6 6 2
# 7 b 1 7 2
# 8 b 3 8 2
# 9 b 6 9 2
# 10 b 1 1 3
# 11 b 3 2 3
# 12 b 6 3 3
# 13 c 1 4 3
# 14 c 3 5 3
# 15 c 6 6 3
If using a data.table is not necessary, you can use dplyr:
library(dplyr)
data.frame(DT) %>%
group_by(id) %>%
filter(n() >= 5)

Fill missing values with new data R-Python

I have two dataset x and y
> x
a index b
1 1 1 5
2 NA 2 6
3 2 3 NA
4 NA 4 9
> y
index a
1 2 100
2 4 101
>
I would like to fill the missing values of x with the values contained in y.
I have tried to use the merge function but the result is not what I want.
> merge(x,y, by = 'index', all=T)
index a.x b a.y
1 1 1 5 NA
2 2 NA 6 100
3 3 2 7 NA
4 4 NA 9 101
In the real problem there are additional limitations:
1 - y does not fill all the missing values
2 - x and y have in common more variables (so not only a and index)
EDIT : More realistic example
> x
a index b c
1 1 1 5 NA
2 NA 2 6 NA
3 2 3 NA 5
4 NA 4 9 NA
5 NA 5 10 6
> y
index a c
1 2 100 4
2 4 101 NA
>
The solution would be accepted both in python or R
I used your merge idea and did the following using dplyr. I am sure there will be better ways of doing this task.
index <- 1:5
a <- c(1, NA, 2, NA, NA)
b <- c(5,6,NA,9,10)
c <- c(NA,NA,5,NA,6)
ana <- data.frame(index, a,b,c, stringsAsFactors=F)
index <- c(2,4)
a <- c(100, 101)
c <- c(4, NA)
bob <- data.frame(index, a,c, stringsAsFactors=F)
> ana
index a b c
1 1 1 5 NA
2 2 NA 6 NA
3 3 2 NA 5
4 4 NA 9 NA
5 5 NA 10 6
> bob
index a c
1 2 100 4
2 4 101 NA
ana %>%
merge(., bob, by = "index", all = TRUE) %>%
mutate(a.x = ifelse(a.x %in% NA, a.y, a.x)) %>%
mutate(c.x = ifelse(c.x %in% NA, c.y, c.x))
index a.x b c.x a.y c.y
1 1 1 5 NA NA NA
2 2 100 6 4 100 4
3 3 2 NA 5 NA NA
4 4 101 9 NA 101 NA
5 5 NA 10 6 NA NA
I overwrote a.x (ana$$a) using a.y (bob$a) using mutate. I did a similar thing for c.x (ana$c). If you remove a.y and c.y in the end, that will be the outcome you expect, I think.
Try:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
m1$b = x$b[match(m1$index, x$index)]
m1$c = x$c[match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
or, if there many other columns like b and c:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
for(nn in names(x)[3:4]) m1[,nn] = x[,nn][match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
If there are multiple columns to replace, you could try converting from wide to long form as shown in the first two methods and replace in one step
m1 <- merge(x,y, by="index", all=TRUE)
m1L <- reshape(m1, idvar="index", varying=grep("\\.", colnames(m1)), direction="long", sep=".")
row.names(m1L) <- 1:nrow(m1L)
lst1 <- split(m1L, m1L$time)
indx <- is.na(lst1[[1]][,4:5])
lst1[[1]][,4:5][indx] <- lst1[[2]][,4:5][indx]
res <- lst1[[1]][,c(4,1,2,5)]
res
# a index b c
#1 1 1 5 NA
#2 100 2 6 4
#3 2 3 NA 5
#4 101 4 9 NA
#5 NA 5 10 6
Or you could use dplyr with tidyr
library(dplyr)
library(tidyr)
z <- left_join(x, y, by="index") %>%
gather(Var, Val, matches("\\.")) %>%
separate(Var, c("Var1", "Var2"))
indx1 <- which(is.na(z$Val) & z$Var2=="x")
z$Val[indx1] <- z$Val[indx1+nrow(z)/2]
z %>%
spread(Var1, Val) %>%
filter(Var2=="x") %>%
select(-Var2)
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6
Or split the columns by matching names before the . and use lapply to replace the NA's.
indx <- grep("\\.", colnames(m1),value=TRUE)
res <- cbind(m1[!names(m1) %in% indx],
sapply(split(indx, gsub("\\..*", "", indx)), function(x) {
x1 <- m1[x]
indx1 <- is.na(x1[,1])
x1[,1][indx1] <- x1[,2][indx1]
x1[,1]} ))
res
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6

Calculate groupwise ratio of consecutive values in R

I want to calculate the ratio between consecutive values within groups. It is easy for differences using diff:
mdata <- data.frame(group = c("A","A","A","B","B","C","C"), x = c(2,3,5,6,3,7,6))
mdata$diff <- unlist(by(mdata$x, mdata$group, function(x){c(NA, diff(x))}))
mdata
group x diff
1 A 2 NA
2 A 3 1
3 A 5 2
4 B 6 NA
5 B 3 -3
6 C 7 NA
7 C 6 -1
Is there an equivalent function to calculate ratios? Desired output would be:
group x ratio
1 A 2 NA
2 A 3 1.5000000
3 A 5 1.6666667
4 B 6 NA
5 B 3 0.5000000
6 C 7 NA
7 C 6 0.8571429
Try dplyr:
install.packages(dplyr)
require(dplyr)
mdata <- data.frame(group = c("A","A","A","B","B","C","C"), x = c(2,3,5,6,3,7,6))
mdata <- group_by(mdata, group)
mutate(mdata, ratio = x / lag(x))
# Source: local data frame [7 x 3]
# Groups: group
# group x ratio
# 1 A 2 NA
# 2 A 3 1.5000000
# 3 A 5 1.6666667
# 4 B 6 NA
# 5 B 3 0.5000000
# 6 C 7 NA
# 7 C 6 0.8571429
Your diff would simplify to:
mutate(mdata, diff = x - lag(x))
# Source: local data frame [7 x 3]
# Groups: group
# group x diff
# 1 A 2 NA
# 2 A 3 1
# 3 A 5 2
# 4 B 6 NA
# 5 B 3 -3
# 6 C 7 NA
# 7 C 6 -1
Same idea, using data.table:
library(data.table)
dt = as.data.table(mdata)
dt[, ratio := x / lag(x), by = group]
dt
# group x ratio
#1: A 2 NA
#2: A 3 1.5000000
#3: A 5 1.6666667
#4: B 6 NA
#5: B 3 0.5000000
#6: C 7 NA
#7: C 6 0.8571429
Another option with ave:
transform(mdata,
ratio=ave(x, group, FUN=function(y) c(NA, tail(y, -1) / head(y, -1))))
Using by:
do.call(rbind, by(mdata, mdata$group, function(dat) {
dat$ratio <- dat$x / c(NA, head(dat$x, -1))
dat
}))
# group x ratio
# A.1 A 2 NA
# A.2 A 3 1.5000000
# A.3 A 5 1.6666667
# B.4 B 6 NA
# B.5 B 3 0.5000000
# C.6 C 7 NA
# C.7 C 6 0.8571429

Resources