Cumulative mean (cummean) column-wise, ignoring NAs - R

I have a data frame looking like this:
as.data.frame(matrix(c(1,2,3,NA,4,5,NA,NA,9), nrow = 3, ncol = 3))
  V1 V2 V3
1  1 NA NA
2  2  4 NA
3  3  5  9
I would like to calculate a cumulative mean per column which ignores NAs, so something like this:
   V1  V2 V3
1 1.0  NA NA
2 1.5 4.0 NA
3 2.0 4.5  9
I tried this:
B[!is.na(A)] <- as.data.frame(apply(B[!is.na(A)], 2, cummean))
But received this error message:
dim(X) must have a positive length
Thanks for your help!
Cheers

This should work. (Your attempt failed because B[!is.na(A)] returns a plain vector, and apply() needs an object with dimensions, hence the error "dim(X) must have a positive length".)
A <- as.data.frame(matrix(c(1,2,3,NA,4,5,NA,NA,9), nrow = 3, ncol = 3))
B <- as.data.frame(apply(A, 2, function(col) {
  # overwrite the non-NA entries with their running mean; NAs stay in place
  col[!is.na(col)] <- dplyr::cummean(col[!is.na(col)])
  return(col)
}))
> B
   V1  V2 V3
1 1.0  NA NA
2 1.5 4.0 NA
3 2.0 4.5  9

We can use data.table, updating each column by reference with set() (here d1 is the data frame from the question):
library(data.table)
library(dplyr) # for cummean
d1 <- as.data.frame(matrix(c(1, 2, 3, NA, 4, 5, NA, NA, 9), nrow = 3, ncol = 3))
setDT(d1)
for (j in seq_along(d1)) {
  set(d1, i = which(!is.na(d1[[j]])), j = j,
      value = cummean(d1[[j]][!is.na(d1[[j]])]))
}
d1
# V1 V2 V3
#1: 1.0 NA NA
#2: 1.5 4.0 NA
#3: 2.0 4.5 9
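For completeness, here is a base-R sketch of the same idea (my own addition, not from the original answers): a cumulative mean that skips NAs is a running sum of the non-NA values divided by a running count of the non-NA values.
A <- as.data.frame(matrix(c(1, 2, 3, NA, 4, 5, NA, NA, 9), nrow = 3, ncol = 3))
B <- as.data.frame(lapply(A, function(col) {
  # running sum over non-NA values / running count of non-NA values
  res <- cumsum(ifelse(is.na(col), 0, col)) / cumsum(!is.na(col))
  res[is.na(col)] <- NA # restore the original NA positions
  res
}))
B
#    V1  V2 V3
# 1 1.0  NA NA
# 2 1.5 4.0 NA
# 3 2.0 4.5  9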

Loop over 2 datasets in R to match the value of all rows from one dataset with only one column of another dataset

I am trying to write a loop in R to perform some iteration on two datasets called datasetA and datasetB.
datasetA has 600 entries and datasetB has 200,000 entries.
For each entry in datasetA, I want to perform the following:
If the values of V2 in both datasets are equal,
then calculate the ppm:
(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000
If |ppm| < 10, paste the ppm value into column V4 of datasetB, and paste the corresponding name from datasetA$V1 into column V1 of datasetB.
Say this is datasetA with 600 entries:
datasetA<- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
datasetA
and this is an example of datasetB with 200000 entries:
datasetB<- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
datasetB
The final table should look like this:
datasetC <- read.table(text='Alex 1 50.0001 6.459945
Alex 1 50.00032 2.059983
NA 2 70 NA
Mark 2 80 -1.25
Janine 3 88.0004 -3.14772
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
This is the nested loop I tried:
data <- datasetB
for (i in 1:5) {
  for (j in 1:8) {
    if (datasetA$V2[i] == datasetB$V2[j] &
        abs((datasetA$V3[i] - datasetB$V3[j]) / datasetA$V3[i] * 10**6) < 10) {
      data[j, 1] <- datasetA[i, 1]
      data[j, 4] <- (datasetA$V3[i] - datasetB$V3[j]) / datasetA$V3[i] * 10**6
    }
  }
}
data
Try this (I am an R noob, but let me know if it works for you):
library(data.table)
datasetA<- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
datasetB<- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
# I renamed columns for my own reference, V1,V2,.. were a bit confusing
names(datasetA) <- c("Name", "ID", "ValueA")
names(datasetB) <- c("V1", "ID", "ValueB", "V4")
# Create a key for each row in datasetB
datasetB$key <- seq(nrow(datasetB))
# Left join A to B on column ID, but first set them as data table
datasetB <- as.data.table(datasetB)
datasetA <- as.data.table(datasetA)
# Using a base-style merge() here, but you can also use a data.table left join, see below
datasetC <- merge(x = datasetB, y = datasetA, by = c("ID"), all.x = TRUE)
# Create PPM column
datasetC[, ppm := 1000000 * (ValueA - ValueB) / ValueA]
# Filter on PPM and keep columns we need
datasetC <- datasetC[abs(ppm) < 10, list(key,Name,ppm)]
# Left join to datasetB on key
setkey(datasetC, key)
setkey(datasetB, key)
datasetB <- datasetC[datasetB]
# Keep columns we need and rename to V1,... as requested
datasetB <- datasetB[, list(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]
The following seems to do what the question asks for, but I fail to reproduce 2 of the computed values in the final column V4 (they differ slightly from the expected output above).
AV2 <- sort(unique(datasetA$V2))
res <- lapply(AV2, function(v2) {
  inx_a <- datasetA[['V2']] == v2
  inx_b <- datasetB[['V2']] == v2
  mrg <- merge(datasetA[inx_a, ], datasetB[inx_b, ], by = 'V2')
  ppm <- ((mrg$V3.x - mrg$V3.y) / mrg$V3.x) * 1000000
  cbind(mrg[abs(ppm) < 10, c(2, 1, 5)], ppm = ppm[abs(ppm) < 10])
})
res <- do.call(rbind, res)
names(res) <- paste0('V', 1:4)
row.names(res) <- NULL
final <- merge(res, datasetB, by = c('V2', 'V3'), all.y = TRUE)[c(3, 1, 2, 4)]
names(final) <- paste0('V', 1:4)
final
# V1 V2 V3 V4
#1 Alex 1 50.00010 6.399946
#2 Alex 1 50.00032 1.999983
#3 <NA> 2 70.00000 NA
#4 Mark 2 80.00000 -1.250002
#5 <NA> 2 102.00000 NA
#6 Janine 3 88.00040 -3.147723
#7 <NA> 3 100.00000 NA
#8 <NA> 3 101.00000 NA
If I understand correctly, the question is asking for a join with a complex condition. This can be implemented using data.table:
library(data.table)
setDT(datasetA)[setDT(datasetB), on = "V2", {
  ppm <- (x.V3 - i.V3) / i.V3 * 1E6
  list(V1 = ifelse(abs(ppm) < 10, x.V1, NA_character_),
       V2,
       V3 = i.V3,
       V4 = ifelse(abs(ppm) < 10, ppm, NA_real_))
}, mult = "first"]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399987
2: Alex 1 50.00032 1.999987
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250000
5: Janine 3 88.00040 -3.147713
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA
Here is an alternative approach which updates datasetB in place by an update join:
library(data.table)
tmp <- setDT(datasetA)[setDT(datasetB), on = "V2"][
  , V4 := (V3 - i.V3) / i.V3 * 1E6][abs(V4) < 10][, i.V1 := NULL]
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]
datasetB[tmp, on = .(V2, V3 = i.V3), `:=`(V1 = i.V1, V4 = i.V4)][]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399987
2: Alex 1 50.00032 1.999987
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250000
5: Janine 3 88.00040 -3.147713
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA
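For comparison, here is a dplyr sketch of the same join-then-filter logic (my own addition, not from the original answers; it uses the question's formula with datasetA$V3 in the denominator and keeps all matches within 10 ppm, not just the first):
library(dplyr)
matches <- inner_join(datasetB, datasetA, by = "V2", suffix = c(".b", ".a")) %>%
  mutate(ppm = (V3.a - V3.b) / V3.a * 1e6) %>% # the question's formula
  filter(abs(ppm) < 10) %>%
  transmute(V2, V3 = V3.b, name = V1.a, ppm)
# join the matches back onto datasetB; unmatched rows keep NA
left_join(datasetB, matches, by = c("V2", "V3")) %>%
  transmute(V1 = name, V2, V3, V4 = ppm)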

Populate matrix by colname identity

I have many samples, each one of which has a corresponding abundance matrix. From these abundance matrices, I would like to create a large matrix that contains abundance information for each sample in rows.
For example, a single abundance matrix would look like:
A B C D
sample1 1 3 4 2
where A, B, C, and D represent colnames, and the abundances are the row values.
I would like to populate my larger matrix, which has as colnames all possible letters (A:Z) and all possible samples (sample1:sampleN) as rows, by matching the colname values.
For example:
A B C D E F G .... Z
sample1 1 3 4 2 NA NA NA ....
sample2 NA NA 2 5 7 NA NA ....
sample3 4 NA 6 9 2 NA 2 .....
....
sampleN
Different samples have a varying mix of abundances, in no guaranteed order.
When iteratively adding to this larger matrix, how could I ensure that the correct columns are populated by the right abundance values (ex. column "A" is only filled by values corresponding to abundances of "A" in different samples)? Thanks!
Starting data, changing just a little to highlight differences:
m1 <- as.matrix(read.table(header=TRUE, text="
A B C Z
sample1 1 3 4 2"))
m2 <- as.matrix(read.table(header=TRUE, text="
A B C D E F G
sample2 NA NA 2 5 7 NA NA
sample3 4 NA 6 9 2 NA 2"))
First, we need to make sure both matrices have the same column names:
newcols <- setdiff(colnames(m2), colnames(m1))
m1 <- cbind(m1, matrix(NA, nrow = nrow(m1), ncol = length(newcols),
                       dimnames = list(NULL, newcols)))
newcols <- setdiff(colnames(m1), colnames(m2))
m2 <- cbind(m2, matrix(NA, nrow = nrow(m2), ncol = length(newcols),
                       dimnames = list(NULL, newcols)))
m1
# A B C Z D E F G
# sample1 1 3 4 2 NA NA NA NA
m2
# A B C D E F G Z
# sample2 NA NA 2 5 7 NA NA NA
# sample3 4 NA 6 9 2 NA 2 NA
And now we combine them; regular rbind on matrices does not match columns by name, so we align the column order explicitly:
rbind(m2, m1[,colnames(m2),drop=FALSE])
# A B C D E F G Z
# sample2 NA NA 2 5 7 NA NA NA
# sample3 4 NA 6 9 2 NA 2 NA
# sample1 1 3 4 NA NA NA NA 2
You should be able to take advantage of matrix indexing, like so:
big[cbind(rownames(abun),colnames(abun))] <- abun
Using this example abundance matrix, and a big matrix to fill:
abun <- matrix(c(1,3,4,2),nrow=1,dimnames=list("sample1",LETTERS[1:4]))
big <- matrix(NA,nrow=5,ncol=26,dimnames=list(paste0("sample",1:5),LETTERS))
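Putting the two together (the printed result is my own annotation):
# the single rowname "sample1" is recycled across the four column names
big[cbind(rownames(abun), colnames(abun))] <- abun
big["sample1", 1:6]
#  A  B  C  D  E  F
#  1  3  4  2 NA NA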
Another solution, using reduce from the purrr package and union_all from the dplyr package:
library(purrr)
library(dplyr)
sample_names <- c("sample1","sample2","sample3")
Generating 3 random abundance dataframes:
num1 <- round(runif(runif(1, min = 1, max = 10), min = 1, max = 10)) # a random number of random values
df1 <- data.frame(t(num1))
colnames(df1) <- sample(LETTERS, length(num1))
num2 <- round(runif(runif(1, min = 1, max = 10), min = 1, max = 10))
df2 <- data.frame(t(num2))
colnames(df2) <- sample(LETTERS, length(num2))
num3 <- round(runif(runif(1, min = 1, max = 10), min = 1, max = 10))
df3 <- data.frame(t(num3))
colnames(df3) <- sample(LETTERS, length(num3))
This is actually the code that does all the magic:
A <- reduce(list(df1, df2, df3), union_all)
col_order <- sort(colnames(A), decreasing = FALSE)
A <- A[, col_order]
rownames(A) <- sample_names
Output:
> A
A C E F O P Q U W Y
sample1 9 NA NA NA 9 NA 5 6 NA NA
sample2 NA NA NA NA 5 4 NA NA 5 NA
sample3 NA 6 5 9 NA NA 3 NA 5 7
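In current dplyr, bind_rows() does this directly and fills missing columns with NA, so the reduce() step can be replaced by (a sketch, reusing df1, df2, df3 from above):
library(dplyr)
A <- bind_rows(df1, df2, df3) # missing columns are filled with NA
A <- A[, sort(colnames(A))]   # alphabetical column order
rownames(A) <- sample_names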

Issue with local variables in R custom function

I've got a dataset
>view(interval)
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 2 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
>dput(interval)
structure(list(V1 = c(NA, 2, 3, 4, NA),
               V2 = c(1, 2, NA, 2, 5),
               V3 = c(2, 3, 1, 2, 1),
               ID = 1:5),
          row.names = c(NA, -5L), class = "data.frame")
I would like to extract the previous non-NA value (or the next one, if the NA is in the first row) for every row, and store it as a local variable in a custom function, because I have to perform other operations on every row based on this value (which should change for every row I apply the function to).
I've written this function to print the local variables, but when I apply it the output is not what I want:
myFunction <- function(x) {
  position <- as.data.frame(which(is.na(interval), arr.ind = TRUE))
  tempVar <- ifelse(interval$ID == 1,
                    interval[position$row + 1, position$col],
                    interval[position$row - 1, position$col])
  return(tempVar)
}
I was expecting to get something like this:
# [1] 2 2 4
But I get something pretty messed up instead.
Here's attempt number 1:
dat <- read.table(header=TRUE, text='
V1 V2 V3 ID
NA 1 2 1
2 2 3 2
3 NA 1 3
4 2 2 4
NA 5 1 5')
myfunc1 <- function(x) {
  ind <- which(is.na(x), arr.ind = TRUE)
  # since it appears you want them in row-first sorted order
  ind <- ind[order(ind[, 1], ind[, 2]), ]
  # catch first-row NA
  ind[, 1] <- ifelse(ind[, 1] == 1L, 2L, ind[, 1] - 1L)
  x[ind]
}
myfunc1(dat)
# [1] 2 2 4
The problem with this is when there is a second "stacked" NA:
dat2 <- dat
dat2[2,1] <- NA
dat2
# V1 V2 V3 ID
# 1 NA 1 2 1
# 2 NA 2 3 2
# 3 3 NA 1 3
# 4 4 2 2 4
# 5 NA 5 1 5
myfunc1(dat2)
# [1] NA NA 2 4
One fix/safeguard against this is zoo::na.locf, which carries the last observation forward. Since the top row is a special case, we run it twice, the second time in reverse. This gives us the nearest non-NA value in the column (up or down, depending on position).
library(zoo)
myfunc2 <- function(x) {
  ind <- which(is.na(x), arr.ind = TRUE)
  # since it appears you want them in row-first sorted order
  ind <- ind[order(ind[, 1], ind[, 2]), ]
  # this guards against stacked NAs
  x <- apply(x, 2, zoo::na.locf, na.rm = FALSE)
  # this special case handles one or more NAs at the top of a column
  x <- apply(x, 2, zoo::na.locf, fromLast = TRUE, na.rm = FALSE)
  x[ind]
}
myfunc2(dat2)
# [1] 3 3 2 4
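As a quick sanity check (my own addition), the safeguarded version still reproduces the expected result on the original data:
myfunc2(dat)
# [1] 2 2 4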

Maximum value of one data.table column based on other columns

I have a R data.table
DT = data.table(x = rep(c("b", "a", NA_character_), each = 3),
                y = rep(c("A", NA_character_, "C"), each = 3),
                z = NA_character_,
                v = 1:9)
DT
# x y z v
#1: b A NA 1
#2: b A NA 2
#3: b A NA 3
#4: a NA NA 4
#5: a NA NA 5
#6: a NA NA 6
#7: NA C NA 7
#8: NA C NA 8
#9: NA C NA 9
For each column, I want to extract the max value of column v over the rows where that column is not NA. I am using
sapply(DT, function(x) { ifelse(all(is.na(x)), NA_integer_, max(DT[['v']][!is.na(x)])) })
#x y z v
#6 9 NA 9
Is there a simpler way to achieve this?
Here is a way, giving you -Inf (and a warning) if all values of the column are NA (you can later replace that by NA if you prefer):
DT[, lapply(.SD, function(x) max(v[!is.na(x)]))]
# x y z v
# 1: 6 9 -Inf 9
As suggested by #DavidArenburg, to ensure that everything goes well even when all values are NA (no warning and directly NA as result), you can do:
DT[, lapply(.SD, function(x) {
  temp <- v[!is.na(x)]
  if (!length(temp)) NA else max(temp)
})]
# x y z v
#1: 6 9 NA 9
We can use summarise_each from dplyr:
library(dplyr)
DT %>%
  summarise_each(funs(max(v[!is.na(.)])))
# x y z v
#1: 6 9 -Inf 9
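Note that summarise_each() is deprecated in current dplyr; here is an equivalent sketch with across() (my own update, assuming dplyr >= 1.0):
library(dplyr)
DT %>%
  summarise(across(everything(), ~ max(v[!is.na(.x)])))
# still gives -Inf (with a warning) for the all-NA column z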

R - Taking column maxes over specific subsets of a dataframe

I have a large set of data with various indices and such. I would like to change my data from something like this:
id time var1_t1 var1_t2 var1_t3 var2_t1 var2_t2 var2_t3
1 1 1.5 NA NA 3.3 NA NA
1 2 NA 2.5 NA NA 1.2 NA
1 3 NA NA 3.5 NA NA .7
to something like this:
id time var1_t1 var1_t2 var1_t3 var2_t1 var2_t2 var2_t3
1 1 1.5 2.5 3.5 3.3 1.2 .7
1 2 NA 2.5 NA NA 1.2 NA
1 3 NA NA 3.5 NA NA .7
or this:
id time var1_t1 var1_t2 var1_t3 var2_t1 var2_t2 var2_t3
1 1 1.5 2.5 3.5 3.3 1.2 .7
1 2 1.5 2.5 3.5 3.3 1.2 .7
1 3 1.5 2.5 3.5 3.3 1.2 .7
Except that there are rather more than six columns I need to do this for, and "id" has values other than 1.
I can do this for a single column as follows:
for (i in 1:max(df$id)) {
  df[df$id == i & df$time == 1, ]$var1_t1 <- max(df[df$id == i, ]$var1_t1,
                                                 na.rm = TRUE)
}
But that uses a for loop, so it is a terrible idea. And I would have to repeat that line for each column. Is there a way I can do this more elegantly?
If you want to replace all NA's with the column-wise max value by group of id, you could define a little custom function:
f <- function(x) {
  x[is.na(x)] <- max(x, na.rm = TRUE)
  x
}
And then use your favorite data manipulation functions/package, for example dplyr:
library(dplyr)
df %>% group_by(id) %>% mutate_each(funs(f))
Or data.table:
library(data.table)
setDT(df)[, lapply(.SD, f), by = id]
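mutate_each() is likewise deprecated in current dplyr; an across() sketch of the same idea (my own update, assuming dplyr >= 1.0):
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(across(everything(), f)) %>% # the grouping column id is skipped automatically
  ungroup()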
