Iterate over a subset of column names - r

I am new to R but here I have a dataframe of multiple measurements of a couple of conditions, I would like to perform a nested loop over the columns of the same condition, test if they have two true measurements (not zero) at least, if so calculate the mean of these specific conditions in a new dataset.
> sample <- list(c(8,0,12,5,0,11), c(15,5,0,10,12,13), c(1,1,0,3,0,9),
c(11,9,8,0,4,7), c(12,5,5,0,9,0), c(1,7,2,0,8,0))
> sample <- as.data.frame(sample)
> colnames(sample) <- c("x.1","x.2","x.3","y.1","y.2","y.3")
> sample
x.1 x.2 x.3 y.1 y.2 y.3
1 8 15 1 11 12 1
2 0 5 1 9 5 7
3 12 0 0 8 5 2
4 5 10 3 0 0 0
5 0 12 0 4 9 8
6 11 13 9 7 0 0
My output dataset should ideally look like this:
> Newsample
x y
1 8 8
2 2 7
3 0 5
4 6 0
5 0 7
6 11 0

We define f_rowmean function:
f_rowmean <- function(y) apply(y,1, function(x) ifelse(sum(x!=0)>=2, mean(x), 0))
And then:
data.frame(x=f_rowmean(sample[,grep("x", names(sample))]),
y=f_rowmean(sample[,grep("y", names(sample))]))
# x y
# 1 8 8
# 2 2 7
# 3 0 5
# 4 6 0
# 5 0 7
# 6 11 0
EDIT
As for OP's new problem statement (in comments), suppose your data set is in df1, then you could do:
res.cols <- c("CAOV-3 Reg", "CAOV-3 Mod", "OVCAR-3Reg", "OVCAR-4Reg", "VOA1056Reg",
"VOA4698Reg", "VOA4698Mod", "TOV112DReg", "TOV112DMod", "TOV21G Mod",
"HCC38 Reg", "HCC38 Mod")
res <- setNames(data.frame(matrix(0,nrow(df1),length(res.cols))), res.cols)
res <- sapply(res.cols, function(x) res[,x] <- f_rowmean(df1[,grep(x, names(df1))]))

We loop through the index of 'x' and 'y' columns in a list, get the rowSums of logical matrix and use ifelse to get the rowMeans
data.frame(setNames(lapply(list(grep("^x", names(sample)),
grep("^y", names(sample))), function(i) {
x1 <- sample[i]
ifelse(rowSums(x1!=0)>1, rowMeans(x1), 0)}), c("x", "y")))
# x y
#1 8 8
#2 2 7
#3 0 5
#4 6 0
#5 0 7
#6 11 0

Related

R joining on counts of elements in a vector to matching index in a dataframe

I have a dataframe df that looks like this:
indx
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
df<-structure(list(indx = 1:10), row.names = c(NA, 10L), class = "data.frame")
And a vector vec that looks like this:
vec<-c(3,5,9,9,5,4,3,3)
I would like to find the counts of each element in vec e.g. by doing this:
vec_counts<-table(vec)
3 4 5 9
3 1 2 2
And then join these counts into the matching indx in df. The final result should be:
indx vec_counts
1 1 0
2 2 0
3 3 3
4 4 1
5 5 2
6 6 0
7 7 0
8 8 0
9 9 2
10 10 0
You can use merge.
x <- merge(df, as.data.frame(table(vec)), by.x = "indx", by.y = "vec", all.x = TRUE)
names(x) <- c("indx", "vec_counts")
x$vec_counts[is.na(x$vec_counts)] <- 0
x
# indx vec_counts
# 1 1 0
# 2 2 0
# 3 3 3
# 4 4 1
# 5 5 2
# 6 6 0
# 7 7 0
# 8 8 0
# 9 9 2
# 10 10 0
Also you can use outer() to calculate it in one quick step without table().
df$vec_counts <- rowSums(outer(df$indx, vec, `==`))
# or with dplyr
library(dplyr)
df %>%
mutate(vec_counts = rowSums(outer(indx, vec, `==`)))
A possible solution, based on tidyverse:
library(tidyverse)
df %>%
mutate(count = map_dbl(indx, ~ sum(.x == vec)))
#> indx count
#> 1 1 0
#> 2 2 0
#> 3 3 3
#> 4 4 1
#> 5 5 2
#> 6 6 0
#> 7 7 0
#> 8 8 0
#> 9 9 2
#> 10 10 0
Or in base R:
df$count <- apply(df, 1, \(x) sum(x[1] == vec))
Or even, still in base R, using sapply:
df$count <- sapply(df$indx, \(x) sum(x == vec))

How select and remove rows based on position for a specific range in R

Suppose I have two data frames like this:
df1 <- data.frame(a = c(1,2,4,0,0),
b = c(0,3,5,5,0),
c = c(0,0,6,7,6))
df2 <- data.frame(a = c(3,6,8,0,0),
b = c(0,9,10,4,0),
c = c(0,0,1,4,9))
And then I joint it, like
df3 <- full_join(df1, df2)
print(df3)
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 0 5 7
5 0 0 6
6 3 0 0
7 6 9 0
8 8 10 1
9 0 4 4
10 0 0 9
Note that I have always the same pattern, with zeros in rows 1 and 2; and in rows 9 and 10. And I also have zeros between rows 4 and 7.
I want to remove, only, the zeros between rows 4 and 7.
So, I can solve it, like:
df3[4,1] <- NA
df3[5,1] <- NA
df3[5,2] <- NA
df3[6,2] <- NA
df3[6,3] <- NA
df3[7,3] <- NA
new.df3 <- as.data.frame(lapply(df3, na.omit))
print(new.df3)
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 3 5 7
5 6 9 6
6 8 10 1
7 0 4 4
8 0 0 9
But it is not elegant and very time-consuming.
Any thoughts? I really appreciate it, thanks in advance.
Best!
df3 %>%
mutate(rn = between(row_number(), 4, 7)) %>%
summarise(across(-rn, ~.x[!(.x == 0 & rn)]))
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 3 5 7
5 6 9 6
6 8 10 1
7 0 4 4
8 0 0 9
First, you find which one is zero between rows 4 and 7.
to_remove <- apply(df3[4:7, ], 1, function(x) which(x == 0))
Then, you substitute them by NAs.
for(i in seq(length(to_remove))){
df3[as.numeric(names(to_remove))[i], to_remove[[i]]] <- NA
}
And, finally, drop them.
new.df3 <- as.data.frame(lapply(df3, na.omit))
print(new.df3)
Here's a different approach:
mask <- !(seq(nrow(df3)) %in% 4:7 & df3 == 0)
df.lst <- lapply(1:3, function(x) df3[mask[, x], x])
sapply(df.lst, length)
# [1] 8 8 8 # Check to make sure the columns are the same length
names(df.lst) <- colnames(df3)
(new.df3 <- as.data.frame(df.lst))
# a b c
# 1 1 0 0
# 2 2 3 0
# 3 4 5 6
# 4 3 5 7
# 5 6 9 6
# 6 8 10 1
# 7 0 4 4
# 8 0 0 9

using column numbers for grouping in data table rather than names in R

I have code that needs to be flexible, and I cannot hard code in column names when I do grouping. As such, I want to hard code column numbers to do grouping, since these are easy to specify over range changes. (Column 1 through X or so, rather than using the names of cols 1,2,..X)
Example data set:
set.seed(007)
DF <- data.frame(X=1:20, Y=sample(c(0,1), 20, TRUE), Z=sample(0:5, 20, TRUE), Q =sample(0:5, 20, TRUE))
DF
X Y Z Q
1 1 1 3 4
2 2 0 1 2
3 3 0 5 4
4 4 0 5 2
5 5 0 5 5
6 6 1 0 1
7 7 0 3 0
8 8 1 2 4
9 9 0 5 5
10 10 0 2 5
11 11 0 4 3
12 12 0 1 4
13 13 1 1 4
14 14 0 1 3
15 15 0 2 4
16 16 0 5 2
17 17 1 2 0
18 18 0 4 1
19 19 1 5 2
20 20 0 2 1
A grouping (by Z and Q) that finds the X that maximizes Y, and returns both:
DF =data.table(DF)
DF[, list(Y=max(Y),X=X[which.max(Y)]), by=list(Z, Q)]
Result:
DF[, list(Y=max(Y),X=X[which.max(Y)]), by=list(Z, Q)]
Z Q Y X
1: 3 4 1 1
2: 1 2 0 2
3: 5 4 0 3
4: 5 2 1 19
5: 5 5 0 5
6: 0 1 1 6
7: 3 0 0 7
8: 2 4 1 8
9: 2 5 0 10
10: 4 3 0 11
11: 1 4 1 13
12: 1 3 0 14
13: 2 0 1 17
14: 4 1 0 18
15: 2 1 0 20
I want to do this purely using column numbers, because of the nature of my code. Additionally, If there were another column, I would potentially want to group by that extra column. And I would also want to potentially return another argmax in the first part.
Maybe just pick off names(DF) with column numbers, combined with eval(parse(...))?
useColNums <- function(data, a, b) {
n <- names(data)
y <- n[a[1]]
x <- n[a[2]]
groupby <- sprintf("list(%s)", paste(n[b], collapse=","))
argmax <- sprintf("list(%1$s=max(%1$s),%2$s=%2$s[which.max(%1$s)])", y, x)
data[, eval(parse(text=argmax)), by=eval(parse(text=groupby))]
}
x <- useColNums(DF, 2:1, 3:4)
y <- DF[, list(Y=max(Y),X=X[which.max(Y)]), by=list(Z, Q)]
identical(x, y)
# [1] TRUE
Did you find an answer that works for you? Something like this is possible, but it is not pretty, which may mean it is hard to maintain:
DF[, list(Y=max(eval(as.symbol(colnames(DF)[2]))),
X=eval(as.symbol(colnames(DF)[1]))[which.max(eval(as.symbol(colnames(DF)[2])))]),
by=list(Z=eval(as.symbol(colnames(DF)[3])),
Q=eval(as.symbol(colnames(DF)[4])))]
Now you could put those as.symbol(colnames()) into a function and make this easier to read:
cn <- function( dt, col ) { as.symbol(colnames(dt)[col]) }
DF[, list(Y=max(eval(cn(DF,2))),
X=eval(cn(DF,1))[which.max(eval(cn(DF,2)))]),
by=list(Z=eval(cn(DF,3)), Q=eval(cn(DF,4)))]
Does this solve that problem of grouping by column numbers for you?
You could use a combination of grep with your code:
> set.seed(007)
> DF <- data.frame(X=1:20, Y=sample(c(0,1), 20, TRUE), Z=sample(0:5, 20, TRUE), Q =sample(0:5, 20, TRUE))
> DF = data.table(DF)
> coly <- na
> DF[, list(Y=max(Y),X=X[which.max(Y)]), by=c(col1 <- names(DF)[grep("Q", colnames(DF))], names(DF)[grep("Z", colnames(DF))])]
Q Z Y X
1: 4 3 1 1
2: 2 1 0 2
3: 4 5 0 3
4: 2 5 1 19
5: 5 5 0 5
6: 1 0 1 6
7: 0 3 0 7
8: 4 2 1 8
9: 5 2 0 10
10: 3 4 0 11
11: 4 1 1 13
12: 3 1 0 14
13: 0 2 1 17
14: 1 4 0 18
15: 1 2 0 20

Cumulative Sum Starting at Center of Data Frame - R

I have this data.frame called dum
dummy <- data.frame(label = "a", x = c(1,1,1,1,0,1,1,1,1,1,1,1,1))
dummy1 <- data.frame(label = "b", x = c(1,1,1,1,1,1,1,1,0,1,1,1,1))
dum <- rbind(dummy,dummy1)
What I am trying to do is take the cumulative sum starting at 0 in the x column of dum. The summing would be grouped by the label column, which can be implemented in dplyr or plyr. The part that I am struggling with is how to start the cumulative sum from the 0 position in x and go outward.
The resulting data.frame should look like this :
>dum
label x output
1 a 1 4
2 a 1 3
3 a 1 2
4 a 1 1
5 a 0 0
6 a 1 1
7 a 1 2
8 a 1 3
9 a 1 4
10 a 1 5
11 a 1 6
12 a 1 7
13 a 1 8
14 b 1 8
15 b 1 7
16 b 1 6
17 b 1 5
18 b 1 4
19 b 1 3
20 b 1 2
21 b 1 1
22 b 0 0
23 b 1 1
24 b 1 2
25 b 1 3
26 b 1 4
This would need to be iterated thousands of times over millions of rows of data.
As usual, thanks for any and all help
It seems more like you just want to find the distance to a zero, rather than any sort of cumulative sum. If that's the case, then
#find zeros for each group
zeros <- tapply(seq.int(nrow(dum)) * as.numeric(dum$x==0), dum$label, max)
#calculate distance from zero for each point
dist <- abs(zeros[dum$label]-seq.int(nrow(dum)))
And that gives
cbind(dum, dist)
# label x dist
# 1 a 1 4
# 2 a 1 3
# 3 a 1 2
# 4 a 1 1
# 5 a 0 0
# 6 a 1 1
# 7 a 1 2
# 8 a 1 3
# 9 a 1 4
# 10 a 1 5
# 11 a 1 6
# 12 a 1 7
# 13 a 1 8
# 14 b 1 8
# 15 b 1 7
# 16 b 1 6
# 17 b 1 5
# 18 b 1 4
# 19 b 1 3
# 20 b 1 2
# 21 b 1 1
# 22 b 0 0
# 23 b 1 1
# 24 b 1 2
# 25 b 1 3
# 26 b 1 4
Or even ave will let you do it in one step
dist <- with(dum, ave(x,label,FUN=function(x) abs(seq_along(x)-which.min(x))))
cbind(dum, dist)
You can do this with by but also with plyr, data.table, etc. The function that is used on each subset is
f <- function(d) {
x <- d$x
i <- match(0, x)
v1 <- rev(cumsum(rev(x[1:i])))
v2 <- cumsum(x[(i+1):length(x)])
transform(d, output = c(v1, v2))
}
To call it on each subset e.g. with by
res <- by(dum, list(dum$label), f)
do.call(rbind, res)
If you want to use ddply
library(plyr)
ddply(dum, .(label), f)
May be faster with data.table
library(data.table)
dumdt <- as.data.table(dum)
setkey(dumdt, label)
dumdt[, f(.SD), by = key(dumdt)]
Using dplyr
library(dplyr)
dum%>%
group_by(label)%>%
mutate(dist=abs(row_number()-which.min(x)))

subtract first value from each subset of dataframe

I want to subtract the smallest value in each subset of a data frame from each value in that subset i.e.
A <- c(1,3,5,6,4,5,6,7,10)
B <- rep(1:4, length.out=length(A))
df <- data.frame(A, B)
df <- df[order(B),]
Subtracting would give me:
A B
1 0 1
2 3 1
3 9 1
4 0 2
5 2 2
6 0 3
7 1 3
8 0 4
9 1 4
I think the output you show is not correct. In any case, from what you explain, I think this is what you want. This uses ave base function:
within(df, { A <- ave(A, B, FUN=function(x) x-min(x))})
A B
1 0 1
5 3 1
9 9 1
2 0 2
6 2 2
3 0 3
7 1 3
4 0 4
8 1 4
Of course there are other alternatives such as plyr and data.table.
Echoing Arun's comment above, I think your expected output might be off. In any event, you should be able to use can use tapply to calculate subsets and then use match to line those subsets up with the original values:
subs <- tapply(df$A, df$B, min)
df$A <- df$A - subs[match(df$B, names(subs))]
df
A B
1 0 1
5 3 1
9 9 1
2 0 2
6 2 2
3 0 3
7 1 3
4 0 4
8 1 4

Resources