Calculating cumulative sum of columns with loop - r

I have a data frame with gene expression data by lane (column). What I would like to do is write a loop that takes the row sums but progressively includes one more column each time, so that on each pass another column of sums is appended to the end of the data frame. In the example below I did this by hand using apply(), but that is very inefficient and not feasible for a large data set. I messed around with the cumsum() function but couldn't get it to work for this. It's very possible I missed something obvious, but any guidance would be great!
#Example dataframe
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
# Calculate cumulative sums
sum.2.3 <- apply(df[,2:3],1,sum)
sum.2.4 <- apply(df[,2:4],1,sum)
sum.2.5 <- apply(df[,2:5],1,sum)
df <- cbind(df, sum.2.3, sum.2.4, sum.2.5)

If the problem is just writing the loop, you can keep using apply() inside it:
Code
start_col <- 2
end_col <- ncol(df)
for (i in (start_col + 1):end_col) {
  var_name <- paste("sum", start_col, i, sep = ".")
  df[, var_name] <- apply(df[, start_col:i], 1, sum)
}
Output
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
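On a big expression matrix, rowSums() is usually much faster than apply(x, 1, sum) because it is vectorised; a drop-in variant of the same loop (a sketch, using the start_col and end_col defined above):
for (i in (start_col + 1):end_col) {
  var_name <- paste("sum", start_col, i, sep = ".")
  # rowSums() gives the same result as apply(..., 1, sum) without looping over rows in R
  df[, var_name] <- rowSums(df[, start_col:i])
}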

You can use Reduce()
Reduce(`+`, df[-1], accumulate = TRUE)[-1]
[[1]]
[1] 8 10 2
[[2]]
[1] 14 13 6
[[3]]
[1] 20 17 9
Assign into the data frame:
df[paste0("sum.2.", 3:5)] <- Reduce(`+`, df[-1], accumulate = TRUE)[-1]
Gives:
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9

No loop needed.
df <- data.frame(
c1 = c('G1', 'G2', 'G3'),
c2 = c(5, 3, 1),
c3 = c(3, 7, 1),
c4 = c(6, 3, 4),
c5 = c(6, 4, 3))
cbind(df, setNames(as.data.frame(t(apply(df[,-1], 1, cumsum))[,-1]), paste0("sum.2.", 3:5)))
#> c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
#> 1 G1 5 3 6 6 8 14 20
#> 2 G2 3 7 3 4 10 13 17
#> 3 G3 1 1 4 3 2 6 9
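The one-liner above is dense, so here are the same steps unpacked (a sketch using the same df and column names):
# row-wise cumulative sums over the expression columns; apply() returns them
# as columns, so transpose to get the genes back into rows
cs <- t(apply(df[, -1], 1, cumsum))
# drop the first column (it is just c2 itself) and name the remaining sums
cs <- cs[, -1, drop = FALSE]
colnames(cs) <- paste0("sum.2.", 3:5)
cbind(df, cs)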

Using rowCumsums from matrixStats
library(matrixStats)
df[paste0("sum.2.", 3:5)] <- rowCumsums(as.matrix(df[2:5]))[,-1]
Output
> df
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9

You can combine the mutate() function from the dplyr package with the base rowSums() function:
library(dplyr)
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
df <- df %>%
  dplyr::mutate(sum.2.3 = rowSums(across(c2:c3)),
                sum.2.4 = rowSums(across(c2:c4)),
                sum.2.5 = rowSums(across(c2:c5)))
Result
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9

Related

Collecting the lowest value of a column based on other columns in R

How do I collect the minimum value of a column for each level of another column, while the new data frame is grouped by the other columns?
I want to collect the minimum time for each number, grouped by A2 and A3.
(In my original data frame, each column has more levels.)
Here is a sample data set:
df <- structure(list(ID=c('a','a','a','a','b','b','b','b','c','c','c','c','d','d','d','d','e','e','e','e','f','f','f','f','g','g','g','g','h','h','h','h'),
A2=c('d1','d1','d1','d1','d1','d1','d1','d1','d2','d2','d2','d2','d2','d2','d2','d2','d1','d1','d1','d1','d1','d1','d1','d1','d2','d2','d2','d2','d2','d2','d2','d2'),
A3=c('g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2'),
number=c('1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2'),
time=c(23,345,123,4,434,76,245,34,135,98,45,678,32,134,76,578,32,145,256,79,311,356,67,12,689,467,98,456,23,45,23,34)),
class = "data.frame", row.names = c(NA,-32L))
The result would look like the following:
df.result<-structure(list(ID=c('a','a','b','b','c','c','d','d','e','e','f','f','g','g','h','h'),
A2=c('d1','d1','d1','d1','d2','d2','d2','d2','d1','d1','d1','d1','d2','d2','d2','d2'),
A3=c('g1','g1','g1','g1','g1','g1','g1','g1','g2','g2','g2','g2','g2','g2','g2','g2'),
number=c('1','2','1','2','1','2','1','2','1','2','1','2','1','2','1','2'),
time=c(23,4,76,34,98,45,32,76,32,79,311,12,467,98,23,23)),
class = "data.frame", row.names = c(NA,-16L))
Thanks
This seems like a job for aggregate(); update number first:
transform(dat, number=with(rle(number), rep.int(seq_along(values), lengths))) |>
aggregate(time ~ number + A2 + A3, FUN=min)
# number A2 A3 time
# 1 1 d1 g1 234
# 2 2 d1 g1 12
# 3 3 d1 g1 232
# 4 4 d1 g1 44
# 5 5 d1 g1 21
# 6 6 d1 g1 34
# 7 13 d2 g1 345
# 8 14 d2 g1 34
# 9 15 d2 g1 56
# 10 16 d2 g1 98
# 11 7 d1 g2 23
# 12 8 d1 g2 12
# 13 9 d1 g2 689
# 14 10 d1 g2 4
# 15 11 d1 g2 43
# 16 12 d1 g2 21
# 17 17 d2 g2 245
# 18 18 d2 g2 134
# 19 19 d2 g2 567
# 20 20 d2 g2 1
Try this:
library(data.table)
setDT(df)
df[, numberR := rleid(number)]
df[, min(time), by = .(A2, A3, numberR)]
This will match your exact expected output.
The tidyverse solution would be:
df %>%
  group_by(A2, A3, number) %>%
  slice_min(time, n = 1, with_ties = FALSE) %>%
  ungroup()
Thanks everyone for your answers:
This code works (based on the code @Karsten W. provided):
df.result <- aggregate(time ~ ID + number + A2 + A3, data = df, FUN = min)

Row by row computation

I have started learning R.
I need your help on how to do this computation in a loop:
C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9
I would like the output to be:
C1 C2 C3 C4(c2*c3)
A 5 10 50
A 4+50=54 6 54*6= 324
B 2 3 6
B 8+6=14 9 14*9 = 126
For each distinct value in column C1, the C4 value calculated for one row should be added to the C2 value of the next row.
One dplyr possibility could be:
df %>%
  group_by(C1) %>%
  mutate(C4 = (C2 + lag(C2 * C3, default = 0)) * C3,
         C2 = C2 + lag(C2 * C3, default = 0))
C1 C2 C3 C4
<chr> <dbl> <int> <dbl>
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126
Or using data.table (by @markus):
setDT(df)[, `:=`(C4 = (C2 + shift(C2 * C3, fill = 0)) * C3,
                 C2 = C2 + shift(C4, fill = 0)), by = C1]
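Note that these lag()/shift() versions only look one row back, so they reproduce the expected output here because each C1 group has exactly two rows. For groups with more rows the running value has to be carried forward; a sketch with purrr::accumulate2 (assuming the purrr package is available):
library(dplyr)
library(purrr)

df %>%
  group_by(C1) %>%
  mutate(
    # C4[i] = (C2[i] + C4[i-1]) * C3[i], starting from C4[0] = 0
    C4 = unlist(accumulate2(C2, C3, ~ (..1 + ..2) * ..3, .init = 0))[-1],
    # the updated C2 is the original C2 plus the previous row's C4
    C2 = C2 + lag(C4, default = 0)
  ) %>%
  ungroup()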
As a loop ;-):
d <- read.table(text = "C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9", header = TRUE)
my_calc <- function(d) {
  d[, 4] <- d[, 2] * d[, 3]
  for (i in seq_len(nrow(d) - 1)) {
    d[i+1, 2] <- d[i, 4] + d[i+1, 2]
    d[i+1, 4] <- d[i+1, 2] * d[i+1, 3]
  }
  d
}
do.call(rbind, lapply(split(d, d$C1), my_calc))
#C1 C2 C3 V4
#A.1 A 5 10 50
#A.2 A 54 6 324
#B.3 B 2 3 6
#B.4 B 14 9 126
This loop should do the trick:
df = data.frame(C1 = LETTERS[c(1,1,2,2)], C2 = c(5,4,2,8), C3 = c(10,6,3,9), stringsAsFactors = FALSE)
df$C4 = rep(0, nrow(df))
for (i in 1:nrow(df)) {
  if (i != 1) {
    if (df$C1[i] == df$C1[i-1]) {
      df$C2[i] = df$C2[i] + df$C4[i-1]
    }
  }
  df$C4[i] = df$C2[i] * df$C3[i]
}
C1 C2 C3 C4
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126

How to subtract a column from the other columns in a data frame

I have a data frame that consists of 1000 rows and 156 columns. I'm trying to subtract the first column from the first 38 columns, then subtract column 39 from the next 38, and so on, but I can't find a way to do it. I'm only using ncdf4 and nothing else. Something like this:
C1 C2 C3 C4 C5 C6 C7 C8
1 2 3 4 5 6 4 5
3 4 6 5 4 3 2 7
And I'd like it to be
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
The logic would be
First 38 columns - First column
Columns 39:77 - Column 39
and so on.
Solved it by simply doing
{
z[,1:38] <- z[,1:38]-z[,1]
z[,39:77] <-z[,39:77]-z[,39]
z[,78:118] <-z[,78:118]-z[,78]
z[,119:156] <-z[,119:156]-z[,119]
}
where z is the data frame. It might not be the nicest way, but it did the trick.
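If there are more blocks, the same idea can be written as a short loop over the block start columns instead of hard-coding one line per block; a sketch, taking the block boundaries from the assignments above:
# block start columns as used above; the last block runs to the final column
starts <- c(1, 39, 78, 119)
ends   <- c(starts[-1] - 1, ncol(z))
for (b in seq_along(starts)) {
  cols <- starts[b]:ends[b]
  # subtract the first column of the block from every column in the block
  z[, cols] <- z[, cols] - z[, starts[b]]
}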
You can also do the following without any loop:
# sample data frame
df <- data.frame(matrix(data = seq(1,316),ncol = 158))
# split the data frame into list of data frame having columns
# 1 to 38, 39 to 77 and so on
df <- split.default(df, gl(round(ncol(df)/38),k = 38))
# subtract the last column from each
df <- do.call(cbind, lapply(df, function(f) f - f[,ncol(f)]))
colnames(df) <- paste0('C', seq(1,158))
print(head(df))
C1 C2 C3 C4 C5
1 -74 -72 -70 -68 -66
2 -74 -72 -70 -68 -66
Here is a user-defined function; you can add else if statements as desired:
mydiff <- function(df) {
  mydiff <- df
  for (i in 1:ncol(df)) {
    if (i <= 38) {
      mydiff[, i] <- df[, i] - df[, 1]
    } else if (i %in% c(39:77)) {
      mydiff[, i] <- df[, i] - df[, 39]
    }
  }
  mydiff
}
mydiff(df1)
Output:
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
Benchmark:
system.time(result <- as.tibble(iris2) %>%
  select_if(is.numeric) %>%
  mydiff())
Result:
user system elapsed
0.02 0.00 0.01
You should consider using the tidyverse to solve this; loading a package into R adds little overhead to your environment and can make your life much easier.
library(tidyverse)
df %>%
  mutate_at(.vars = vars(num_range(prefix = 'C', 1:38)), .funs = function(x) x - .$C1) %>%
  mutate_at(.vars = vars(num_range(prefix = 'C', 39:77)), .funs = function(x) x - .$C39)
C1 C2 C3 C4 C38 C39 C40 C41 C42 C77
1 0 1 2 3 4 0 1 2 3 4
2 0 0 3 2 4 0 0 3 2 4
Data
df <-
data.frame(
C1 = c(1, 3),
C2 = c(2, 3),
C3 = c(3, 6),
C4 = c(4, 5),
C38 = c(5, 7),
C39 = c(1, 3),
C40 = c(2, 3),
C41 = c(3, 6),
C42 = c(4, 5),
C77 = c(5, 7)
)

When merging data in r, how do we check if a record id is in one dataset but not the other?

I have 4 datasets on which I would like to perform a full_join. For brevity, I will use two datasets, df1 and df2, here.
df1 <- data.frame(ID = c(1, 3, 4, 5), V1 = LETTERS[11:14], V2 = letters[17:20])
df2 <- data.frame(ID = c(1, 10, 4, 9, 13), X5 = paste0(LETTERS[14:17], 1:5), X16 = paste0(letters[17:20], 1:5, 6:10), X23 = 56:60)
I would like to know if a record appears in one dataset but not the other and vice versa. I included a column (an indicator) in each dataset before performing the join.
df1 <- df1 %>% mutate(in_df1 = 1) # 1 if record is inside df1
df2 <- df2 %>% mutate(in_df2 = 1) # 1 if record is inside df2
Then I perform a full join and replace the NAs in the in_df1 and in_df2 columns with 0.
df <- full_join(df1, df2, by = "ID") %>%
  mutate_at(vars(in_df1, in_df2), funs(coalesce(., 0))) %>%
  select(ID, V1, V2, X5, X16, X23, in_df1, in_df2)
This works as follows:
# df
# ID V1 V2 X5 X16 X23 in_df1 in_df2
# 1 1 K q N1 q16 56 1 1
# 2 3 L r <NA> <NA> NA 1 0
# 3 4 M s P3 s38 58 1 1
# 4 5 N t <NA> <NA> NA 1 0
# 5 10 <NA> <NA> O2 r27 57 0 1
# 6 9 <NA> <NA> Q4 t49 59 0 1
# 7 13 <NA> <NA> N5 q510 60 0 1
However, I would like to know of a nicer way to do this.
I don't know if it's nicer, but at least it's the way to do it in base R.
df <- merge(df1, df2, all = TRUE)
df$InDf1 <- ifelse(is.na(match(df$ID, df1$ID)),0,1)
df$InDf2 <- ifelse(is.na(match(df$ID, df2$ID)), 0, 1)
> df
ID V1 V2 X5 X16 X23 InDf1 InDf2
1 1 K q N1 q16 56 1 1
2 3 L r <NA> <NA> NA 1 0
3 4 M s P3 s38 58 1 1
4 5 N t <NA> <NA> NA 1 0
5 9 <NA> <NA> Q4 t49 59 0 1
6 10 <NA> <NA> O2 r27 57 0 1
7 13 <NA> <NA> N5 q510 60 0 1
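A slightly shorter way to build the same indicator columns is %in%, which returns a logical that can be coerced to 0/1; a sketch on the same merge:
df <- merge(df1, df2, all = TRUE)          # full (outer) join on the shared ID column
df$InDf1 <- as.integer(df$ID %in% df1$ID)  # 1 if the ID occurs in df1
df$InDf2 <- as.integer(df$ID %in% df2$ID)  # 1 if the ID occurs in df2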

Calling columns of another data frame, row by row

This is my first data frame:
df1 <- as.data.frame(matrix(rbinom(9*9, 1, 0.5), ncol=9, nrow =9))
colnames(df1) <- paste(rep(c("a","b","c"), each=3), rep(c(1,2,3), 3), sep = "")
set.seed(11)
This is my second data frame:
factor.1 <- paste(rep(c("a","b"), each=3), rep(c(1,2,3), 2), sep = "")
factor.2 <- rep(paste(rep("c", 3), c(1,2,3), sep = ""), 2)
df2 <- as.data.frame(cbind(factor.1,factor.2))
I want to calculate the sum of each df1 column named in factor.1 and put it into the second data frame. I use dplyr:
fun1 <- function(x){ sum(df1[, x]) }
df2 %>% mutate(value = fun1(factor.1))
But what I get is this:
factor.1 factor.2 value
1 a1 c1 22
2 a2 c2 22
3 a3 c3 22
4 b1 c1 22
5 b2 c2 22
6 b3 c3 22
But what I want is this:
factor.1 factor.2 value
1 a1 c1 4
2 a2 c2 4
3 a3 c3 4
4 b1 c1 1
5 b2 c2 4
6 b3 c3 5
Is this what you are looking for?
df2 %>% mutate(value = sapply(factor.1, fun1))
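That works because sapply() calls fun1() once per element of factor.1, whereas fun1(factor.1) selects all of the listed columns at once and sums everything into a single number (the repeated 22). An alternative sketch without the helper function, indexing the named vector returned by colSums() (as.character() guards against factor.1 being a factor):
library(dplyr)
# colSums(df1) is a named vector of per-column sums; index it by the column names in factor.1
df2 %>% mutate(value = colSums(df1)[as.character(factor.1)])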
