I have started learning R.
Need your help on how to do this computation in loop:
C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9
I would like the output as
C1 C2 C3 C4(c2*c3)
A 5 10 50
A 4+50=54 6 54*6= 324
B 2 3 6
B 8+6=14 9 14*9 = 126
For each distinct value in column C1 the value calculated at C4 should be added to the new row value at C2.
One dplyr possibility could be:
df %>%
group_by(C1) %>%
mutate(C4 = (C2 + lag(C2 * C3, default = 0)) * C3,
C2 = C2 + lag(C2 * C3, default = 0))
C1 C2 C3 C4
<chr> <dbl> <int> <dbl>
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126
Or using data.table (by #markus):
setDT(df)[, `:=`(C4 = (C2 + shift(C2 * C3, fill = 0)) * C3,
C2 = C2 + shift(C4, fill = 0)), by = C1]
As Loop ;-):
d <- read.table(text = "C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9", header = TRUE)
my_calc <- function(d) {
d[, 4] <- d[, 2] * d[, 3]
for (i in seq_len(nrow(d) - 1)) {
d[i+1, 2] <- d[i, 4] + d[i+1, 2]
d[i+1, 4] <- d[i+1, 2] * d[i+1, 3]
}
d
}
do.call(rbind, lapply(split(d, d$C1), my_calc))
#C1 C2 C3 V4
#A.1 A 5 10 50
#A.2 A 54 6 324
#B.3 B 2 3 6
#B.4 B 14 9 126
This loop should do the trick:
df = data.frame(C1=LETTERS[c(1,1,2,2)],C2=c(5,4,2,8),C3=c(10,6,3,9),stringsAsFactors = FALSE)
df$C4=rep(0,nrow(df))
for (i in 1:nrow(df)){
if (i!=1){
if (df$C1[i]==df$C1[i-1]){
df$C2[i]=df$C2[i]+df$C4[i-1]
}
}
df$C4[i] = df$C2[i]*df$C3[i]
}
C1 C2 C3 C4
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126
Related
I have a dataframe with gene expression data by lane (column). What I would like to do is write a loop that takes the sum of each row but progressively adds in another column each time. So each time I loop through I add another column to my dataframe that contains the sums of each row plus another column to the end of the dataframe. In the example below I did this using the apply() function by hand but this is very inefficient and not feasible for a large data set. I messed around with the cumsum() function but couldn't seem to get it to work for this. Very possible I missed something obvious but any guidance would be great!
#Example dataframe
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
#Cal cumulative sums
sum.2.3 <- apply(df[,2:3],1,sum)
sum.2.4 <- apply(df[,2:4],1,sum)
sum.2.5 <- apply(df[,2:5],1,sum)
df <- cbind(df, sum.2.3, sum.2.4, sum.2.5)
If the problem is the loop, you use apply inside it.
Code
start_col <- 2
end_col <- ncol(df)
for(i in (start_col+1):end_col){
var_name <- paste("sum",start_col,i,sep = ".")
df[,var_name] <- apply(df[,start_col:i],1,sum)
}
Output
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
You can use Reduce()
Reduce(`+`, df[-1], accumulate = TRUE)[-1]
[[1]]
[1] 8 10 2
[[2]]
[1] 14 13 6
[[3]]
[1] 20 17 9
Assign into the data frame:
df[paste0("sum.2.", 3:5)] <- Reduce(`+`, df[-1], accumulate = TRUE)[-1]
Gives:
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
No loop needed.
df <- data.frame(
c1 = c('G1', 'G2', 'G3'),
c2 = c(5, 3, 1),
c3 = c(3, 7, 1),
c4 = c(6, 3, 4),
c5 = c(6, 4, 3))
cbind(df, setNames(as.data.frame(t(apply(df[,-1], 1, cumsum))[,-1]), paste0("sum.2.", 3:5)))
#> c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
#> 1 G1 5 3 6 6 8 14 20
#> 2 G2 3 7 3 4 10 13 17
#> 3 G3 1 1 4 3 2 6 9
Using rowCumsums from matrixStats
library(matrixStats)
df[paste0("sum.2.", 3:5)] <- rowCumsums(as.matrix(df[2:5]))[,-1]
-output
> df
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
You can use both the mutate function from the dplyr package and the rowSums base function.
library(dplyr)
c1 <- c('G1', 'G2', 'G3')
c2 <- c(5, 3, 1)
c3 <- c(3, 7, 1)
c4 <- c(6, 3, 4)
c5 <- c(6, 4, 3)
df <- data.frame(c1, c2, c3, c4, c5)
df <- df %>%
dplyr::mutate(sum.2.3 = rowSums(across(c2:c3)),
sum.2.4 = rowSums(across(c2:c4)),
sum.2.5 = rowSums(across(c2:c5)))
Result
c1 c2 c3 c4 c5 sum.2.3 sum.2.4 sum.2.5
1 G1 5 3 6 6 8 14 20
2 G2 3 7 3 4 10 13 17
3 G3 1 1 4 3 2 6 9
I have a list (L) which some data frames have a value of -9999. How should I remove all rows in any data frame which contains -9999?
the result should be like R list.
L1 = data.frame(A1 = c(1:4) , B1 = c(1,2,-9999,3) , C1 = c(1:4))
L2 = data.frame(A2 = c(1:4) , B2 = c(1:4))
L3 = data.frame(A3 = c(1:4) , B3 = c(1,-9999,2,3))
L=list(L1,L2, L3)
R1 = data.frame(A1 = c(1,2,4) , B1 = c(1,2,3) , C1 = c(1,2,4))
R2 = data.frame(A2 = c(1:4) , B2 = c(1:4))
R3 = data.frame(A3 = c(1,3,4) , B3 = c(1,2,3))
R=list(R1,R2, R3)
library(dplyr)
L %>% purrr::map(~.x %>% filter(if_all(everything(), ~ .x != -9999)))
Result:
[[1]]
A1 B1 C1
1 1 1 1
2 2 2 2
3 4 3 4
[[2]]
A2 B2
1 1 1
2 2 2
3 3 3
4 4 4
[[3]]
A3 B3
1 1 1
2 3 2
3 4 3
You could use the following code with lapply that remove rows of every dataframe where we can use apply with any that removes every row containing -9999 like this:
L1 = data.frame(A1 = c(1:4) , B1 = c(1,2,-9999,3) , C1 = c(1:4))
L2 = data.frame(A2 = c(1:4) , B2 = c(1:4))
L3 = data.frame(A3 = c(1:4) , B3 = c(1,-9999,2,3))
L=list(L1,L2, L3)
lapply(L, function(x) {x <- x[!apply(x==-9999,1,any),]})
#> [[1]]
#> A1 B1 C1
#> 1 1 1 1
#> 2 2 2 2
#> 4 4 3 4
#>
#> [[2]]
#> A2 B2
#> 1 1 1
#> 2 2 2
#> 3 3 3
#> 4 4 4
#>
#> [[3]]
#> A3 B3
#> 1 1 1
#> 3 3 2
#> 4 4 3
Created on 2022-07-31 by the reprex package (v2.0.1)
A data frame is given and the objective is to calculate the direct dependency value between two columns of the data frame.
c1 c2 N
a b 30
a c 5
a d 10
c a 5
b a 10
what we are looking for is that to get the direct dependency relations, for example, for aand b this value is ab - ba = 20.
The final result should be like this:
c1 c2 N DepValue
a b 30 ab - ba = 20
a c 5 ac - ca = 0
a d 10 ad- 0 = 10
c a 5 ca - ac= 0
b a 10 ba - ab = 20
Thank you for your help.
D <- read.table(header=TRUE, stringsAsFactors = FALSE, text=
"c1 c2 N
a b 30
a c 5
a d 10
c a 5
b a 10")
N12 <- D$N
names(N12) <- paste0(D$c1, D$c2)
N21 <- N12[paste0(D$c2, D$c1)]
D$depValue <- D$N - ifelse(is.na(N21), 0, N21)
result:
> D
c1 c2 N depValue
1 a b 30 20
2 a c 5 0
3 a d 10 10
4 c a 5 0
5 b a 10 -20
One option is to create groups with pmin and pmax values of c1 and c2 and take difference between the two values. This will return NA for groups with only one value, we can replace those NAs to the first value in the group.
library(dplyr)
df %>%
group_by(group1 = pmin(c1, c2), group2 = pmax(c1, c2)) %>%
mutate(dep = N[1] - N[2],
dep = replace(dep, is.na(dep), N[1])) %>%
ungroup() %>%
select(-group1, -group2)
# c1 c2 N dep
# <chr> <chr> <int> <int>
#1 a b 30 20
#2 a c 5 0
#3 a d 10 10
#4 c a 5 0
#5 b a 10 20
An idea via base R is to sort columns c1 and c2, split based on those values and subtract N, i.e.
i1 <- paste(pmin(df$c1, df$c2), pmax(df$c1, df$c2))
i1
#[1] "a b" "a c" "a d" "a c" "a b"
do.call(rbind, lapply(split(df, i1), function(i) {i['DepValue'] <- Reduce(`-`, i$N); i}))
# c1 c2 N DepValue
#a b.1 a b 30 20
#a b.5 b a 10 20
#a c.2 a c 5 0
#a c.4 c a 5 0
#a d a d 10 10
I have a data frame that consist of 1000 rows and 156 columns. I'm trying to subtract the first column to the next 38 columns, then subtract column 39 to the next 38, and so, but I can't find a way to do it. I'm only using ncdf4 and nothing else. Something like this
C1 C2 C3 C4 C5 C6 C7 C8
1 2 3 4 5 6 4 5
3 4 6 5 4 3 2 7
And I'd like it to be
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
The logic would be
First 38 columns - First column
Columns 39:77 - Column 39
and so on.
Solved it by simply doing
{
z[,1:38] <- z[,1:38]-z[,1]
z[,39:77] <-z[,39:77]-z[,39]
z[,78:118] <-z[,78:118]-z[,78]
z[,119:156] <-z[,119:156]-z[,119]
}
Where z is the dataframe. Might not be the nicest way but it did the trick
You can also do the following without any loop:
# sample data frame
df <- data.frame(matrix(data = seq(1,316),ncol = 158))
# split the data frame into list of data frame having columns
# 1 to 38, 39 to 77 and so on
df <- split.default(df, gl(round(ncol(df)/38),k = 38))
# subtract the last column from each
df <- do.call(cbind, lapply(df, function(f) f - f[,ncol(f)]))
colnames(df) <- paste0('C', seq(1,158))
print(head(df))
C1 C2 C3 C4 C5
1 -74 -72 -70 -68 -66
2 -74 -72 -70 -68 -66
Here is a user defined function: You can add else if statements as desired.
mydiff<-function(df){
mydiff<-df
for(i in 1:ncol(df)){
if(i<=38){
mydiff[,i]<-df[,i]-df[,1]
}
else if(i%in%c(39:77)){
mydiff[,i]<-df[,i]-df[,39]
}
}
mydiff
}
mydiff(df1)
Output:
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
Benchmark:
system.time(result<-as.tibble(iris2) %>%
select_if(is.numeric) %>%
mydiff())
Result:
user system elapsed
0.02 0.00 0.01
You should consider using tidyverse to solve this, loading a package into R does little to the overhead of your environment and can make your life much easier.
library(tidyverse)
> df %>%
mutate_at(.vars = vars(num_range(prefix = 'C', 1:38)), .funs = function(x) x - .$C1) %>%
mutate_at(.vars = vars(num_range(prefix = 'C', 39:77)), .funs = function(x) x - .$C39)
C1 C2 C3 C4 C38 C39 C40 C41 C42 C77
1 0 1 2 3 4 0 1 2 3 4
2 0 0 3 2 4 0 0 3 2 4
Data
df <-
data.frame(
C1 = c(1, 3),
C2 = c(2, 3),
C3 = c(3, 6),
C4 = c(4, 5),
C38 = c(5, 7),
C39 = c(1, 3),
C40 = c(2, 3),
C41 = c(3, 6),
C42 = c(4, 5),
C77 = c(5, 7)
)
I have one table containing data which looks like this
Samp depth value
A1 0 2
A1 1 4
A1 2 3
A1 3 6
A1 4 8
A1 5 6
A1 6 2
A1 7 3
A2 0 2
A2 1 8
A2 2 6
A2 3 3
A2 4 6
A2 5 6
A3 0 7
A3 1 3
A3 2 2
A3 3 8
A3 4 3
...
Second table with intervals
Samp d_top d_bot
A1 0 2
A2 0 5
A3 1 2
A4 3 5
...
Now I would like to query the first table, using intervals from the second table.
Samp d_int sum_value
A1 0-2 9
A2 0-5 29
A3 1-2 5
...
It should work with aggregate or ddply, by specifying a list, which I tried. The problem is that it is not fixed intervals, but as defined in the second table. Any help is appreciated.
This 'group by other table' can be expressed in SQL with the help of the sqldf package.
Your table one is x in the code below and the range-table is y. The trick is the double join conditions. The first in the join clause (on Samp) and one (non-equi join) implicit in the where conditions.
library(sqldf)
sqldf('
SELECT
x.Samp,
y.d_top || "-" || y.d_bot as d_int,
sum(x.value) as sum_value
FROM x
JOIN y on y.Samp = x.Samp
WHERE
y.d_top <= x.depth and
x.depth <= y.d_bot
GROUP BY
y.d_top, y.d_bot, x.Samp
')
Which yields
Samp d_int sum_value
1 A1 0-2 9
2 A2 0-5 31
3 A3 1-2 5
Here's one approach:
do.call(rbind, by(dat1, dat1$Samp, function(x) {
Samp <- as.character(x$Samp[1])
idx <- Samp == as.character(dat2$Samp)
sequ <- seq(dat2$d_top[idx], dat2$d_bot[idx])
idx2 <- x$depth %in% sequ
data.frame(Samp, d_int = paste(range(sequ), collapse = "-"),
sum_value = sum(x$value[idx2]))
}))
where dat1 is your larger data frame and dat2 your shorter data frame.
This returns:
Samp d_int sum_value
A1 A1 0-2 9
A2 A2 0-5 31
A3 A3 1-2 5
Another idea:
f = function(samp, dt, db) {
inds = DF1$Samp == samp
sum(DF1[inds,'value'][DF1[inds,'depth'] %in% `:`(dt, db)])
} #DF1 and DF2 are your large and small dataframes, respectively
data.frame(Samp = DF2$Samp,
d_int = paste(DF2$d_top, DF2$d_bot, sep = " - "),
sum_value = mapply(f, DF2$Samp, DF2$d_top, DF2$d_bot, USE.NAMES = F))
# Samp d_int sum_value
#1 A1 0 - 2 9
#2 A2 0 - 5 31
#3 A3 1 - 2 5
A benchmarking:
set.seed(11)
DF1 = data.frame(Samp = rep(letters, each = 20),
depth = sample(1:10, 26*20, T),
value = runif(26*20),
stringsAsFactors = F)
set.seed(11)
DF2 = data.frame(Samp = letters,
d_top = sample(1:5, 26, T),
d_bot = sample(3:10, 26, T),
stringsAsFactors = F)
dat1 = DF1; dat2 = DF2; x = DF1; y = DF2
#> head(alex())
# Samp d_int sum_value
#1 a 2 - 6 5.127813
#2 b 1 - 3 4.043807
#3 c 3 - 4 3.356880
#4 d 1 - 6 9.209616
#5 e 1 - 7 7.452329
#6 f 5 - 5 2.241515
#> head(sven())
# Samp d_int sum_value
#a a 2-6 5.127813
#b b 1-3 4.043807
#c c 3-4 3.356880
#d d 1-6 9.209616
#e e 1-7 7.452329
#f f 5-5 2.241515
#> head(rick()[order(rick()[,1]),])
# Samp d_int sum_value
#10 a 2-6 5.127813
#1 b 1-3 4.043807
#16 c 3-4 3.356880
#4 d 1-6 9.209616
#6 e 1-7 7.452329
#22 f 5-5 2.241515
#> microbenchmark(alex(), sven(), rick())
#Unit: milliseconds
# expr min lq median uq max neval
# alex() 3.10070 3.230853 3.306196 3.461753 4.269292 100
# sven() 24.33163 25.525797 26.184391 26.868042 63.197223 100
# rick() 17.89463 18.622127 19.182584 19.820124 23.278920 100