How do I collect the minimum value of a column for each level of another column, while the new data frame is grouped by the other columns?
Here is a sample data set:
I want to collect the minimum time for each number, grouped by A2 and A3.
(In my original data frame, each column has more levels)
df <- structure(list(ID=c('a','a','a','a','b','b','b','b','c','c','c','c','d','d','d','d','e','e','e','e','f','f','f','f','g','g','g','g','h','h','h','h'),
A2=c('d1','d1','d1','d1','d1','d1','d1','d1','d2','d2','d2','d2','d2','d2','d2','d2','d1','d1','d1','d1','d1','d1','d1','d1','d2','d2','d2','d2','d2','d2','d2','d2'),
A3=c('g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g1','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2','g2'),
number=c('1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2','1','1','2','2'),
time=c(23,345,123,4,434,76,245,34,135,98,45,678,32,134,76,578,32,145,256,79,311,356,67,12,689,467,98,456,23,45,23,34)),
class = "data.frame", row.names = c(NA,-32L))
the result would look like the following:
df.result<-structure(list(ID=c('a','a','b','b','c','c','d','d','e','e','f','f','g','g','h','h'),
A2=c('d1','d1','d1','d1','d2','d2','d2','d2','d1','d1','d1','d1','d2','d2','d2','d2'),
A3=c('g1','g1','g1','g1','g1','g1','g1','g1','g2','g2','g2','g2','g2','g2','g2','g2'),
number=c('1','2','1','2','1','2','1','2','1','2','1','2','1','2','1','2'),
time=c(23,4,76,34,98,45,32,76,32,79,311,12,467,98,23,23)),
class = "data.frame", row.names = c(NA,-16L))
Thanks
This seems like a job for aggregate, update number first:
transform(dat, number=with(rle(number), rep.int(seq_along(values), lengths))) |>
aggregate(time ~ number + A2 + A3, FUN=min)
# number A2 A3 time
# 1 1 d1 g1 234
# 2 2 d1 g1 12
# 3 3 d1 g1 232
# 4 4 d1 g1 44
# 5 5 d1 g1 21
# 6 6 d1 g1 34
# 7 13 d2 g1 345
# 8 14 d2 g1 34
# 9 15 d2 g1 56
# 10 16 d2 g1 98
# 11 7 d1 g2 23
# 12 8 d1 g2 12
# 13 9 d1 g2 689
# 14 10 d1 g2 4
# 15 11 d1 g2 43
# 16 12 d1 g2 21
# 17 17 d2 g2 245
# 18 18 d2 g2 134
# 19 19 d2 g2 567
# 20 20 d2 g2 1
Try this:
library(data.table)
setDT(data)
data[, numberR := rleid(number)]
data[, min(time), by = .(A2, A3, numberR)]
This will match your exact expected output.
The tidyverse solution would be:
df %>%
group_by(A2, A3, number) %>%
slice_min(time, n=1, with_ties = FALSE) %>%
ungroup()
Thanks everyone for your answers:
this code workes (based on the code #Karsten W. provided)
df.result <- aggregate(df, time ~ ID + number + A2 + A3, FUN=min)
I have started learning R.
Need your help on how to do this computation in loop:
C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9
I would like the output as
C1 C2 C3 C4(c2*c3)
A 5 10 50
A 4+50=54 6 54*6= 324
B 2 3 6
B 8+6=14 9 14*9 = 126
For each distinct value in column C1 the value calculated at C4 should be added to the new row value at C2.
One dplyr possibility could be:
df %>%
group_by(C1) %>%
mutate(C4 = (C2 + lag(C2 * C3, default = 0)) * C3,
C2 = C2 + lag(C2 * C3, default = 0))
C1 C2 C3 C4
<chr> <dbl> <int> <dbl>
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126
Or using data.table (by #markus):
setDT(df)[, `:=`(C4 = (C2 + shift(C2 * C3, fill = 0)) * C3,
C2 = C2 + shift(C4, fill = 0)), by = C1]
As Loop ;-):
d <- read.table(text = "C1 C2 C3
A 5 10
A 4 6
B 2 3
B 8 9", header = TRUE)
my_calc <- function(d) {
d[, 4] <- d[, 2] * d[, 3]
for (i in seq_len(nrow(d) - 1)) {
d[i+1, 2] <- d[i, 4] + d[i+1, 2]
d[i+1, 4] <- d[i+1, 2] * d[i+1, 3]
}
d
}
do.call(rbind, lapply(split(d, d$C1), my_calc))
#C1 C2 C3 V4
#A.1 A 5 10 50
#A.2 A 54 6 324
#B.3 B 2 3 6
#B.4 B 14 9 126
This loop should do the trick:
df = data.frame(C1=LETTERS[c(1,1,2,2)],C2=c(5,4,2,8),C3=c(10,6,3,9),stringsAsFactors = FALSE)
df$C4=rep(0,nrow(df))
for (i in 1:nrow(df)){
if (i!=1){
if (df$C1[i]==df$C1[i-1]){
df$C2[i]=df$C2[i]+df$C4[i-1]
}
}
df$C4[i] = df$C2[i]*df$C3[i]
}
C1 C2 C3 C4
1 A 5 10 50
2 A 54 6 324
3 B 2 3 6
4 B 14 9 126
I have a data frame that consist of 1000 rows and 156 columns. I'm trying to subtract the first column to the next 38 columns, then subtract column 39 to the next 38, and so, but I can't find a way to do it. I'm only using ncdf4 and nothing else. Something like this
C1 C2 C3 C4 C5 C6 C7 C8
1 2 3 4 5 6 4 5
3 4 6 5 4 3 2 7
And I'd like it to be
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
The logic would be
First 38 columns - First column
Columns 39:77 - Column 39
and so on.
Solved it by simply doing
{
z[,1:38] <- z[,1:38]-z[,1]
z[,39:77] <-z[,39:77]-z[,39]
z[,78:118] <-z[,78:118]-z[,78]
z[,119:156] <-z[,119:156]-z[,119]
}
Where z is the dataframe. Might not be the nicest way but it did the trick
You can also do the following without any loop:
# sample data frame
df <- data.frame(matrix(data = seq(1,316),ncol = 158))
# split the data frame into list of data frame having columns
# 1 to 38, 39 to 77 and so on
df <- split.default(df, gl(round(ncol(df)/38),k = 38))
# subtract the last column from each
df <- do.call(cbind, lapply(df, function(f) f - f[,ncol(f)]))
colnames(df) <- paste0('C', seq(1,158))
print(head(df))
C1 C2 C3 C4 C5
1 -74 -72 -70 -68 -66
2 -74 -72 -70 -68 -66
Here is a user defined function: You can add else if statements as desired.
mydiff<-function(df){
mydiff<-df
for(i in 1:ncol(df)){
if(i<=38){
mydiff[,i]<-df[,i]-df[,1]
}
else if(i%in%c(39:77)){
mydiff[,i]<-df[,i]-df[,39]
}
}
mydiff
}
mydiff(df1)
Output:
C1 C2 C3 C4 C5 C6 C7 C8
0 1 2 3 4 5 3 4
0 1 3 2 1 0 -1 4
Benchmark:
system.time(result<-as.tibble(iris2) %>%
select_if(is.numeric) %>%
mydiff())
Result:
user system elapsed
0.02 0.00 0.01
You should consider using tidyverse to solve this, loading a package into R does little to the overhead of your environment and can make your life much easier.
library(tidyverse)
> df %>%
mutate_at(.vars = vars(num_range(prefix = 'C', 1:38)), .funs = function(x) x - .$C1) %>%
mutate_at(.vars = vars(num_range(prefix = 'C', 39:77)), .funs = function(x) x - .$C39)
C1 C2 C3 C4 C38 C39 C40 C41 C42 C77
1 0 1 2 3 4 0 1 2 3 4
2 0 0 3 2 4 0 0 3 2 4
Data
df <-
data.frame(
C1 = c(1, 3),
C2 = c(2, 3),
C3 = c(3, 6),
C4 = c(4, 5),
C38 = c(5, 7),
C39 = c(1, 3),
C40 = c(2, 3),
C41 = c(3, 6),
C42 = c(4, 5),
C77 = c(5, 7)
)
I have 4 datasets that I would like to perform a full_join. For brevity, I would use two datasets, df1 and df2 here.
df1 <- data.frame(ID = c(1, 3, 4, 5), V1 = LETTERS[11:14], V2 = letters[17:20])
df2 <- data.frame(ID = c(1, 10, 4, 9, 13), X5 = paste0(LETTERS[14:17], 1:5), X16 = paste0(letters[17:20], 1:5, 6:10), X23 = 56:60)
I would like to know if a record appears in one dataset but not the other and vice versa. I included a column (an indicator) in each dataset before performing the join.
df1 <- df1 %>% mutate(in_df1 = 1) # 1 if record is inside df1
df2 <- df2 %>% mutate(in_df2 = 1) # 1 if record is inside df2
Then, I perform a full join and I replace NAs in in_df1 and in_df2 columns to 0.
df <- full_join(df1, df2, by = "ID") %>%
mutate_at(vars(in_df1, in_df2), funs(coalesce(., 0))) %>%
select(ID, V1, V2, X5, X16, X23, in_df1, in_df2)
This works as follows:
# df
# ID V1 V2 X5 X16 X23 in_df1 in_df2
# 1 1 K q N1 q16 56 1 1
# 2 3 L r <NA> <NA> NA 1 0
# 3 4 M s P3 s38 58 1 1
# 4 5 N t <NA> <NA> NA 1 0
# 5 10 <NA> <NA> O2 r27 57 0 1
# 6 9 <NA> <NA> Q4 t49 59 0 1
# 7 13 <NA> <NA> N5 q510 60 0 1
However, I would like to know of a nicer way to do this.
I don't know if it's nice but at least it's the way to do it on R base.
df <- merge(df1, df2, all = TRUE)
df$InDf1 <- ifelse(is.na(match(df$ID, df1$ID)),0,1)
df$InDf2 <- ifelse(is.na(match(df$ID, df$ID)),0,1)
> df
ID V1 V2 X5 X16 X23 InDf1 InDf2
1 1 K q N1 q16 56 1 1
2 3 L r <NA> <NA> NA 1 0
3 4 M s P3 s38 58 1 1
4 5 N t <NA> <NA> NA 1 0
5 9 <NA> <NA> Q4 t49 59 0 1
6 10 <NA> <NA> O2 r27 57 0 1
7 13 <NA> <NA> N5 q510 60 0 1
This is my First dataframe,
df1 <- as.data.frame(matrix(rbinom(9*9, 1, 0.5), ncol=9, nrow =9))
colnames(df1) <- paste(rep(c("a","b","c"), each=3), rep(c(1,2,3), 3), sep = "")
set.seed(11)
This is my Second dataframe,
factor.1 <- paste(rep(c("a","b"), each=3), rep(c(1,2,3), 2), sep = "")
factor.2 <- rep(paste(rep("c", 3), c(1,2,3), sep = ""), 2)
df2 <- as.data.frame(cbind(factor.1,factor.2))
I want to calculate the result in each column and put it inside the second dataframe. I use dplyr
fun1 <- function(x){sum(ds1[, x])}
df2%>% mutate(value = fun1(factor.1))
But what I get is this,
factor.1 factor.2 value
1 a1 c1 22
2 a2 c2 22
3 a3 c3 22
4 b1 c1 22
5 b2 c2 22
6 b3 c3 22
But What I want is this,
factor.1 factor.2 value
1 a1 c1 4
2 a2 c2 4
3 a3 c3 4
4 b1 c1 1
5 b2 c2 4
6 b3 c3 5
Is this what you are looking for ?
df2 %>% mutate(value = sapply(factor.1, fun1) )