calculate summary by group and bring value back in the dataframe [duplicate] - r

This question already has answers here:
Calculate group mean, sum, or other summary stats. and assign column to original data
(4 answers)
Closed 5 years ago.
df <- data.frame(
  id = c('A1','A2','A4','A2','A1','A4','A3','A2','A1','A3'),
  value = c(4,3,1,3,4,6,6,1,8,4))
I want to get the max value within each id group. I tried the following but got an error saying "replacement has 4 rows, data has 10", which I understand but don't know how to correct:
df$max.by.id <- aggregate(value ~ id, df, max)
This is how I ended up doing it successfully:
max.by.id <- aggregate(value ~ id, df, max)
names(max.by.id) <- c("id", "max")
df2 <- merge(df,max.by.id, by.x = "id", by.y = "id")
df2
# id value max
#1 A1 4 8
#2 A1 4 8
#3 A1 8 8
#4 A2 3 3
#5 A2 3 3
#6 A2 1 3
#7 A3 6 6
#8 A3 4 6
#9 A4 1 6
#10 A4 6 6
Any better way? Thanks in advance.

ave() is the function for that task:
df$max.by.id <- ave(df$value, df$id, FUN=max)
example:
df <- data.frame(
  id = c('A1','A2','A4','A2','A1','A4','A3','A2','A1','A3'),
  value = c(4,3,1,3,4,6,6,1,8,4))
df$max.by.id <- ave(df$value, df$id, FUN=max)
The result of ave() has the same length as the original vector of values (which is also the length of the grouping variable). The resulting values are placed in the positions corresponding to the grouping variable, so each row receives its own group's max. For more information, read the documentation of ave().
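The same pattern works for any per-group summary; for instance, a sketch that attaches the group mean instead of the max:
# identical recycling behaviour, different summary function
df$mean.by.id <- ave(df$value, df$id, FUN = mean)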

With data.table, you can compute the max by id "inside" the data; the newly computed value (one per id) is automatically recycled across each group's rows:
library(data.table)
setDT(df)[, max.by.id := max(value), by=id]
df
# id value max.by.id
# 1: A1 4 8
# 2: A2 3 3
# 3: A4 1 6
# 4: A2 3 3
# 5: A1 4 8
# 6: A4 6 6
# 7: A3 6 6
# 8: A2 1 3
# 9: A1 8 8
#10: A3 4 6
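Note that setDT() converts df to a data.table by reference, so df itself is modified. If you want to keep the original data.frame untouched, a sketch working on a copy:
library(data.table)
dt <- as.data.table(df)                 # copies df, leaving it untouched
dt[, max.by.id := max(value), by = id]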

tapply(df$value, df$id, max)
# A1 A2 A3 A4
# 8 3 6 6
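To bring the tapply() result back into the data frame, one sketch indexes the named result vector by id:
# the result of tapply() is named by group, so indexing by id recycles
# each group's max into the right rows
df$max.by.id <- tapply(df$value, df$id, max)[as.character(df$id)]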
library(plyr)
ddply(df, .(id), function(df){max(df$value)})
# id V1
# 1 A1 8
# 2 A2 3
# 3 A3 6
# 4 A4 6
library(dplyr)
df %>% group_by(id) %>% arrange(desc(value)) %>% do(head(., 1))
# Source: local data frame [4 x 2]
# Groups: id [4]
# id value
# (fctr) (dbl)
# 1 A1 8
# 2 A2 3
# 3 A3 6
# 4 A4 6
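Note that do() is superseded in recent dplyr; assuming dplyr >= 1.0, a sketch with slice_max() gives the same one-row-per-group result:
library(dplyr)
# keep exactly one row with the largest value per id
df %>% group_by(id) %>% slice_max(value, n = 1, with_ties = FALSE)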
UPDATE:
If you need to keep the raw values, use the following code.
library(plyr)
ddply(df, .(id), function(df){
  df$max.val = max(df$value)
  return(df)
})
library(dplyr)
df %>% group_by(id) %>% mutate(max.val=max(value))
# Source: local data frame [10 x 3]
# Groups: id [4]
# id value max.val
# (fctr) (dbl) (dbl)
# 1 A1 4 8
# 2 A2 3 3
# 3 A4 1 6
# 4 A2 3 3
# 5 A1 4 8
# 6 A4 6 6
# 7 A3 6 6
# 8 A2 1 3
# 9 A1 8 8
# 10 A3 4 6

Related

R: creating combinations of elements within a group and adding up numbers associated with combinations in a new data frame

I have the following dataset:
Letter ID Number
A A1 1
A A2 2
A A3 3
B B1 1
B B2 2
B B3 3
B B4 4
My aim is first to create all possible combinations of IDs within the same "Letter" group. For example, for the letter A there would be only three combinations: A1-A2, A2-A3, and A1-A3. The same IDs ordered differently don't count as a new combination, so for example A1-A2 is the same as A2-A1.
Then, within those combinations, I want to add up the numbers from the "Number" column associated with those IDs. So the combination A1-A2, whose IDs are associated with 1 and 2 in the "Number" column, would result in the number 1+2=3.
Finally, I want to place the ID combinations, added numbers and original Letter in a new data frame. Something like this:
Letter Combination Add.Number
A A1-A2 3
A A2-A3 5
A A1-A3 4
B B1-B2 3
B B2-B3 5
B B3-B4 7
B B1-B3 4
B B2-B4 6
B B1-B4 5
How can I do this in R, ideally using the package dplyr?
library(dplyr)
letter <- c("A","A","A","B","B","B","B")
df <- data.frame(letter) %>%
  group_by(letter) %>%
  mutate(
    number = row_number(),
    id = paste0(letter, number)
  )
df %>%
  full_join(df, by = "letter") %>%
  filter(number.x < number.y) %>%
  mutate(
    combination = paste0(id.x, "-", id.y),
    add_number = number.x + number.y) %>%
  select(letter, combination, add_number)
# A tibble: 9 x 3
# Groups: letter [2]
letter combination add_number
<chr> <chr> <int>
1 A A1-A2 3
2 A A1-A3 4
3 A A2-A3 5
4 B B1-B2 3
5 B B1-B3 4
6 B B1-B4 5
7 B B2-B3 5
8 B B2-B4 6
9 B B3-B4 7
In base R, using combn:
df <- data.frame(
  Letter = c("A","A","A","B","B","B","B"),
  Id = c("A1","A2","A3","B1","B2","B3","B4"),
  Number = c(1,2,3,1,2,3,4))
# combinations
l <- lapply(split(df$Id, df$Letter), function(x)
  setNames(data.frame(t(combn(x, 2))), c("L1", "L2")))
n <- lapply(split(df$Number, df$Letter), function(x)
  setNames(data.frame(t(combn(x, 2))), c("N1", "N2")))
# rbind all
result <- do.call(rbind, mapply(cbind, Letter=names(l), l, n, SIMPLIFY = F))
result$combination <- paste(result$L1, result$L2, sep="-")
result$sum = result$N1 + result$N2
result
#> Letter L1 L2 N1 N2 combination sum
#> A.1 A A1 A2 1 2 A1-A2 3
#> A.2 A A1 A3 1 3 A1-A3 4
#> A.3 A A2 A3 2 3 A2-A3 5
#> B.1 B B1 B2 1 2 B1-B2 3
#> B.2 B B1 B3 1 3 B1-B3 4
#> B.3 B B1 B4 1 4 B1-B4 5
#> B.4 B B2 B3 2 3 B2-B3 5
#> B.5 B B2 B4 2 4 B2-B4 6
#> B.6 B B3 B4 3 4 B3-B4 7
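A more compact sketch of the same idea splits the whole data frame once and builds the labels and sums directly from the index pairs returned by combn():
# combn(n, 2) enumerates the index pairs column-wise within each Letter group
result2 <- do.call(rbind, lapply(split(df, df$Letter), function(d) {
  idx <- combn(nrow(d), 2)
  data.frame(Letter      = d$Letter[1],
             Combination = paste(d$Id[idx[1, ]], d$Id[idx[2, ]], sep = "-"),
             Add.Number  = d$Number[idx[1, ]] + d$Number[idx[2, ]])
}))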

R data.table - add row based on value

I have data.table x below
x <- data.table(id=c('A1', 'B1'), start=c(1,1), stop=c(4,5))
id start stop
A1     1    4
B1     1    5
I would like to expand the rows. Is it possible to use rbindlist with Map to generate the data.table below?
id start stop
A1     1    2
A1     2    3
A1     3    4
B1     1    2
B1     2    3
B1     3    4
B1     4    5
You can create a sequence from start to stop for each id. Use shift to get the next value and drop the NA rows.
library(data.table)
x <- x[, .(start = seq(start, stop)), id]
x[, stop := shift(start, type = 'lead'), id]
x[!is.na(stop)]
# id start stop
#1: A1 1 2
#2: A1 2 3
#3: A1 3 4
#4: B1 1 2
#5: B1 2 3
#6: B1 3 4
#7: B1 4 5
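Starting from the original two-row x, an alternative data.table sketch builds both columns in a single step by pairing each sequence value with its successor:
library(data.table)
# head()/tail() drop the last/first element, so consecutive pairs line up
x[, .(start = head(seq(start, stop), -1),
      stop  = tail(seq(start, stop), -1)), by = id]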
Here's an equivalent tidyverse way -
library(tidyverse)
x %>%
  mutate(start = map2(start, stop, seq)) %>%
  unnest(start) %>%
  group_by(id) %>%
  mutate(stop = lead(start)) %>%
  ungroup() %>%
  filter(!is.na(stop))

R calculating the sum of values according to condition

Here is a data frame:
ID<-c(rep("A",3),rep("B",2), rep("C",3),rep("D",5))
cell<-c("a1","a2","a3","a1","a2","a1","a2", "a3","a1","a2","a1","a2","a3")
value<-c(2,5,3,4,5,6,9,8,7,2,5,2,4)
df <- data.frame(ID, cell, value)  # data.frame() keeps value numeric; as.data.frame(cbind(...)) would coerce everything to character
I want to calculate the sum of all values for each ID up to cell "a2" (inclusive). The sequence of cells and IDs must be taken into account. Rows that are not followed by any cell "a2" should not be taken into account.
As a result I would like to get this table:
Could you please help me to code this condition?
Thanks in advance.
Best regards, Inna
Assuming the data is already correctly ordered by cell:
library( tidyverse )
df %>%
  group_by(ID) %>%
  mutate(value = cumsum(value)) %>%
  filter(cell == "a2")
# # A tibble: 5 x 3
# # Groups: ID [4]
# ID cell value
# <chr> <chr> <dbl>
# 1 A a2 7
# 2 B a2 9
# 3 C a2 15
# 4 D a2 9
# 5 D a2 16
Treating each occurrence of "a2" as a different group, we can do:
library(dplyr)
df %>%
  # create a group column: every occurrence of cell == 'a2' starts a new group
  group_by(ID, grp = cumsum(lag(cell == 'a2', default = TRUE))) %>%
  # remove those groups that do not have 'a2' in them
  filter(any(cell == 'a2')) %>%
  # sum up to and including the 'a2' value
  summarise(value = sum(value[seq_len(match('a2', cell))]),
            cell = last(cell)) %>%
  select(-grp)
# ID value cell
# <chr> <dbl> <chr>
#1 A 7 a2
#2 B 9 a2
#3 C 15 a2
#4 D 9 a2
#5 D 7 a2
A succinct solution using ave.
r <- transform(df, value=ave(value, ID, FUN=cumsum))[df$cell == "a2", ]
r
# ID cell value
# 2 A a2 7
# 5 B a2 9
# 7 C a2 15
# 10 D a2 9
# 12 D a2 16
An option with data.table
library(data.table)
setDT(df)[, value := cumsum(value) , ID][cell == 'a2']
Output:
# ID cell value
#1: A a2 7
#2: B a2 9
#3: C a2 15
#4: D a2 9
#5: D a2 16

Nonparametric test to compare rows in different dataframes in R

This is my first post here.
I have 4 dataframes for which I would like to do stepwise nonparametric tests for each row.
E.g. I would like to compare the values of each row in dataframe A with the values of the corresponding row in dataframe B.
I would need a nonparametric test, e.g. Wilcoxon or similar.
I thought of making a new column with the median, but I am certain that there is something better.
Could you give me an idea how to do this?
Thank you in advance!
Edit:
Here are my imaginary dataframes.
I want to compare each dataframe row-wise, e.g. do a nonparametric test for John in dataframes A and B, then for Dora, etc.
A <- data.frame("A" = c("John","Dora","Robert","Jim"),
                "A1" = c(8,1,10,5),
                "A2" = c(9,1,1,4))
B <- data.frame("B" = c("John","Dora","Robert","Jim"),
                "B1" = c(1,1,1,5),
                "B2" = c(3,2,1,5),
                "B3" = c(4,3,1,5),
                "B4" = c(6,8,8,1))
I think you are looking for the function wilcox.test (in the stats package).
Solution 1: Using a for loop
One way to compare each row of A with the corresponding row of B (and extract the p value) is to create a for loop such as this:
pval <- NULL
for (i in 1:nrow(A)) {
  vec_a <- as.numeric(A[i, 2:ncol(A)])
  vec_b <- as.numeric(B[B$B == A$A[i], 2:ncol(B)])
  p <- wilcox.test(vec_a, vec_b)
  pval <- c(pval, p$p.value)
  print(p)
}
At the end, you will get a vector pval containing the p-value for each row.
pval
[1] 0.1333333 0.2188194 0.5838824 1.0000000
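A loop-free sketch of the same computation, assuming every name in A appears exactly once in B, matches the rows up front and collects the p-values with sapply():
# match rows of B to rows of A by name, then test each pair of value vectors
j <- match(A$A, B$B)
pval <- sapply(seq_len(nrow(A)), function(i)
  wilcox.test(as.numeric(A[i, -1]), as.numeric(B[j[i], -1]))$p.value)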
Solution 2: Using tidyverse
A more elegant solution is to use the tidyverse packages (in particular dplyr and tidyr) to assemble your dataframes into a single one, then compare the two groups for each name by passing a formula to wilcox.test.
First, we can merge your dataframes by name using the left_join function from dplyr:
library(dplyr)
DF <- left_join(A,B, by = c("A"="B"))
A A1 A2 B1 B2 B3 B4
1 John 8 9 1 3 4 6
2 Dora 1 1 1 2 3 8
3 Robert 10 1 1 1 1 8
4 Jim 5 4 5 5 5 1
Then using dplyr and tidyr packages, you can reshape your dataframe into a longer format:
library(dplyr)
library(tidyr)
DF %>% pivot_longer(., -A, names_to = "var", values_to = "values")
# A tibble: 24 x 3
A var values
<fct> <chr> <dbl>
1 John A1 8
2 John A2 9
3 John B1 1
4 John B2 3
5 John B3 4
6 John B4 6
7 Dora A1 1
8 Dora A2 1
9 Dora B1 1
10 Dora B2 2
# … with 14 more rows
We will create a new column "group" indicating A or B, depending on the values in the column var:
DF %>% pivot_longer(., -A, names_to = "var", values_to = "values") %>%
  mutate(group = gsub("\\d", "", var))
# A tibble: 24 x 4
A var values group
<fct> <chr> <dbl> <chr>
1 John A1 8 A
2 John A2 9 A
3 John B1 1 B
4 John B2 3 B
5 John B3 4 B
6 John B4 6 B
7 Dora A1 1 A
8 Dora A2 1 A
9 Dora B1 1 B
10 Dora B2 2 B
# … with 14 more rows
Finally, we can group by A and summarise the dataframe to get the p-value from wilcox.test comparing the values of the two groups for each name:
DF %>% pivot_longer(., -A, names_to = "var", values_to = "values") %>%
  mutate(group = gsub("\\d", "", var)) %>%
  group_by(A) %>%
  summarise(Pval = wilcox.test(values ~ group)$p.value)
# A tibble: 4 x 2
A Pval
<fct> <dbl>
1 Dora 0.219
2 Jim 1
3 John 0.133
4 Robert 0.584
It looks longer (especially because I explain each step), but in the end you can see that we need fewer lines than the first solution.
Does that answer your question?

Using lapply to transpose part of a column and add it as new columns to a data frame

I've been searching for some clarity on this one, but cannot find anything that applies to my case. I constructed a DF very similar to this one (but with considerably more data, over a million rows in total):
Key1 <- c("A", "B", "C", "A", "C", "B", "B", "C", "A", "C")
Key2 <- c("A1", "B1", "C1", "A2", "C2", "B2", "B3", "C3", "A3", "C4")
NumVal <- c(2, 3, 1, 4, 6, 8, 2, 3, 1, 0)
DF1 <- as.data.frame(cbind(Key1, Key2, NumVal), stringsAsFactors = FALSE) %>% arrange(Key2)  # note: cbind() coerces NumVal to character here, which is why some outputs below print it as <chr>
ConsId <- c(1:10)
DF1 <- cbind(DF1, ConsId)
Now, what I want to do is add, let's say, 3 new columns (in real life I need 12, but to keep this toy example readable we'll use 3) to the data frame, where each row holds the $NumVal values that share that row's $Key1 and have a $ConsId greater than or equal to the row's own, with the remaining spaces filled with NAs. Here is the expected result in case I wasn't very clear:
Key1 Key2 NumVal ConsId V1 V2 V3
A A1 2 1 2 4 1
A A2 4 2 4 1 NA
A A3 1 3 1 NA NA
B B1 3 4 3 8 2
B B2 8 5 8 2 NA
B B3 2 6 2 NA NA
C C1 1 7 1 6 3
C C2 6 8 6 3 0
C C3 3 9 3 0 NA
C C4 0 10 0 NA NA
Now I'm using do.call(rbind), and even though it works fine, it takes way too long for my real data with a bit over 1 million rows (around 6 hrs). I also tried the bind_rows dplyr function, but it took a bit longer, so I stuck with the do.call option. Here's an example of the code I'm using:
# Function
TranspNumVal <- function(i) {
  Id <- DF1[i, "Key1"]
  IdCons <- DF1[i, "ConsId"]
  myvect <- as.matrix(filter(DF1, Id == Key1, ConsId >= IdCons) %>% select(NumVal))
  Result <- as.data.frame(t(myvect[1:3]))
  return(Result)
}
# Applying the function to the entire data frame
DF2 <- do.call(rbind, lapply(1:NROW(DF1), function(i) TranspNumVal(i)))
DF3 <- cbind(DF1, DF2)
Maybe changing the class is causing the code to be so inefficient, or maybe I'm just not finding a better way to vectorize my problem (you don't want to know how long it took with a nested loop). I'm fairly new to R and have just started fooling around with dplyr, so I'm open to any suggestions on how to optimize my code.
We can use dplyr::lead:
DF1 %>%
  group_by(Key1) %>%
  mutate(
    V1 = NumVal,
    V2 = lead(NumVal, n = 1),
    V3 = lead(NumVal, n = 2))
## A tibble: 10 x 7
## Groups: Key1 [3]
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <chr> <int> <chr> <chr> <chr>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
#10 C C4 0 10 0 NA NA
Explanation: We group entries by Key1 and then use lead to shift NumVal values for columns V2 and V3. V1 is simply a copy of NumVal.
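If you need more lead columns (the question mentions 12 in the real data), a sketch that generates them programmatically; the column count k and the tidy-eval := idiom are the only things added beyond the answer above:
library(dplyr)
k <- 3  # set to 12 for the real data (assumes k >= 2)
out <- DF1 %>% group_by(Key1) %>% mutate(V1 = NumVal)
for (i in 2:k) {
  # V2 holds the next NumVal within the group, V3 the one after, and so on
  out <- out %>% mutate(!!paste0("V", i) := lead(NumVal, n = i - 1))
}
out <- ungroup(out)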
A dplyr pipeline (below, dat is the example data with NumVal kept numeric; see the data block at the end of the last answer).
The first utility function returns, for each position of b (ConsId), the values of a (NumVal) whose b is greater than or equal to that position's b:
myfunc1 <- function(a, b) {
  # for each row, keep the group's values with ConsId >= this row's ConsId
  lapply(seq_along(b), function(i) a[b >= b[i]])
}
The second utility function converts a ragged list into a data.frame. It works with an arbitrary number of columns to append, but we've limited it to 3 based on your requirements:
myfunc2 <- function(x, ncols = 3) {
  # pad each element to a common length with NA, then row-bind
  n <- min(ncols, max(lengths(x)))
  as.data.frame(do.call(rbind, lapply(x, `length<-`, n)))
}
Now the pipeline:
dat %>%
  group_by(Key1) %>%
  mutate(lst = myfunc1(NumVal, ConsId)) %>%
  ungroup() %>%
  bind_cols(myfunc2(.$lst)) %>%
  select(-lst) %>%
  arrange(Key1, ConsId)
# # A tibble: 10 × 7
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <int> <int> <int> <int> <int>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
# 10 C C4 0 10 0 NA NA
After grouping by 'Key1', use shift (from data.table) to get the next values of 'NumVal' as a list, convert each to a tibble, and unnest the nested list elements into individual columns of the dataset. By default, shift fills with NA at the end.
library(data.table)
library(tidyverse)
DF1 %>%
  group_by(Key1) %>%
  mutate(new = shift(NumVal, 0:(n() - 1), type = 'lead') %>%
           map(~ as.list(.x) %>%
                 set_names(paste0("V", seq_along(.))) %>%
                 as_tibble)) %>%
  unnest %>%
  select(-V4)
# A tibble: 10 x 7
# Groups: Key1 [3]
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <dbl> <int> <dbl> <dbl> <dbl>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
#10 C C4 0 10 0 NA NA
data
DF1 <- data.frame(Key1, Key2, NumVal, stringsAsFactors = FALSE) %>%
arrange(Key2)
DF1$ConsId <- 1:10
