Combine group_by, ifelse and filter - r

I would like to combine group_by, ifelse and filter my code for the example dataframe below. What I would like is the following: 1) Group by x. 2) Check if result > 1. If TRUE, check if month for which result >1 == max(month) for that group. If TRUE, select all rows for that group. All other rows should be discarded (so both in case result <= 1 or (month where result > 1 != max(month)) . So in my example data frame all rows for B should be kept and all rows for A should be discarded.
x month result
1 A 1 0.5
2 A 2 0.6
3 A 3 1.2
4 A 4 1.1
5 A 5 0.9
6 B 1 0.3
7 B 2 0.4
8 B 3 0.5
9 B 4 0.9
10 B 5 1.2
dat <- data.frame(x = c("A","A","A","A","A","B","B","B","B","B"),
month = c(1,2,3,4,5,1,2,3,4,5),
result = c(.5,.6,1.2,1.1,.9,.3,.4,.5,.9,1.2))

Using data.table
library(data.table)
setDT(dat)[, .SD[result[which.max(month)] > 1], x]
# x month result
#1: B 1 0.3
#2: B 2 0.4
#3: B 3 0.5
#4: B 4 0.9
#5: B 5 1.2
Or with dplyr
library(dplyr)
dat %>%
group_by(x) %>%
filter(result[which.max(month)] > 1)
# A tibble: 5 x 3
# Groups: x [1]
# x month result
# <fct> <dbl> <dbl>
#1 B 1 0.3
#2 B 2 0.4
#3 B 3 0.5
#4 B 4 0.9
#5 B 5 1.2

If you want to stay in the tidyverse and not venture into base selection, we can easily get there, as well, by just using any to check whether any in the group meet your critera:
dat %>%
group_by(x) %>%
filter(any(result > 1 & month == max(month)))
# A tibble: 5 x 3
# Groups: x [1]
x month result
<fct> <dbl> <dbl>
1 B 1 0.3
2 B 2 0.4
3 B 3 0.5
4 B 4 0.9
5 B 5 1.2
Alternatively, sometimes I'll create a "keep" variable to check if I've got the right ones, initially, or to make the code more easily readable by someone looking at my code years later:
dat %>%
group_by(x) %>%
mutate(keep = (result > 1 & month == max(month))) %>%
filter(any(keep))

Here is a solution With base R (without group_by or filter)
res <- Reduce(rbind,lapply(split(dat,dat$x), function(v) {
if (v$result[which.max(v$month)]>1) v else NULL}))
such that
> res
x month result
6 B 1 0.3
7 B 2 0.4
8 B 3 0.5
9 B 4 0.9
10 B 5 1.2

Related

Want to get the dataframe of values that are deviations from the mean based on a factor column [duplicate]

This question already has answers here:
Subtracting values group-wise by the average of each group in R
(4 answers)
Closed 2 years ago.
Example:
So lets say I have this data frame.
x = data.frame(factor = as.factor(c('a','a','b','b','c','c')),value1 = c(1,3,2,4,5,3), value2 = c(7,9,3,4,9,3))
factor value1 value2
1 a 1 7
2 a 3 9
3 b 2 3
4 b 4 4
5 c 5 9
6 c 3 3
I know how to get the mean per factor, I use this method:
aggregate(x[,c(2,3)], list(x$factor), mean, na.rm = T )
This give me the following output:
Group.1 value1 value2
1 a 2 8.0
2 b 3 3.5
3 c 4 6.0
How do I now go about subtracting from each value in the original dataframe the corresponding mean of its factor. The actual dataset I am using is big so need to have a nice way, I have managed to do it but I used complicated for loops.
So the output that I want would be:
factor value1 value2
1 a -1 -1.0
2 a 1 1.0
3 b -1 -0.5
4 b 1 0.5
5 c 1 3.0
6 c -1 -3.0
Any help would be great. Thanks.
A dplyr solution
library(dplyr)
x %>% group_by(factor) %>% mutate(across(c(value1, value2), ~. - mean(.)))
Output
# A tibble: 6 x 3
# Groups: factor [3]
factor value1 value2
<fct> <dbl> <dbl>
1 a -1 -1
2 a 1 1
3 b -1 -0.5
4 b 1 0.5
5 c 1 3
6 c -1 -3
You can try this dplyr approach:
library(dplyr)
#Data
x = data.frame(factor = as.factor(c('a','a','b','b','c','c')),value1 = c(1,3,2,4,5,3), value2 = c(7,9,3,4,9,3))
#Code
x <- x %>% group_by(factor) %>%
mutate(Mv1=mean(value1),
Mv2=mean(value2),
value1=value1-Mv1,
value2=value2-Mv2) %>% select(-c(Mv1,Mv2))
Output:
# A tibble: 6 x 3
# Groups: factor [3]
factor value1 value2
<fct> <dbl> <dbl>
1 a -1 -1
2 a 1 1
3 b -1 -0.5
4 b 1 0.5
5 c 1 3
6 c -1 -3
Here is a solution with data.table
library("data.table")
setDT(x)
cols <- paste0("value", 1:2)
x[, lapply(.SD, function(x) x - mean(x)), .SDcols=cols, by=factor]
or
library("data.table")
setDT(x)
x[, sweep(.SD, 2, STATS=colMeans(.SD)), by=factor, .SDcols=2:3]

Manipulating large dataset with dcast

Apologies if this is a repeat question but I could not find the specific answer I am looking for. I have a dataframe with counts of different species caught on a given trip. A simplified example with 5 trips and 4 species is below:
trip = c(1,1,1,2,2,3,3,3,3,4,5,5)
species = c("a","b","c","b","d","a","b","c","d","c","c","d")
count = c(5,7,3,1,8,10,1,4,3,1,2,10)
dat = cbind.data.frame(trip, species, count)
dat
> dat
trip species count
1 1 a 5
2 1 b 7
3 1 c 3
4 2 b 1
5 2 d 8
6 3 a 10
7 3 b 1
8 3 c 4
9 3 d 3
10 4 c 1
11 5 c 2
12 5 d 10
I am only interested in the counts of species b for each trip. So I want to manipulate this data frame so I end up with one that looks like this:
trip2 = c(1,2,3,4,5)
species2 = c("b","b","b","b","b")
count2 = c(7,1,1,0,0)
dat2 = cbind.data.frame(trip2, species2, count2)
dat2
> dat2
trip2 species2 count2
1 1 b 7
2 2 b 1
3 3 b 1
4 4 b 0
5 5 b 0
I want to keep all trips, including trips where species b was not observed. So I can't just subset the data by species b. I know I can cast the data so species are the columns and then just remove the columns for the other species like so:
library(dplyr)
library(reshape2)
test = dcast(dat, trip ~ species, value.var = "count", fun.aggregate = sum)
test
> test
trip a b c d
1 1 5 7 3 0
2 2 0 1 0 8
3 3 10 1 4 3
4 4 0 0 1 0
5 5 0 0 2 10
However, my real dataset has several hundred species caught on thousands of trips, and if I try to cast that many species to columns R chokes. There are way too many columns. Is there a way to specify in dcast that I only want to cast species b? Or is there another way to do this that doesn't require casting the data? Thank you.
Here is a data.table approach which I suspect will be very fast for you:
library(data.table)
setDT(dat)
result <- dat[,.(species = "b", count = sum(.SD[species == "b",count])),by = trip]
result
trip species count
1: 1 b 7
2: 2 b 1
3: 3 b 1
4: 4 b 0
5: 5 b 0
We can use tidyverse
library(dplyr)
library(tidyr)
dat %>%
filter(species == 'b') %>%
group_by(trip, species) %>%
summarise(count = sum(count)) %>%
ungroup %>%
complete(trip = unique(dat$trip), fill = list(species = 'b', count = 0))
# A tibble: 5 x 3
# trip species count
# <dbl> <chr> <dbl>
#1 1 b 7
#2 2 b 1
#3 3 b 1
#4 4 b 0
#5 5 b 0

Combine Multiple Dataframes in R by Average (Mixed datatypes)

I've done some research about this (here and here), but I haven't found what I actually want to achieve. The closest I've found to what I'm looking for is here, but the code doesn't seem to work or do what I desire. Besides, I found out that rbindlist has been deprecated in favour of bind_rows, but I haven't been able to use bind_rows to achieve what I want.
I have a list of 30 dataframes each with the same number of rows and columns, as well as the same column datatypes (though each column could be either continuous and categorical). I want to merge them into a single dataframe of the same number of rows and columns, but with each cell as a mean/median/majority voting of the corresponding 30 cells from the list of dataframes, for continuous, integer, and categorical columns, respectively. Here's an example with three dataframes:
df 1:
A B C
2.3 5 3
12 3 1
0.4 13 2
df_2:
A B C
4.3 23 1
1 7 2
0.4 10 2
df_3:
A B C
1.3 3 3
2.2 4 2
12.4 10 1
And the resulting dataframe would be something like:
df_result:
A B C
2.63 5 3
5.06 4 2
4.4 10 2
Any directions to more appropriate ways of combining each of the datatypes would also be highly appreciated.
Put a ROW ID on your tables
df_1 <- read_table("A B C
2.3 5 3
12 3 1
0.4 13 2") %>%
rowid_to_column("ROW")
df_2 <- read_table("A B C
4.3 23 1
1 7 2
0.4 10 2") %>%
rowid_to_column("ROW")
df_3 <- read_table("A B C
1.3 3 3
2.2 4 2
12.4 10 1") %>%
rowid_to_column("ROW")
Bind them together in an ensemble
ensamb <- bind_rows(df_1, df_2, df_3)
group_by row and then summarize each one by its own method
ensamb %>%
group_by(ROW) %>%
summarise(A = mean(A), B = median(B),
C = C[which.max(C)])
# A tibble: 3 x 4
ROW A B C
<int> <dbl> <dbl> <dbl>
1 1 2.63 5 3
2 2 5.07 4 2
3 3 4.4 10 2
You can put all the dataframes in a list :
list_df <- mget(ls(pattern = 'df_\\d+'))
Then calculate the stats for each column separately.
data.frame(A = Reduce(`+`, lapply(list_df, `[[`, 1))/length(list_df),
B = apply(do.call(rbind, lapply(list_df, `[[`, 2)), 2, median),
C = apply(do.call(rbind, lapply(list_df, `[[`, 3)), 2, Mode),
row.names = NULL)
# A B C
#1 2.633333 5 3
#2 5.066667 4 2
#3 4.400000 10 2
where Mode function is taken from here :
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}

Collapsing rows with dplyr

I am new to R and am trying to collapse rows based on row values with dplyr. The following example shows the sample data.
set.seed(123)
df<-data.frame(A=c(rep(1:4,4)),
B=runif(16,min=0,max=1),
C=rnorm(16, mean=1,sd=0.5))
A B c
1 1 0.36647435 0.7485365
2 2 0.51864614 0.8654337
3 3 0.04596929 0.9858012
4 4 0.15479619 1.1294208
5 1 0.76712372 1.2460700
6 2 0.17666676 0.7402996
7 3 0.89759874 1.2699954
8 4 0.90267735 0.7101804
9 1 0.91744223 0.3451281
10 2 0.25472599 0.8604743
11 3 0.10933985 0.8696796
12 4 0.71656017 1.2648846
13 1 0.21157810 1.3170205
14 2 0.14947268 1.2789700
15 3 0.92251060 1.5696901
16 4 0.30090579 1.7642853
I want to summarize/collapse two rows based on the condition that the rows in column A with values 1 and 2 as one row (as mean of row 1 and 2) . Therefore the final result will have only 12 rows because the other 4 rows has been collapsed.
I tried to use the following dplyr function but to little avail.
install.packages ("tidyverse")
library (tidyverse)
df %>% summarize_each( fun(i){ for i %in% c(1,2)funs(mean) })
The expected output is something like:
A B C
1 1.5 0.4425602 0.8069851
3 3 0.04596929 0.9858012
4 4 0.15479619 1.1294208
5 1.5 0.4718952 0.9931848
7 3 0.89759874 1.2699954
8 4 0.90267735 0.7101804
9 1.5 0.5860841 0.6028012
11 3 0.10933985 0.8696796
12 4 0.71656017 1.2648846
13 1.5 0.1805254 1.297995
15 3 0.92251060 1.5696901
16 4 0.30090579 1.7642853
Thank you in advance.
By making the implicit, order based groupings explicit, the summary can
be done with a single summarise_all call.
# Generate the data
set.seed(1)
df <- data.frame(
A = c(rep(1:4, 4)),
B = runif(16, min = 0, max = 1),
C = rnorm(16, mean = 1, sd = 0.5)
)
library(dplyr)
new <- df %>%
group_by(grp = rep(
1:4, # vector containing names of groups to create
each = 4 # number of elements in each group
)) %>%
group_by(mean_grp = cumsum(A > 2) + 1, add = T) %>%
summarise_all(mean) %>%
ungroup()
new
#> # A tibble: 12 x 5
#> grp mean_grp A B C
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1.5 0.3188163 1.067598241
#> 2 1 2 3.0 0.5728534 1.755890584
#> 3 1 3 4.0 0.9082078 1.194921618
#> 4 2 1 1.5 0.5500358 0.291014883
#> 5 2 2 3.0 0.9446753 1.562465459
#> 6 2 3 4.0 0.6607978 0.977533195
#> 7 3 1 1.5 0.3454502 1.231911487
#> 8 3 2 3.0 0.2059746 1.410610598
#> 9 3 3 4.0 0.1765568 1.296950661
#> 10 4 1 1.5 0.5355633 1.425278418
#> 11 4 2 3.0 0.7698414 1.037282492
#> 12 4 3 4.0 0.4976992 0.005324152
I would recommend keeping the grouping variables in your data after the
summary (everything is simpler if you include them in the first place),
but if you want to, you can drop them with
new %>% select(-grp, -mean_grp).
PS. In order to avoid having "magic numbers" (such as the 1:4 and each = 4 when creating grp) included in the code, you could also create the first grouping variable as:
grp = cumsum(A < lag(A, default = A[1])) + 1
Assuming that the original data are ordered such that a new group starts each time the value of A is less than the previous value of A.
One option would be to process the rows with A equal to 1 or 2 separately from the other rows and then bind them back together:
set.seed(3)
df<-data.frame(A=c(rep(1:4,4)),B=runif(16,min=0,max=1),c=rnorm(16, mean=1,sd=0.5))
df %>%
filter(A %in% 1:2) %>%
group_by(tmp=cumsum(A==1)) %>%
summarise_all(mean) %>%
ungroup %>% select(-tmp) %>%
bind_rows(df %>% filter(!A %in% 1:2))
A B c
<dbl> <dbl> <dbl>
1 1.5 0.4877790 1.0121278
2 1.5 0.6032474 0.8840735
3 1.5 0.6042946 0.5996850
4 1.5 0.5456424 0.6198039
5 3.0 0.3849424 0.6276092
6 4.0 0.3277343 0.4343907
7 3.0 0.1246334 1.0760229
8 4.0 0.2946009 0.8461718
9 3.0 0.5120159 1.6121568
10 4.0 0.5050239 1.0999058
11 3.0 0.8679195 0.8981359
12 4.0 0.8297087 0.1667626

summarise by group of columns using min and maintaing row number

I have a data frame with 3 columns
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
I need to recover the min for each group (ID1, ID2) and the position(row.name) of this min in the original table.
Using group_by and summarise, I have obtained the min but I can't see a way to obtain the position as summarise gets rid of the columns not summarised and not used for group.
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df <- group_by( df, X )
df <- summarise( df, Objective=min(value) )
Any ideas on how to solve this to get?
X Objective Position
1 1.1 1 1
2 1.2 2 2
3 2.1 5 5
4 2.2 6 6
Thanks in advance
If I understand correct and since you're already using dplyr, you could do it like this:
library(dplyr); library(tidyr)
unite(df, X, ID1:ID2, sep = ".") %>%
mutate(Position = row_number()) %>%
group_by(X) %>% slice(which.min(value))
#Source: local data frame [4 x 3]
#Groups: X
#
# X value Position
#1 1.1 1 1
#2 1.2 2 2
#3 2.1 5 5
#4 2.2 6 6
Or alternatively (only dplyr) - I'd rather use this one:
mutate(df, Position = row_number()) %>% group_by(ID1, ID2) %>% slice(which.min(value))
#Source: local data frame [4 x 4]
#Groups: ID1, ID2
#
# ID1 ID2 value Position
#1 1 1 1 1
#2 1 2 2 2
#3 2 1 5 5
#4 2 2 6 6
data
df <- data.frame(ID1=rep(1:2, each = 4), ID2=rep(1:2,4), value=1:8)
Here's how would I approach this using data.table (rn would be your row number).
library(data.table)
setDT(df, keep.rownames = TRUE)[, .SD[which.min(value)], list(ID1, ID2)]
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Another option is ordering and then picking the unique values
unique(setorder(df, value), by = c("ID1", "ID2"))
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Both approaches don't require creating X column
Or using base R
df <- df[order(df$value), ]
df[!duplicated(df[, 1:2]), ]
# ID1 ID2 value
# 1 1 1 1
# 2 1 2 2
# 5 2 1 5
# 6 2 2 6
data
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
Using Aggregate:
Data:
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df$rn<-row.names(df) #rn is the row number
df<-df[c("X","rn","value")]
#> df
# X rn value
#1 1.1 1 1
#2 1.2 2 2
#3 1.1 3 3
#4 1.2 4 4
#5 2.1 5 5
#6 2.2 6 6
#7 2.1 7 7
#8 2.2 8 8
Aggregate step:
df2<- aggregate(df, by=list(c(df$X)), min)
#> df2
# Group.1 X rn value
#1 1.1 1.1 1 1
#2 1.2 1.2 2 2
#3 2.1 2.1 5 5
#4 2.2 2.2 6 6

Resources