Calculation on every pair from grouped data.frame - r

My question is about performing a calculation between each pair of groups in a data.frame, I'd like it to be more vectorized.
I have a data.frame that has a consists of the following columns: Location , Sample , Var1, and Var2. I'd like to find the closet match for each Sample for each pair of Locations for both Var1 and Var2.
I can accomplish this for one pair of locations as such:
df0 <- data.frame(Location = rep(c("A", "B", "C"), each =30),
Sample = rep(c(1:30), times =3),
Var1 = sample(1:25, 90, replace =T),
Var2 = sample(1:25, 90, replace=T))
df00 <- data.frame(Location = rep(c("A", "B", "C"), each =30),
Sample = rep(c(31:60), times =3),
Var1 = sample(1:100, 90, replace =T),
Var2 = sample(1:100, 90, replace=T))
df000 <- rbind(df0, df00)
df <- sample_n(df000, 100) # data
dfl <- df %>% gather(VAR, value, 3:4)
df1 <- dfl %>% filter(Location == "A")
df2 <- dfl %>% filter(Location == "B")
df3 <- merge(df1, df2, by = c("VAR"), all.x = TRUE, allow.cartesian=TRUE)
df3 <- df3 %>% mutate(DIFF = abs(value.x-value.y))
result <- df3 %>% group_by(VAR, Sample.x) %>% top_n(-1, DIFF)
I tried other possibilities such as using dplyr::spread but could not avoid the "Error: Duplicate identifiers for rows" or columns half filled with NA.
Is there a more clean and automated way to do this for each possible group pair? I'd like to avoid the manual subset and merge routine for each pair.

One option would be to create the pairwise combination of 'Location' with combn and then do the other steps as in the OP's code
library(tidyverse)
df %>%
# get the unique elements of Location
distinct(Location) %>%
# pull the column as a vector
pull %>%
# it is factor, so convert it to character
as.character %>%
# get the pairwise combinations in a list
combn(m = 2, simplify = FALSE) %>%
# loop through the list with map and do the full_join
# with the long format data df1
map(~ full_join(df1 %>%
filter(Location == first(.x)),
df1 %>%
filter(Location == last(.x)), by = "VAR") %>%
# create a column of absolute difference
mutate(DIFF = abs(value.x - value.y)) %>%
# grouped by VAR, Sample.x
group_by(VAR, Sample.x) %>%
# apply the top_n with wt as DIFF
top_n(-1, DIFF))
Also, as the OP mentioned about automatically picking up instead of doing double filter (not clear about the expected output though)
df %>%
distinct(Location) %>%
pull %>%
as.character %>%
combn(m = 2, simplify = FALSE) %>%
map(~ df1 %>%
# change here i.e. filter both the Locations
filter(Location %in% .x) %>%
# spread it to wide format
spread(Location, value, fill = 0) %>%
# create the DIFF column by taking the differene
mutate(DIFF = abs(!! rlang::sym(first(.x)) -
!! rlang::sym(last(.x)))) %>%
group_by(VAR, Sample) %>%
top_n(-1, DIFF))

Related

How to combine lapply with dplyr in a function

Below is a sample data frame that I have created along with the expected output.
df = data.frame(color = c("Yellow", "Blue", "Green", "Red", "Magenta"),
values = c(24, 24, 34, 45, 49),
Quarter = c("Period1","Period2" , "Period3", "Period3", "Period1"),
Market = c("Camden", "StreetA", "DansFireplace", "StreetA", "DansFireplace"))
dfXQuarter = df %>% group_by(Quarter) %>% summarise(values = sum(values)) %>%
mutate(cut = "Quarter") %>% data.frame()
colnames(dfXQuarter)[1] = "Grouping"
dfXMarket = df %>% group_by(Market) %>% summarise(values = sum(values)) %>%
mutate(cut = "Market")%>% data.frame()
colnames(dfXMarket)[1] = "Grouping"
df_all = rbind(dfXQuarter, dfXMarket)
Now I for the sake brevity I want to compile this into a function and using lapply.
Below is my attempt at the same-
list = c("Market", "Quarter")
df_all <- do.call(rbind, lapply(list, function(x){
df_l= df %>% group_by(x) %>%
summarise(values = sum(values)) %>%
mutate(cut= x) %>%
data.frame()
colnames(df_l)[df_l$x] = "Grouping"
df_l
}))
This block of code is giving me error.
I need the output to be the exact replica of the 'df_all' output for further operations.
How I do write this function correctly?
We can use purrr::map_dfr
library(dplyr)
library(purrr)
#Don't use the R build-in type e.g. list in variables name
lst <- c("Market", "Quarter")
#Use map if you need the output as a list
map_dfr(lst, ~df %>% group_by("Grouping"=!!sym(.x)) %>%
summarise(values = sum(values)) %>%
mutate(cut = .x) %>%
#To avoid the warning massage from bind_rows
mutate_if(is.factor, as.character))
# A tibble: 6 x 3
Grouping values cut
<chr> <dbl> <chr>
1 Camden 24 Market
2 DansFireplace 83 Market
3 StreetA 69 Market
4 Period1 73 Quarter
5 Period2 24 Quarter
6 Period3 79 Quarter
We can fix the first solution by
change group_by(x) to group_by_at(x), since x is a string here.
Use colnames(df_l)[colnames(df_l)==x] <- "Grouping" in naming the grouping variable.
Not pretty but works and doesn't require tidy functions:
groupwise_summation <- function(df, grouping_vecs){
# Split, apply, combine:
tmpdf <- do.call(rbind, lapply(split(df, df[,grouping_vecs]), function(x){sum(x$values)}))
# Clean up the df:
data.frame(cbind(cut = row.names(tmpdf), value = as.numeric(tmpdf)), row.names = NULL)
}
# Apply and combine:
df_all <- rbind(groupwise_summation(df, c("Quarter")), groupwise_summation(df, c("Market")))
# Note inside the c(), you can use multiple grouping variables.

Use dplyr to get index of first column with certain value per group or row

I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))

removing groups with a certain NA number

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e ...) , two potential classes in the column "SURNAME" c(A, B) and a third column containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
filter(!is.na(VALUE)) %>%
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
set.seed(1234)
df <- data.frame(
name = sample(x=letters, size=1e3, replace=TRUE),
surname = sample(x=c("A", "B"), size=1e3, replace=TRUE),
value = sample(x=c(1:10*10,NA), size=1e3, replace=TRUE),
stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# count NAs by name-surname combos (na.action arg is important!)
agg <- aggregate(value ~ name + surname, data=df, FUN=function(x) sum(is.na(x)), na.action=NULL)
# rename is count of NAs column
names(agg)[3] <- "number_of_na"
#add count of NAs back to original data
df <- merge(df, agg, by=c("name", "surname"))
# subset the original data
result <- df[df$number_of_na < 3, ]
Here's how to do it with data.table:
library(data.table)
dt <- as.data.table(df)
dt[ , number_of_na := sum(is.na(value)), by=.(name, surname)]
result <- dt[number_of_na < 3]
Here's how to do it with dplr/tidyverse:
library(dplyr) # or library(tidyverse)
result <- df %>%
group_by(name, surname) %>%
summarize(number_of_na = sum(is.na(value))) %>%
right_join(df, by=c("name", "surname")) %>%
filter(number_of_na < 3)
After grouping by 'NAME', 'SURNAME', create a column with the number of NA elements in that group and then filter out any 'NAME' that have an 'ind' greater than or equal to 3
df %>%
group_by(NAME, SURNAME) %>%
mutate(ind = sum(is.na(VALUE))) %>%
group_by(NAME) %>%
filter(!any(ind >=3)) %>%
select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
df %>%
group_by(NAME, SURNAME) %>%
filter(sum(is.na(VALUE))>=3) %>%
ungroup %>%
distinct(NAME) %>%
anti_join(df, .)
data
set.seed(24)
df <- data.frame(NAME = rep(letters[1:5], each = 20),
SURNAME = sample(LETTERS[1:4], 5 * 20, replace = TRUE),
VALUE = sample(c(NA, 1:3), 5 *20, replace = TRUE),
stringsAsFactors = FALSE)

summarise mean of a specific column in dplyr

I would like to summarise a grouped data.frame without knowing the name of the column. But what I know is, that the feature is always at position 3 (column) in this data.frame, is that possible?
df <- data_frame(date = rep(c("2017-01-01", "2017-01-02", "2017-01-03"), 2),
group = rep(c("A", "B"), 3),
temperature = runif(6, -10, 30),
percipitation = runif(6, 0,5)
)
parameter <- "perc"
df1 <- df %>%
select(date, group, starts_with(parameter)) %>%
group_by(group) %>%
summarise(
avg = mean(percipitation)
)
In this example the code works, but of course only for the parameter 'perc' and not for 'temp' or so.
avg = mean(df[[3]])
or something like this doesn't work. Any suggestions?
You could keep just the grouping variable and the third column using select(group, 3). The function summarise_all() can then be used to calculate the mean.
df %>%
select(group, 3) %>%
group_by(group) %>%
summarise_all(
funs(mean)
)

Create table containing column names in order of highest value to lowest

I have a data set recording values for various metrics by name. I want to sort these metrics for each name and use them to create a new data set with columns for each choice. I have it to the point where i can sort the row, but i don't want the value, I want the name of the metric...
How can I get the column name to populate the cell instead of the value?
name <- c('jim', 'sal', 'xiu')
x <- c(100, 200, 100)
y <- c(300, 100, 300)
z <- c(400, 0, 200)
have <- data.frame(name, x, y, z)
choice1 <- c('z', 'x', 'y')
choice2 <- c('y', 'y', 'z')
choice3 <- c('x', 'z', 'x')
want <- data.frame(name, choice1, choice2, choice3)
attempt <- data.frame(t(apply(have, 1, sort, decreasing = TRUE)))
Here's an approach with dplyr tools:
library(dplyr)
library(tidyr)
library(reshape2)
have %>%
# convert from wide to long format
gather(metric, value,
-name) %>%
group_by(name) %>%
# arrange each group in descending order
arrange(desc(value)) %>%
# with data arranged, the row number coincides with the ranking
mutate(rank = sprintf("choice%s", row_number())) %>%
# recast to wide format
dcast(name ~ rank,
value.var = "metric")
Here's a solution that relies only on tidyverse.
library(tidyverse)
want <- have %>% group_by(name) %>% gather(var, value, 2:4) %>%
arrange(name, desc(value)) %>% mutate(choice = paste0("choice", row_number())) %>%
select(-value) %>%
spread(choice, var)

Resources