Sum Specific Rows by Multiple Groups - r

I have a dataframe like the one below...
df <- data.frame(
  row.names = c(1, 2, 3, 4, 5, 6, 7, 8),
  Week = c(1, 1, 2, 2, 52, 52, 53, 53),
  State = c("Florida", "Georgia", "Florida", "Georgia",
            "Florida", "Georgia", "Florida", "Georgia"),
  Count_2001 = c(25, 16, 83, 45, 100, 98, 22, 34),
  Count_2002 = c(3, 78, 22, 5, 78, 6, 88, 97)
)
I am now trying to manipulate this dataset so that only weeks 52 and 53 get summed together for each state, across all of the Count columns. Similar to this example: GROUP BY for specific rows.
The new dataset should have these rows summed together to create the new Week 52 row for each state, like this example below...
df2 <- data.frame(
  row.names = c(1, 2, 3, 4, 5, 6),
  Week = c(1, 1, 2, 2, 52, 52),
  State = c("Florida", "Georgia", "Florida", "Georgia", "Florida", "Georgia"),
  Count_2001 = c(25, 16, 83, 45, 122, 132),
  Count_2002 = c(3, 78, 22, 5, 166, 103)
)
Is there an easy solution for this in R?

Change your 53s to 52s and do a sum by group:
library(dplyr)
df %>%
  mutate(Week = case_when(Week == 53 ~ 52, TRUE ~ Week)) %>%
  group_by(State, Week) %>%
  summarize(across(everything(), sum))
# # A tibble: 6 x 4
# # Groups:   State [2]
#   State    Week Count_2001 Count_2002
#   <chr>   <dbl>      <dbl>      <dbl>
# 1 Florida     1         25          3
# 2 Florida     2         83         22
# 3 Florida    52        122        166
# 4 Georgia     1         16         78
# 5 Georgia     2         45          5
# 6 Georgia    52        132        103
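If more Count columns get added in later years, it may be safer to target them by prefix instead of everything(); a minimal variant of the same pipe, assuming all such columns share the Count_ prefix:
library(dplyr)
df %>%
  mutate(Week = if_else(Week == 53, 52, Week)) %>%
  group_by(State, Week) %>%
  summarize(across(starts_with("Count"), sum), .groups = "drop")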

Using aggregate.
s <- 52:53
tp <- transform(
  aggregate(cbind(Count_2001, Count_2002) ~ State, df[df$Week %in% s, ], sum),
  Week = 52
)
df <- merge(df[!df$Week %in% s, ], tp, all = TRUE)
df
#   Week   State Count_2001 Count_2002
# 1    1 Florida         25          3
# 2    1 Georgia         16         78
# 3    2 Florida         83         22
# 4    2 Georgia         45          5
# 5   52 Florida        122        166
# 6   52 Georgia        132        103
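A shorter base-R sketch of the same idea, assuming week 53 is the only week that needs folding into 52:
df$Week <- pmin(df$Week, 52)  # folds 53 into 52
aggregate(cbind(Count_2001, Count_2002) ~ State + Week, df, sum)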

A simple alternative to anything state-specific is to create a new column with weeks at the level of aggregation you want.
Using the tidyverse, that looks like:
df <- df %>%
  mutate(week1 = if_else(Week %in% c(52, 53), 52, Week))
and then you can summarise:
dfsumm <- df %>%
  group_by(State, week1) %>%
  summarise(across(starts_with("Count"), sum), .groups = "drop")

lag() with group_by between current and last observation in R

Edit: I found the solution with na.locf() from zoo:
library(zoo)
data <-
  data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(weight.io = na.locf(weight, na.rm = FALSE)) %>%  # na.rm = FALSE keeps leading NAs so lengths match
  mutate(lag_weight = weight - lag(weight.io))
I have a dataset below.
set.seed(42000)
data <- tibble(
  country = sample(letters[1:20], size = 100, replace = TRUE),
  weight = round(runif(100, min = 48, max = 90)))
data <- data %>%
  group_by(country) %>%
  arrange(weight) %>%
  mutate(wave = seq_along(weight))
n_rows <- nrow(data)
perc_missing <- 10
data[sample(1:n_rows, round(perc_missing / 100 * n_rows, 0)), "weight"] <- NA
I would like to obtain the difference between one country's current "weight" and the last observed "weight" for each wave.
For country "a" wave 5, I want the value to be 69 - 65 (the last observed weight at wave < 5).
And for wave 8, 82 (the weight at wave 8) - 69 (the weight at wave 5).
My approach was the one below, but it didn't work.
data <-
  data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight, na.rm = TRUE)))
Thank you!
I think this is a combination of diff (instead of lag, though that could work just as well) and, more importantly, tidyr::fill (or zoo::na.locf; a sketch of that variant appears at the end of this answer):
BTW, na.rm= is not an argument of first(), so I've removed it.
library(dplyr)
# library(tidyr) # fill
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight)))
# # A tibble: 10 x 4
# # Groups:   country [1]
#    country weight  wave lag_weight
#    <chr>    <dbl> <int>      <dbl>
#  1 a           54     1          0
#  2 a           55     2          1
#  3 a           65     3         10
#  4 a           65     4          0
#  5 a           69     5          4
#  6 a           69     6          0
#  7 a           69     7          0
#  8 a           82     8         13
#  9 a           82     9          0
# 10 a           85    10          3
The issue here is that weight is over-written with the LOCF (last-observation carried forward) value instead of preserving the NA values. If that's important, then you can make another weight variable for temporary use (and remove it):
data %>%
  mutate(tmpweight = weight) %>%
  group_by(country) %>%
  tidyr::fill(tmpweight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = tmpweight - lag(tmpweight, default = first(tmpweight))) %>%
  select(-tmpweight)
# # A tibble: 10 x 4
# # Groups:   country [1]
#    country weight  wave lag_weight
#    <chr>    <dbl> <int>      <dbl>
#  1 a           54     1          0
#  2 a           55     2          1
#  3 a           65     3         10
#  4 a           NA     4          0
#  5 a           69     5          4
#  6 a           NA     6          0
#  7 a           NA     7          0
#  8 a           82     8         13
#  9 a           82     9          0
# 10 a           85    10          3
FYI, you can use c(0, diff(weight)) instead of weight - lag(weight) for the same effect. Since diff() returns a vector one element shorter (it gives the gap between consecutive values), we prepend a 0 here:
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = c(0, diff(weight)))
(The filter(country == "a") is purely for demonstration to match your example, not that it is required for this solution.)
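And for completeness, a sketch of the zoo::na.locf() variant mentioned above (weight_filled is just an illustrative temporary name):
library(dplyr)
library(zoo)
data %>%
  group_by(country) %>%
  arrange(wave, .by_group = TRUE) %>%
  mutate(weight_filled = na.locf(weight, na.rm = FALSE),  # LOCF, keeping leading NAs so lengths match
         lag_weight = weight_filled - lag(weight_filled, default = first(weight_filled))) %>%
  select(-weight_filled)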

Count number of outliers by group in r and store count in new dataframe

I have a dataset with 2 columns: column A is State_Name, which has 5 different states, and column B is Total_Spend, which has the average total spend of that state per day. There are 365 observations for each state.
What I want to do is count the number of outliers PER STATE using the 1.5 IQR rule and save the count of outliers per state to a new df or table.
So I would expect an output something like:
State   Outlier Count
ATL     5
GA      20
MI      11
NY      50
TX      23
I have managed to get it to work by doing it one state at a time but I can't figure out what to do to achieve this in a single go.
Here is my code at the moment (to return the result for a single state):
outlier_df <- daily_agg %>%
  select(State_Name, Total_Spend) %>%
  filter(State_Name == "NY")
outlier_NY <- length(boxplot.stats(outlier_df$Total_Spend)$out)
Any help would be appreciated.
Thanks!
EDIT WITH TEST DATASET
outlier_mtcars <- mtcars %>%
  select(cyl, disp) %>%
  filter(cyl == 6)
outliers <- length(boxplot.stats(outlier_mtcars$disp)$out)
The above shows me 1 outlier for 6-cylinder cars, but I want a table that shows how many outliers there are for 4-, 6-, and 8-cylinder cars.
Since I'm not very familiar with the function boxplot.stats, I didn't use it in my solution and instead manually calculated the upper fence, 1.5 * IQR above the upper quartile (so only high-side outliers are flagged).
Here mtcars is used as an example. Records that are outliers are flagged as TRUE, and the flags are summed up in summarize.
library(dplyr)
mtcars %>%
  group_by(cyl) %>%
  mutate(flag = disp >= (IQR(disp) * 1.5 + quantile(disp, probs = 0.75)), .keep = "used") %>%
  summarize(Outlier = sum(flag))
# A tibble: 3 × 2
    cyl Outlier
  <dbl>   <int>
1     4       0
2     6       1
3     8       0
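If you'd rather keep the boxplot.stats() approach from the question (it applies the 1.5 IQR fence on both tails), a grouped sketch:
library(dplyr)
mtcars %>%
  group_by(cyl) %>%
  summarize(Outlier = length(boxplot.stats(disp)$out))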
Since I don't have your data, I'll make some up with the two columns you mention:
df <- data.frame(state = sample(c("ny", "fl"), 100, replace = TRUE),
                 spend = sample(1:100, 100, replace = TRUE))
> head(df)
  state spend
1    ny     3
2    fl    87
3    ny    91
4    fl    97
5    ny    47
6    fl     8
Then set your upper and lower bounds (these could be quartiles, absolute cutoffs, whatever fits):
df %>%
  group_by(state) %>%
  mutate(lower_bound = quantile(spend, 0.25),
         upper_bound = quantile(spend, 0.75)) %>%
  mutate(is_outlier = if_else(spend < lower_bound | spend > upper_bound, TRUE, FALSE))
# A tibble: 10 × 5
# Groups:   state [2]
  state spend lower_bound upper_bound is_outlier
  <chr> <int>       <dbl>       <dbl> <lgl>
1 ny        3          38          84 TRUE
2 fl       87          26          87 FALSE
3 ny       91          38          84 TRUE
4 fl       97          26          87 TRUE
Then if you only want to see the output, summarise by is_outlier:
df %>%
  group_by(state) %>%
  mutate(lower_bound = quantile(spend, 0.25),
         upper_bound = quantile(spend, 0.75)) %>%
  mutate(is_outlier = if_else(spend < lower_bound | spend > upper_bound, TRUE, FALSE)) %>%
  summarise(outliers = sum(is_outlier))
  state outliers
  <chr>    <int>
1 fl          19
2 ny          30
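Note that the bounds above are the quartiles themselves, so roughly half the points get flagged; for the literal 1.5 IQR rule from the question, widen the fences. A sketch:
df %>%
  group_by(state) %>%
  summarise(outliers = sum(spend < quantile(spend, 0.25) - 1.5 * IQR(spend) |
                             spend > quantile(spend, 0.75) + 1.5 * IQR(spend)))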

In R, how to summarize a data frame in multiple dimensions

There is a data frame raw_data as below. How can I change it to wished_data in an easy way?
I currently group_by/summarise the data several times (adding variables) and then rbind the results. But this gets tedious, especially when there are more variables than in this example.
Is there a general method for situations like this? Thanks!
library(tidyverse)
country <- c('UK','US','UK','US')
category <- c("A", "B", "A", "B")
y2021 <- c(17, 42, 21, 12)
y2022 <- c(49, 23, 52, 90)
raw_data <- data.frame(country,category,y2021,y2022)
We may use rollup/cube/groupingsets from data.table
library(data.table)
out <- rbind(setDT(raw_data),
             groupingsets(raw_data, j = lapply(.SD, sum),
                          by = c("country", "category"),
                          sets = list("country", "category", character())))
out[is.na(out)] <- 'TOTAL'
Output:
> out
   country category y2021 y2022
    <char>   <char> <num> <num>
1:      UK        A    17    49
2:      US        B    42    23
3:      UK        A    21    52
4:      US        B    12    90
5:      UK    TOTAL    38   101
6:      US    TOTAL    54   113
7:   TOTAL        A    38   101
8:   TOTAL        B    54   113
9:   TOTAL    TOTAL    92   214
Or with cube
out <- rbind(raw_data,
             cube(raw_data, j = .(y2021 = sum(y2021), y2022 = sum(y2022)),
                  by = c("country", "category")))
out[is.na(out)] <- 'TOTAL'
We can use the adorn_totals function from janitor. The helper get_totals below accepts a data frame and a column, and outputs the data frame with totals for the numeric columns, one such row for each level of the specified column. It then extracts the total rows and, since adorn_totals can rearrange the column order, uses select to restore the original order so that we can later bind multiple instances together. We then bind together the original data frame and each of the total-row data frames that we want.
library(dplyr)
library(janitor)
get_totals <- function(data, col) {
  data %>%
    group_by({{col}}) %>%
    group_modify(~ adorn_totals(.)) %>%
    ungroup %>%
    filter(rowSums(. == "Total") > 0) %>%
    select(any_of(names(data)))
}
bind_rows(
  raw_data,
  get_totals(raw_data, category),
  get_totals(raw_data, country),
  get_totals(raw_data)
)
giving:
  country category y2021 y2022
1      UK        A    17    49
2      US        B    42    23
3      UK        A    21    52
4      US        B    12    90
5   Total        A    38   101
6   Total        B    54   113
7      UK    Total    38   101
8      US    Total    54   113
9   Total        -    92   214
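For comparison, a plain-dplyr sketch of the "boring" approach from the question, kept short with a hypothetical totals() helper and across() so extra year columns come along for free:
library(dplyr)
totals <- function(data, ...) {
  data %>%
    group_by(...) %>%
    summarise(across(starts_with("y"), sum), .groups = "drop")
}
bind_rows(
  raw_data,
  totals(raw_data, country) %>% mutate(category = "TOTAL"),
  totals(raw_data, category) %>% mutate(country = "TOTAL"),
  totals(raw_data) %>% mutate(country = "TOTAL", category = "TOTAL")
)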

combine/merge rows with dplyr

I have data that looks like
Month Location Money
1     Miami    12
1     Cal      15
2     Miami     5
2     Cal       3
...
12    Miami     6
12    Cal       8
I want to transform it so it looks like
Month  Location Money
Spring Miami    sum(from Month = 1, 2, 3)
Spring Cal      sum(from Month = 1, 2, 3)
Summer ...
Summer ...
Fall   ...
Fall   ...
Winter ...
Winter ...
I don't know how to ask the question directly (merging rows? aggregating rows?), but googling it only returns dplyr::group_by and summarize, which collapse rows based on a single value.
I want to collapse/summarise the data based on multiple row values.
Is there an easy way? Any help would be appreciated. Thanks!
It sounds like you want to:
1. assign a season to each record,
2. group_by season,
3. summarize.
If this is where you are going, you can either create a new column or do it directly inside group_by. You could also create a separate table with month and season and left_join it to your data.
library(dplyr)
## simulate data
df = tibble(
  month = rep(1:12, each = 4),
  location = rep(c("Cal", "Miami"), times = 24),
  money = as.integer(runif(48, 10, 100))
)
head(df)
# # A tibble: 6 x 3
#   month location money
#   <int> <chr>    <int>
# 1     1 Cal         69
# 2     1 Miami       84
# 3     1 Cal         38
# 4     1 Miami       44
# 5     2 Cal         33
# 6     2 Miami       64
## Create season based on month in groups of 3
df %>%
  mutate(season = (month - 1) %/% 3 + 1) %>%
  group_by(season, location) %>%
  summarize(Monthly_Total = sum(money))
# # A tibble: 8 x 3
# # Groups:   season [4]
#   season location Monthly_Total
#    <dbl> <chr>            <int>
# 1      1 Cal                360
# 2      1 Miami              265
# 3      2 Cal                392
# 4      2 Miami              380
# 5      3 Cal                348
# 6      3 Miami              278
# 7      4 Cal                358
# 8      4 Miami              411
Using the same data you can skip the column creation and include it in group_by:
df %>%
  group_by(season = (month - 1) %/% 3 + 1, location) %>%
  summarize(Monthly_Total = sum(money))
## results identical to above.
It may make more sense to just create a season table:
seasons = tibble(
  month = 1:12,
  season = rep(c("Spring", "Summer", "Fall", "Winter"), each = 3)
)
df %>%
  left_join(seasons) %>%
  group_by(season, location) %>%
  summarize(Monthly_Total = sum(money))
## again identical to above
The latter has the advantage of being more transparent.
You could aggregate after transforming the Month variable:
aggregate(Money ~ Month + Location, transform(data, Month = (Month - 1) %/% 3), sum)
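The same base-R idea with season labels instead of numbers (a sketch, assuming the question's mapping of months 1-3 to Spring and so on):
data$Season <- c("Spring", "Summer", "Fall", "Winter")[(data$Month - 1) %/% 3 + 1]
aggregate(Money ~ Season + Location, data, sum)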

linear interpolation (approx) by group in a dplyr pipe in R

I have a question that I find kind of hard to explain with an MRE and in an easy way to answer, mostly because I don't fully understand where the problem lies myself. So that's my sorry-for-being-vague preamble.
I have a tibble with many sample and reference measurements, for which I want to do some linear interpolation for each sample. I do this now by taking out all the reference measurements, rescaling them to sample measurements using approx, and then patching them back in. But because I take them out first, I cannot do it nicely in a group_by dplyr pipe. Right now I use a really ugly workaround where I add empty (NA) new columns to the sample tibble, then fill them with a for-loop.
So my question is really: how can I implement the approx part within groups in the pipe, so that I can do everything within groups? I've experimented with dplyr::do(), and ran into the vignette on "programming with dplyr", but searching mostly gives me broom::augment and lm stuff that I think operates differently (e.g. see Using approx() with groups in dplyr). This thread also seems promising: How do you use approx() inside of mutate_at()?
Somebody on IRC recommended a conditional mutate with case_when, but I don't fully understand where and how that fits in this context yet.
I think the problem lies in the fact that I want to filter out part of the data for the following mutate operations, but the mutate operations rely on the grouped data that I just filtered out, if that makes any sense.
Here's an MWE:
library(tidyverse) # or just dplyr, tibble

# create fake data
data <- data.frame(
  # in reality a dttm with the measurement time
  timestamp = c(rep("a", 7), rep("b", 7), rep("c", 7)),
  # measurement cycle, normally 40 for sample, 41 for reference
  cycle = rep(c(rep(1:3, 2), 4), 3),
  # whether the measurement is a reference or a sample
  isref = rep(c(rep(FALSE, 3), rep(TRUE, 4)), 3),
  # measurement intensity for mass 44
  r44 = c(28:26, 30:26, 36, 33, 31, 38, 34, 33, 31, 18, 16, 15, 19, 18, 17)) %>%
  # measurement intensity for mass 45, normally also masses up to mass 49
  mutate(r45 = r44 + rnorm(21, 20))
# of course this could be tidied up to "intensity" with a new column "mass"
# (44, 45, ...), but that would make making comparisons even harder...
# overview plot
data %>%
  ggplot(aes(x = cycle, y = r44, colour = isref)) +
  geom_line() +
  geom_line(aes(y = r45), linetype = 2) +
  geom_point() +
  geom_point(aes(y = r45), shape = 1) +
  facet_grid(~ timestamp)
# what I would like to do
data %>%
  group_by(timestamp) %>%
  do(target_cycle = approx(x = data %>% filter(isref) %>% pull(r44),
                           y = data %>% filter(isref) %>% pull(cycle),
                           xout = data %>% filter(!isref) %>% pull(r44))$y) %>%
  unnest()
# immediately append this new column to the original dataframe for all the
# samples (!isref) and then apply another approx for those values.
# here's my current attempt for one of the timestamps
matchref <- function(dat) {
  # split the data into sample gas and reference gas
  ref <- filter(dat, isref)
  smp <- filter(dat, !isref)

  # calculate the "target cycle", the points at which the reference intensity
  # 44 matches the sample intensity 44 with linear interpolation
  target_cycle <- approx(x = ref$r44, y = ref$cycle, xout = smp$r44)

  # append the target cycle to the sample gas
  smp <- smp %>%
    group_by(timestamp) %>%
    mutate(target = target_cycle$y)

  # linearly interpolate each reference gas to the target cycle
  ref <- ref %>%
    group_by(timestamp) %>%
    # this is needed because the reference has one more cycle
    mutate(target = c(target_cycle$y, NA)) %>%
    # filter out all the failed ones (no interpolation possible)
    filter(!is.na(target)) %>%
    # calculate interpolated value based on r44 interpolation (i.e., don't
    # actually interpolate this value but shift it based on the 44
    # interpolation)
    mutate(r44 = approx(x = cycle, y = r44, xout = target)$y,
           r45 = approx(x = cycle, y = r45, xout = target)$y) %>%
    select(timestamp, target, r44:r45)

  # add new reference gas intensities to the correct sample gasses by the target cycle
  left_join(smp, ref, by = c("timestamp", "target"))
}
matchref(data)

# and because now "target" must be length 3 (the group size) or one, not 9,
# I have to create this ugly for-loop,
# for which I create a copy of data that has the new columns to be created
mr <- data %>%
  # filter the sample gasses (since we convert ref to sample)
  filter(!isref) %>%
  # add empty new columns
  mutate(target = NA, r44 = NA, r45 = NA)

# apply matchref for each group timestamp
for (grp in unique(data$timestamp)) {
  mr[mr$timestamp == grp, ] <- matchref(data %>% filter(timestamp == grp))
}
Here's one approach that spreads the references and samples to new columns. I drop r45 for simplicity in this example.
data %>%
  select(-r45) %>%
  mutate(isref = ifelse(isref, "REF", "SAMP")) %>%
  spread(isref, r44) %>%
  group_by(timestamp) %>%
  mutate(target_cycle = approx(x = REF, y = cycle, xout = SAMP)$y) %>%
  ungroup
gives,
#    timestamp cycle   REF  SAMP target_cycle
#    <fct>     <dbl> <dbl> <dbl>        <dbl>
#  1 a             1    30    28          3
#  2 a             2    29    27          4
#  3 a             3    28    26         NA
#  4 a             4    27    NA         NA
#  5 b             1    31    26         NA
#  6 b             2    38    36          2.5
#  7 b             3    34    33          4
#  8 b             4    33    NA         NA
#  9 c             1    15    31         NA
# 10 c             2    19    18          3
# 11 c             3    18    16          2.5
# 12 c             4    17    NA         NA
Edit, to address a follow-up comment:
To retain r45 you can use a gather-unite-spread approach like this:
data %>%
  mutate(isref = ifelse(isref, "REF", "SAMP")) %>%
  gather(r, value, r44:r45) %>%
  unite(ru, r, isref, sep = "_") %>%
  spread(ru, value) %>%
  group_by(timestamp) %>%
  mutate(target_cycle_r44 = approx(x = r44_REF, y = cycle, xout = r44_SAMP)$y) %>%
  ungroup
giving,
# # A tibble: 12 x 7
#    timestamp cycle r44_REF r44_SAMP r45_REF r45_SAMP target_cycle_r44
#    <fct>     <dbl>   <dbl>    <dbl>   <dbl>    <dbl>            <dbl>
#  1 a             1      30       28    49.5     47.2              3
#  2 a             2      29       27    48.8     48.7              4
#  3 a             3      28       26    47.2     46.8             NA
#  4 a             4      27       NA    47.9     NA               NA
#  5 b             1      31       26    51.4     45.7             NA
#  6 b             2      38       36    57.5     55.9              2.5
#  7 b             3      34       33    54.3     52.4              4
#  8 b             4      33       NA    52.0     NA               NA
#  9 c             1      15       31    36.0     51.7             NA
# 10 c             2      19       18    39.1     37.9              3
# 11 c             3      18       16    39.2     35.3              2.5
# 12 c             4      17       NA    39.0     NA               NA
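Building on that, the second interpolation step from the question (shifting the reference intensities onto the matched cycles, as matchref() does) can stay in the same grouped pipe. A sketch reusing the gather-unite-spread frame above; the *_shifted names are just illustrative:
data %>%
  mutate(isref = ifelse(isref, "REF", "SAMP")) %>%
  gather(r, value, r44:r45) %>%
  unite(ru, r, isref, sep = "_") %>%
  spread(ru, value) %>%
  group_by(timestamp) %>%
  mutate(target_cycle_r44 = approx(x = r44_REF, y = cycle, xout = r44_SAMP)$y,
         # shift the reference series onto the matched cycles (NA where no match)
         r44_shifted = approx(x = cycle, y = r44_REF, xout = target_cycle_r44)$y,
         r45_shifted = approx(x = cycle, y = r45_REF, xout = target_cycle_r44)$y) %>%
  ungroup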
