sum by group including intermediate groups - r

I have:
df <- data.frame(group=c(1,1,2,4,4,5), value=c(3,1,5,2,3,6))
aggregate(value ~ group, data = df, FUN = 'sum')
group value
1 1 4
2 2 5
3 4 5
4 5 6
is there a way to include intermediate groups to return the below? I realise this could be done by creating a dataframe with all the desired groups and matching in the results from aggregate() but I am hoping there is a cleaner way to do this. it would need to be as fast as using aggregate and only use base r packages - this is due to restrictions in my workplace.
group value
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6

You can try this .
library(tidyr)
library(dplyr)
df %>%
mutate(group=factor(group, 1:5)) %>%
complete(group) %>%group_by(group)%>%
dplyr::summarise(value=sum(value,na.rm = T))
group value
<fctr> <dbl>
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6

You can do this easily with the tidyverse:
library(dplyr)
library(tidyr)
df %>%
group_by(group) %>%
summarise(valuesum = sum(value)) %>%
full_join(., expand(df, group = 1:5)) %>%
complete(group, fill = list(valuesum = 0))
The result:
# A tibble: 5 x 2
group valuesum
<dbl> <dbl>
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6
Or a bit more difficult to understand with data.table:
library(data.table)
setDT(df)[.(group = 1:5), on = 'group', sum(value, na.rm = TRUE), by = .EACHI]

You can use mergefrom base R. I've changed the name of your data.frame to dat, since df is the name of an R function.
dat <- read.table(text = "
group value
1 4
2 5
4 5
5 6
", header = TRUE)
str(dat)
res <- aggregate(value ~ group, data = dat, FUN = 'sum')
merge(res, data.frame(group = seq(from = min(res$group), to = max(res$group))), all = TRUE)
Note that there will be a NA, not a zero. I believe that you should solve that by leaving it as a missing value.

Related

Row mean of two matching columns with same name but differ by: '_1' and '_2'

Lets say I have the dataframe:
z = data.frame(col_1 = c(1,2,3,4), col_2 = c(3,4,5,6))
col_1 col_2
1 1 3
2 2 4
3 3 5
4 4 6
I want to take columns with the same name that only differ by the number e.g. '_1' and '_2' and take the pairwise mean. In reality I have a big dataframe with many pairs and they are not in a nice order, therefore looking for a clever solution that can be applied to this.
So the output should look like this:
col
1 2
2 3
3 4
4 5
With the column name given as the same as the column pair but with the additional label removed.
Any help would be great thanks.
Here is a base R option using list2DF + split.default + rowMeans
list2DF(lapply(split.default(z,gsub("_\\d+","",names(z))),rowMeans))
which gives
col
1 2
2 3
3 4
4 5
Try this tidyverse approach. By using separate() you can extract the name and then with reshaping you can reach the desired output. Here the code:
library(dplyr)
library(tidyr)
#Data
z = data.frame(col_1 = c(1,2,3,4), col_2 = c(3,4,5,6))
#Code
z1 <- z %>% mutate(id=1:n()) %>%
pivot_longer(-id) %>%
separate(name,c('var1','var2'),sep='_') %>%
group_by(id,var1) %>% summarise(Mean=mean(value)) %>%
pivot_wider(names_from = var1,values_from=Mean) %>% ungroup() %>% select(-id)
Output:
# A tibble: 4 x 1
col
<dbl>
1 2
2 3
3 4
4 5
Here is a purrr oriented solution:
library(purrr)
library(stringr)
split.default(z, str_remove(names(z), "[:digit:]+$")) %>% map_dfc(rowMeans)
#> # A tibble: 4 x 1
#> col_
#> <dbl>
#> 1 2
#> 2 3
#> 3 4
#> 4 5
It works even if z is:
z <- data.frame(col_1 = c(1,2,3,4),
col_2 = c(3,4,5,6),
anothercol_1 = c(1,2,3,4),
anothercol_2 = c(3,4,5,6))

Rolling sum of one variable in data.frame in number of steps defined by another variable

I'm trying to sum up the values in a data.frame in a cumulative way.
I have this:
df <- data.frame(
a = rep(1:2, each = 5),
b = 1:10,
step_window = c(2,3,1,2,4, 1,2,3,2,1)
)
I'm trying to sum up the values of b, within the groups a. The trick is, I want the sum of b values that corresponds to the number of rows following the current row given by step_window.
This is the output I'm looking for:
data.frame(
a = rep(1:2, each = 5),
step_window = c(2,3,1,2,4,
1,2,3,2,1),
b = 1:10,
sum_b_step_window = c(3, 9, 3, 9, 5,
6, 15, 27, 19, 10)
)
I tried to do this using the RcppRoll but I get an error Expecting a single value:
df %>%
group_by(a) %>%
mutate(sum_b_step_window = RcppRoll::roll_sum(x = b, n = step_window))
I'm not sure if having variable window size is possible in any of the rolling function. Here is one way to do this using map2_dbl :
library(dplyr)
df %>%
group_by(a) %>%
mutate(sum_b_step_window = purrr::map2_dbl(row_number(), step_window,
~sum(b[.x:(.x + .y - 1)], na.rm = TRUE)))
# a b step_window sum_b_step_window
# <int> <int> <dbl> <dbl>
# 1 1 1 2 3
# 2 1 2 3 9
# 3 1 3 1 3
# 4 1 4 2 9
# 5 1 5 4 5
# 6 2 6 1 6
# 7 2 7 2 15
# 8 2 8 3 27
# 9 2 9 2 19
#10 2 10 1 10
1) rollapply
rollapply in zoo supports vector widths. partial=TRUE says that if the width goes past the end then use just the values within the data. (Another possibility would be to use fill=NA instead in which case it would fill with NA's if there were not enough data left) . align="left" specifies that the current value at each step is the left end of the range to sum.
library(dplyr)
library(zoo)
df %>%
group_by(a) %>%
mutate(sum = rollapply(b, step_window, sum, partial = TRUE, align = "left")) %>%
ungroup
2) SQL
This can also be done in SQL by left joining df to itself on the indicated condition and then for each row summing over all rows for which the condition matches.
library(sqldf)
sqldf("select A.*, sum(B.b) as sum
from df A
left join df B on B.rowid between A.rowid and A.rowid + A.step_window - 1
and A.a = B.a
group by A.rowid")
Here is a solution with the package slider.
library(dplyr)
library(slider)
df %>%
group_by(a) %>%
mutate(sum_b_step_window = hop_vec(b, row_number(), step_window+row_number()-1, sum)) %>%
ungroup()
It is flexible on different window sizes.
Output:
# A tibble: 10 x 4
a b step_window sum_b_step_window
<int> <int> <dbl> <int>
1 1 1 2 3
2 1 2 3 9
3 1 3 1 3
4 1 4 2 9
5 1 5 4 5
6 2 6 1 6
7 2 7 2 15
8 2 8 3 27
9 2 9 2 19
10 2 10 1 10
slider is a couple-of-months-old tidyverse package specific for sliding window functions. Have a look here for more info: page, vignette
hop is the engine of slider. With this solution we are triggering different .start and .stop to sum the values of b according to the a groups.
With _vec you're asking hop to return a vector: a double in this case.
row_number() is a dplyr function that allows you to return the row number of each group, thus allowing you to slide along the rows.
data.table solution using cumulative sums
setDT(df)
df[, sum_b_step_window := {
cs <- c(0,cumsum(b))
cs[pmin(.N+1, 1:.N+step_window)]-cs[pmax(1, (1:.N))]
},by = a]

Flip group_by variable to columns, and flip columns to rows dplyr

thank you in advance for your response! I am working in Rstudio, trying to create a specific table format that my customer is looking for. Specifically, I would like to show each metric as a row and the group_by variable, in this case application type, as a column. I'm using group_by to consolidate all my data by application type, and I'm using the summarise function to create the new variables.
subs <- data.frame(
App_type = c('A','A','A','B','B','B','C','C','C','C'),
Has_error = c(1,1,1,0,0,1,1,0,1,1),
Has_critical_error = c(1,0,1,0,0,1,0,0,1,1)
)
I'm able to group the submissions together by application type to see total submissions with errors and total with critical errors. Here's what I've done -
subs %>%
group_by(App_type) %>%
summarise(
total_sub = n(),
total_error = sum(Has_error),
total_critical_error = sum(Has_critical_error)
)
# A tibble: 3 x 4
App_type total_sub total_error total_critical_error
<fct> <int> <dbl> <dbl>
1 A 3 3 2
2 B 3 1 1
3 C 4 3 2
However, my customer would like to see it this way with application totals.
A B C TOTAL
1 total_sub 3 3 4 10
2 total_error 3 1 3 7
3 total_critical_error 2 1 2 5
We can pivot to 'wide' format after reshaping to 'long' and then change the column name 'name' to rowname
library(dplyr)
library(tidyr)
library(tibble)
subs %>%
group_by(App_type) %>%
summarise(
total_sub = n(),
total_error = sum(Has_error),
total_critical_error = sum(Has_critical_error)) %>%
pivot_longer(cols = -App_type) %>%
pivot_wider(names_from = App_type, values_from = value) %>%
mutate(TOTAL = A + B + C) %>%
column_to_rownames("name")
# A B C TOTAL
#total_sub 3 3 4 10
#total_error 3 1 3 7
#total_critical_error 2 1 2 5
Or another option is transpose from data.table
library(data.table)
data.table::transpose(setDT(out), make.names = 'App_type',
keep.names = 'name')[, TOTAL := A + B + C][]
where out is the OP's summarised output
out <- subs %>%
group_by(App_type) %>%
summarise(
total_sub = n(),
total_error = sum(Has_error),
total_critical_error = sum(Has_critical_error)
)
Or with base R
addmargins(t(cbind(total_sub = as.integer(table(subs$App_type)),
rowsum(subs[-1], subs$App_type))), 2)
# A B C Sum
#total_sub 3 3 4 10
#Has_error 3 1 3 7
#Has_critical_error 2 1 2 5

How to create segmented plot that uses data in dataframe in sequential order?

I have a dataframe that looks like this:
> print(df)
person step start end
1 sam A 0 4
2 sam B 4 6
3 greg A 2 7
4 greg B 7 11
And I created the following plot:
ggplot(df, aes(colour=step)) +
geom_segment(aes(x=start, xend=end, y=person, yend=person), size=3) +
xlab("Duration")
This looks exactly as I want it to be. However, my input data structure has changed to the following:
step sam greg
1 A 0 2
2 B 4 7
3 C 6 11
This structure has basically the same meaning, but I don't know how I can easily convert it so that geom_segment understands that it is now column-wise and not per row. Do I need to transform it back to the old data structure (with start and end) or can I somehow workaround that?
It is actually not a ggplot2 question, but a data wrangling question with the goal to create your first data frame based on your second data frame. Here is an option with dplyr and tidyr.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
gather(person, Value, -step) %>%
group_by(person) %>%
slice(rep(row_number(), each = 2)) %>%
slice(2:(n() - 1)) %>%
mutate(Type = rep(c( "start", "end"), times = n()/2),
step = rep(LETTERS[1:(n()/2)], each = 2)) %>%
spread(Type, Value) %>%
arrange(person, step) %>%
select(step, person, start, end) %>%
arrange(desc(person), step) %>%
ungroup()
dat2
# # A tibble: 4 x 4
# step person start end
# <chr> <chr> <int> <int>
# 1 A sam 0 4
# 2 B sam 4 6
# 3 A greg 2 7
# 4 B greg 7 11
DATA
dat <- read.table(text = " step sam greg
1 A 0 2
2 B 4 7
3 C 6 11",
header = TRUE, stringsAsFactors = FALSE)

Collapse data frame into single row and creating new columns based on row R

I have a data frame with object names and a list of statistical moments for that object, like this:
Object Mean IQR Skew
x 1 1 1
y 2 2 2
z 3 3 3
What i want is to for each row create columns with the statistical moments and the object name prefixed. Like so:
xMean xIQR xSkew yMean yIQR ySkew zMean zIQR zSkew
1 1 1 2 2 2 3 3 3
In essence what I need is to collapse a data frame to a single row such that it list all statistical moments on a single line as i'll have many rows like the final one but a finite set of columns.
You could do:
df1$id <- 1
reshape(df1, idvar="id", timevar="Object", direction="wide")[-1]
# Mean.x IQR.x Skew.x Mean.y IQR.y Skew.y Mean.z IQR.z Skew.z
#1 1 1 1 2 2 2 3 3 3
Or using dcast, melt from reshape2
library(reshape2)
dcast(melt(df1, id.var=c('id', 'Object')), id~..., value.var='value')[-1]
# x_Mean x_IQR x_Skew y_Mean y_IQR y_Skew z_Mean z_IQR z_Skew
#1 1 1 1 2 2 2 3 3 3
Or using dplyr and tidyr
library(dplyr)
library(tidyr)
df1 %>%
gather(Var, Val, Mean:Skew) %>%
unite(VarNew,Object, Var, sep="") %>%
spread(VarNew, Val) %>%
select(-id)
# xIQR xMean xSkew yIQR yMean ySkew zIQR zMean zSkew
#1 1 1 1 2 2 2 3 3 3
data
df1 <- structure(list(Object = c("x", "y", "z"), Mean = 1:3, IQR = 1:3,
Skew = 1:3), .Names = c("Object", "Mean", "IQR", "Skew"), class = "data.frame", row.names = c(NA,
-3L))
Or maybe something like
setNames(unlist(data.frame(t(df[-1]))), paste0(rep(df[, 1], each = nrow(df)), names(df[, -1])))
# xMean xIQR xSkew yMean yIQR ySkew zMean zIQR zSkew
# 1 1 1 2 2 2 3 3 3

Resources