Summing rows based on conditional in groups - r

I previously asked a question related to this one, but I need a more elegant and general way to solve it.
I have data separated into groups, and I want to sum some rows in a range based on a condition. I prefer to use 'dplyr' for this because it's more straightforward for me to understand.
The conditions I need are as follows:
1: for group 1 ;
find the first occurrence of '10' and sum the rows after this occurrence to the end of the group and count how many rows.
2: for group 2; find the last occurrence of '10' and sum the rows before this occurrence to the beginning of the group and count how many rows.
3: for group 3; find the first occurrence of '10' and sum the rows before this occurrence to the starting row of the group and count how many rows.
df <- data.frame(gr=rep(c(1,2,3),c(7,9,11)),
y_value=c(c(0,0,10,8,8,6,0),c(10,10,10,8,7,6,2,0,0), c(8,5,8,7,6,2,10,10,8,7,0)))
> df
gr y_value
1 1 0
2 1 0
3 1 10
4 1 8
5 1 8
6 1 6
7 1 0
8 2 10
9 2 10
10 2 10
11 2 8
12 2 7
13 2 6
14 2 2
15 2 0
16 2 0
17 3 8
18 3 5
19 3 8
20 3 7
21 3 6
22 3 2
23 3 10
24 3 10
25 3 8
26 3 7
27 3 0
I guess something like this should work, but I cannot figure out how to implement it with dplyr
# Sketch of the intended per-group helper; the `*` lines are placeholders for
# logic that is not written here, so this does not parse as R as-is.
# y:  values that are scanned for the value 10.
# gr: group identifier used to pick which rule applies (1 or 2 below).
count <- function(y,gr){
# NOTE(review): `&` is the elementwise operator and `if` needs a single
# TRUE/FALSE, so this presumably expects `gr` to have length 1 - confirm.
if (any(y==10)&(gr==1)) {
*
*
*
if (any(y==10)&(gr==2))
*
*
*
*
}
}
# Load dplyr before the pipeline starts. (A dangling `df %>%` placed ahead of
# `library(dplyr)` would pipe `df` into `library()` as its first argument.)
library(dplyr)
# Apply the helper once per group. Inside do() the current group's rows are
# only available through the `.` pronoun, so the columns are read as
# `.$y_value` / `.$gr` rather than by bare name.
df %>%
  group_by(gr) %>%
  do(data.frame(., count_rows = count(.$y_value, .$gr)))
expected output
> df
gr y_value sum nrow
1 1 0 22 4
2 1 0 22 4
3 1 10 22 4
4 1 8 22 4
5 1 8 22 4
6 1 6 22 4
7 1 0 22 4
8 2 10 23 6
9 2 10 23 6
10 2 10 23 6
11 2 8 23 6
12 2 7 23 6
13 2 6 23 6
14 2 2 23 6
15 2 0 23 6
16 2 0 23 6
17 3 8 28 6
18 3 5 28 6
19 3 7 28 6
20 3 6 28 6
21 3 2 28 6
22 3 10 28 6
23 3 10 28 6
24 3 8 28 6
25 3 7 28 6
26 3 0 28 6

Hope this helps!
(Edit note: modified code after OP updated his original requirement)
#sample data - I slightly changed sample data (replaced 0 by 10 in 2nd row) for group 1 to satisfy your condition
df <- data.frame(gr=rep(c(1,2,3),c(7,9,11)),
y_value=c(c(0,10,10,8,8,6,0),c(10,10,10,8,7,6,2,0,0), c(8,5,8,7,6,2,10,10,8,7,0)))
library(dplyr)
# Build one summary row per group: the total and the number of the rows that
# survive that group's selection rule.
df_temp <- df %>%
group_by(gr) %>%
# Running count of 10s seen so far within the group (0 until the first 10).
mutate(rows_to_aggregate=cumsum(y_value==10)) %>%
# Group 1 keeps rows from the first 10 onward; group 2 keeps rows before the
# first 10 plus every row equal to 10; any other group keeps only the rows
# before the first 10.
filter(ifelse(gr==1, rows_to_aggregate !=0, ifelse(gr==2, rows_to_aggregate ==0 | y_value==10, rows_to_aggregate ==0))) %>%
# Group 1 then drops its first remaining row and group 2 drops its last
# remaining row; other groups re-apply the same condition as the previous step.
filter(ifelse(gr==1, row_number(gr) != 1, ifelse(gr==2, row_number(gr) != n(), rows_to_aggregate ==0))) %>%
# Row count and total of what is left in each group.
mutate(nrow=n(), sum=sum(y_value)) %>%
# Keep only the per-group summary columns and collapse to distinct rows.
select(gr,sum,nrow) %>%
distinct()
#final output
# Attach the per-group sum and nrow to every row of the original data.
df<- left_join(df,df_temp, by='gr')

I think you're after cummax:
# Mark, per group, the rows that count towards the total (in_scope is 0/1),
# then sum the marked values and count the marked rows.
#   group 1: every row after the first 10
#   group 2: every row from the first non-10 row that directly follows a 10
#   others:  every row before the first 10
df %>%
  group_by(gr) %>%
  mutate(
    is_ten = y_value == 10,
    prev_is_ten = lag(is_ten, default = FALSE),
    in_scope = if_else(
      gr == 1,
      cummax(prev_is_ten),
      if_else(
        gr == 2,
        cummax(prev_is_ten & !is_ten),
        1L - cummax(is_ten)
      )
    )
  ) %>%
  # Still grouped by `gr` here, so no ungroup/regroup round trip is needed.
  summarise(
    the_sum = sum(y_value * in_scope),
    the_count = sum(in_scope)
  )
# A tibble: 3 x 3
gr the_sum the_count
<dbl> <dbl> <int>
1 1 22 4
2 2 23 6
3 3 36 6

Related

Filling the missing values within each id in r

I have a dataframe having some rows missing value. Here is a sample dataframe:
df <- data.frame(id = c(1,1,1, 2,2,2, 3,3,3),
item = c(11,12,13, 24,25,26, 56,45,56),
score = c(5,5, NA, 6,6,6, 7,NA, 7))
> df
id item score
1 1 11 5
2 1 12 5
3 1 13 NA
4 2 24 6
5 2 25 6
6 2 26 6
7 3 56 7
8 3 45 NA
9 3 56 7
Grouping the dataset by id column, I would like to fill those NA values with the same score.
the desired output should be:
> df
id item score
1 1 11 5
2 1 12 5
3 1 13 5
4 2 24 6
5 2 25 6
6 2 26 6
7 3 56 7
8 3 45 7
9 3 56 7
Any ideas?
Thanks!
We can group by 'id' and fill
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
fill(score, .direction = "downup") %>%
ungroup
Here is another option with base R
> transform(df, score = ave(score, id, FUN = function(x) mean(x, na.rm = TRUE)))
id item score
1 1 11 5
2 1 12 5
3 1 13 5
4 2 24 6
5 2 25 6
6 2 26 6
7 3 56 7
8 3 45 7
9 3 56 7
Another option is to create your own function,eg:
# Fill every score within an id with that id's largest non-missing score.
#
# dataf: data frame with columns `id` and `score`.
# Returns the rows of `dataf` regrouped by id (ids in order of first
# appearance) with `score` replaced by the per-id maximum, ignoring NAs.
# An id whose scores are all NA is left unchanged, and a zero-row input is
# returned as is.
fill.in<-function(dataf){
  ids <- unique(dataf$id)
  # Nothing to fill; this also avoids the `1:length(ids)` trap, which would
  # iterate over c(1, 0) when there are no ids.
  if (length(ids) == 0) {
    return(dataf)
  }
  filled <- lapply(ids, function(current_id) {
    rows <- dataf[dataf$id %in% current_id, , drop = FALSE]
    # max() over an all-NA vector warns and yields -Inf, so skip those ids.
    if (!all(is.na(rows$score))) {
      rows$score <- max(rows$score, na.rm = TRUE)
    }
    rows
  })
  # Bind once at the end instead of growing a data frame inside a loop.
  do.call(rbind, filled)
}
fill.in(df)

How to break ties in a ranking with gaps in ranking [duplicate]

This question already has answers here:
Increment by one to each duplicate value
(4 answers)
Closed 1 year ago.
Say that I have these data:
data <- data.frame(orig=c(1,5,5,5,14,18,18,25))
orig
1 1
2 5
3 5
4 5
5 14
6 18
7 18
8 25
I would like to create the want column:
orig want
1 1 1
2 5 5
3 5 6
4 5 7
5 14 14
6 18 18
7 18 19
8 25 25
This column takes orig and copies its value, but breaks ties if they exist. What I am trying to do is to re-create the rankings so that there are no ties and the ties are broken based on the order of the rows in the dataset. If not for the spaces in the rankings (jump from 1 to 5, etc.), I could use
library(tidyverse)
data %>% mutate(test = rank(orig, ties.method="min"))
But this of course doesn't get me what I want:
orig test
1 1 1
2 5 2
3 5 2
4 5 2
5 14 5
6 18 6
7 18 6
8 25 8
What can I do?
We may add row_number() after grouping
library(dplyr)
# Within each set of tied `orig` values, add the row's 0-based position in
# that tie so that every resulting rank is unique.
data %>%
  group_by(orig) %>%
  mutate(want = orig + seq_len(n()) - 1) %>%
  ungroup()
-output
# A tibble: 8 x 2
orig want
<dbl> <dbl>
1 1 1
2 5 5
3 5 6
4 5 7
5 14 14
6 18 18
7 18 19
8 25 25
Or may simplify with rowid from data.table
library(data.table)
data %>%
mutate(want = orig + rowid(orig)-1)
A base R option using ave + seq_along
# Base R: number the rows inside each group of equal `orig` values and add
# that 0-based position to `orig`, which separates the ties.
within(data, {
  want <- orig + ave(orig, orig, FUN = seq_along) - 1
})
gives
orig want
1 1 1
2 5 5
3 5 6
4 5 7
5 14 14
6 18 18
7 18 19
8 25 25

Fill zeros for missing values in R

I am trying to deal with this problem.
I have a df with a date column and I want to count the occurences per hour. Here is what I've done:
x <- df %>%
mutate(hora = hour(date)) %>%
select(hora) %>%
count(hora)
that gives as a result:
> x
# A tibble: 19 x 2
hora n
<int> <int>
1 0 1
2 1 1
3 3 1
4 8 4
5 9 7
6 10 10
7 11 14
8 12 10
9 13 8
10 14 4
11 15 5
12 16 12
13 17 4
14 18 12
15 19 9
16 20 5
17 21 2
18 22 4
19 23 4
As you can see, there are hours that don't show up that would have n=0, like 2 or 4:7. What I want is it to add the hours that are not in x with n=0 so the table is complete.
The expected output should be something like this:
hora n
1 0 12
2 1 3
3 2 5
4 3 7
5 4 8
6 5 1
7 6 0
8 7 11
9 8 6
10 9 10
11 10 9
12 11 0
13 12 0
14 13 3
15 14 0
16 15 7
17 16 8
18 17 1
19 18 2
20 19 11
21 20 6
22 21 10
23 22 9
24 23 4
I tried creating a table with hours 0:23 and all n=0 and trying to sum the two tables but obviously that didn't work. I also tried x$hour <- 0:23, thinking that the missing values would be added, but it didn't work as well.
You could convert hora to factor and use .drop = FALSE in count
library(dplyr)
library(lubridate)
df %>%
mutate(hora = factor(hour(date), levels = 0:23)) %>%
count(hora, .drop = FALSE)
Another option is to use complete :
df %>%
mutate(hora = hour(date)) %>%
count(hora) %>%
tidyr::complete(hora = 0:23, fill = list(n = 0))
A solution in Base R merges a vector of hours with the summarized data, and sets the missing counts to 0.
# Base R approach: read the hourly counts, join them onto the complete 0-23
# hour range, and turn the hours that had no match into zero counts.
textFile <- "row hour count
1 0 1
2 1 1
3 3 1
4 8 4
5 9 7
6 10 10
7 11 14
8 12 10
9 13 8
10 14 4
11 15 5
12 16 12
13 17 4
14 18 12
15 19 9
16 20 5
17 21 2
18 22 4
19 23 4"
# Parse the text and keep the hour/count columns (the row index is dropped).
data <- read.table(header = TRUE, text = textFile)[c("hour", "count")]
# Every hour of the day, used as the side of the join that must be complete.
hours <- data.frame(hour = 0:23)
# Keep all rows of `hours`; hours absent from `data` come back with NA counts.
merged <- merge(x = data, y = hours, by = "hour", all.y = TRUE)
merged$count <- replace(merged$count, is.na(merged$count), 0)
...and the output:
> head(merged)
hour count
1 0 1
2 1 1
3 2 0
4 3 1
5 4 0
6 5 0
>

Matching the row value of a data frame with its corresponding values

The picture below is my data set in R :
reproducible example:
data <- data.frame(
time = rep(0.2, 5),
m1 = c(9,15,2,8,18),
m2 = c(11,1,13,12,NA),
m3 = c(16,NA,7,17,NA),
m4 = c(10,NA,3,4,NA),
m5 = c(14,NA,6,NA,NA),
m6 = c(NA,NA,5,NA,NA)
)
I want the following output, which is a table displaying each value in the dataset and below the number of the row to which the value belongs:
Thank you in advance for your help !
Remove the first column, transpose what is left, convert it back to a data frame, set the column names to the original row numbers, stack that and omit NA rows. Then re-order by values.
# Drop the first column, transpose the rest so each original row becomes a
# column, label those columns with the original row numbers, stack them into
# long form (`values`, `ind`) and remove the NA entries.
# seq_len() is used instead of 1:nrow() so a zero-row `data` gives an empty
# sequence rather than c(1, 0).
d <- na.omit(stack(setNames(as.data.frame(t(data[-1])), seq_len(nrow(data)))))
# Show the pairs ordered by value.
d[order(d$values), ]
giving:
values ind
8 1 2
13 2 3
16 3 3
22 4 4
18 5 3
17 6 3
15 7 3
19 8 4
1 9 1
4 10 1
2 11 1
20 12 4
14 13 3
5 14 1
7 15 2
3 16 1
21 17 4
25 18 5
try this:
library(tidyverse)
# One column per observed value, holding the number of the row it came from.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
data %>%
  rownames_to_column("row_id") %>%
  # Long form: one row per (row_id, measurement); NA measurements are dropped.
  pivot_longer(
    cols = -c(time, row_id),
    names_to = "key",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  select(row_id, value) %>%
  # pivot_wider() creates columns in order of appearance, so sort by value
  # first to get the ascending column order that spread() produced.
  arrange(value) %>%
  pivot_wider(names_from = value, values_from = row_id) %>%
  # Return a plain data frame, as the gather()/spread() version did.
  as.data.frame()
output is:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
1 2 3 3 4 3 3 3 4 1 1 1 4 3 1 2 1 4 5

Sum of group but keep the same value for each row in r

I have a data frame, and I want to create new variables holding the sum for each ID and Group. If I summarise normally, the dimensions of the data are reduced; in my case I need to keep every row and repeat the group sum on each of them.
ID <- c(rep(1,3), rep(3, 5), rep(4,4))
Group <-c(1,1,2,1,1,1,2,2,1,1,1,2)
x <- c(1:12)
y<- c(12:23)
df <- data.frame(ID,Group,x,y)
ID Group x y
1 1 1 1 12
2 1 1 2 13
3 1 2 3 14
4 3 1 4 15
5 3 1 5 16
6 3 1 6 17
7 3 2 7 18
8 3 2 8 19
9 4 1 9 20
10 4 1 10 21
11 4 1 11 22
12 4 2 12 23
The output with 2 more variables "sumx" and "sumy". Group by (ID, Group)
ID Group x y sumx sumy
1 1 1 1 12 3 25
2 1 1 2 13 3 25
3 1 2 3 14 3 14
4 3 1 4 15 15 48
5 3 1 5 16 15 48
6 3 1 6 17 15 48
7 3 2 7 18 15 37
8 3 2 8 19 15 37
9 4 1 9 20 30 63
10 4 1 10 21 30 63
11 4 1 11 22 30 63
12 4 2 12 23 12 23
Any Idea?
As short as:
# Per-(ID, Group) totals of x and y, repeated on every row of the group.
df[["sumx"]] <- ave(df$x, df$ID, df$Group, FUN = sum)
df[["sumy"]] <- ave(df$y, df$ID, df$Group, FUN = sum)
We can use dplyr
library(dplyr)
# Add per-(ID, Group) totals of x and y as sumx / sumy while keeping the
# original x and y columns. across() replaces the deprecated
# mutate_each(funs(...)); `.names` creates the new columns directly, so the
# rename() + bind_cols() round trip is not needed.
df %>%
  group_by(ID, Group) %>%
  mutate(across(c(x, y), sum, .names = "sum{.col}")) %>%
  # Keep the column order ID, Group, sumx, sumy, x, y.
  relocate(x, y, .after = sumy)
If there are only two columns to sum, then
df %>%
group_by(ID, Group) %>%
mutate(sumx = sum(x), sumy = sum(y))
You can use the code below to get what you want if it is a single column; if you have more than one column, add them accordingly:
library(dplyr)
data13 <- data12 %>%
group_by(Category) %>%
mutate(cum_Cat_GMR = cumsum(GrossMarginRs))

Resources