New variable from grouped calculation in R

I have a dataset:
library(dplyr)
my_df <- data.frame(day = c(1,1,1,2,2,2,3,3,3), age = c(18, 18, 18, 25, 18, 35, 76, 76, 15))
my_df
# day age
# 1 1 18
# 2 1 18
# 3 1 18
# 4 2 25
# 5 2 18
# 6 2 35
# 7 3 76
# 8 3 76
# 9 3 15
For each row, I want to know the frequency and percentage of age for a given value of day. For example, I can calculate this with a dplyr chain:
my_df %>%
  group_by(day, age) %>%
  summarize(n = n()) %>%
  group_by(day) %>%
  mutate(pct = n / sum(n))
# day age n pct
# 1 1 18 3 1
# 2 2 18 1 0.333
# 3 2 25 1 0.333
# 4 2 35 1 0.333
# 5 3 15 1 0.333
# 6 3 76 2 0.667
How can I add the n values back onto my original df? Desired output:
# day age n
# 1 1 18 3
# 2 1 18 3
# 3 1 18 3
# 4 2 25 1
# 5 2 18 1
# 6 2 35 1
# 7 3 76 2
# 8 3 76 2
# 9 3 15 1

For your desired output we could use add_count():
library(dplyr)
my_df %>%
  add_count(day, age)
day age n
1 1 18 3
2 1 18 3
3 1 18 3
4 2 25 1
5 2 18 1
6 2 35 1
7 3 76 2
8 3 76 2
9 3 15 1
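If you also need the percentage from the question on each row, one sketch is to follow add_count() with a grouped mutate() (pct is each row's (day, age) count divided by that day's total rows):
my_df %>%
  add_count(day, age) %>%   # n = rows per (day, age)
  group_by(day) %>%
  mutate(pct = n / n()) %>% # n() = rows per day
  ungroup()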

I would store this as a variable, like so:
my_helper_df <- my_df %>%
  group_by(day, age) %>%
  summarize(n = n()) %>%
  group_by(day) %>%
  mutate(pct = n / sum(n))
Then left_join it to the original df:
final_df <- dplyr::left_join(my_df, my_helper_df, by = c("day", "age"))
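As an aside, the helper can be written a bit more compactly with count(), which wraps the group_by()/summarize(n = n()) pair:
my_helper_df <- my_df %>%
  count(day, age) %>%
  group_by(day) %>%
  mutate(pct = n / sum(n)) %>%
  ungroup()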

Converting time-dependent variable to long format using one variable indicating day of update

I am trying to convert my data to long format using one variable that indicates the day of an update.
I have the following variables:
the baseline temperature variable "temp_b";
the time-varying temperature variable "temp_v"; and
the number of days "n_days" at which the varying variable is updated.
I want to create the long format using a carried-forward approach and a maximum follow-up time of 5 days.
Example of data
df <- structure(list(id=1:3, temp_b=c(20L, 7L, 7L), temp_v=c(30L, 10L, NA), n_days=c(2L, 4L, NA)), class="data.frame", row.names=c(NA, -3L))
# id temp_b temp_v n_days
# 1 1 20 30 2
# 2 2 7 10 4
# 3 3 7 NA NA
df_long <- structure(list(id = c(1,1,1,1,1, 2,2,2,2,2, 3,3,3,3,3),
                          days_cont = c(1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5),
                          long_format = c(20,30,30,30,30, 7,7,7,10,10, 7,7,7,7,7)),
                     class = "data.frame", row.names = c(NA, -15L))
# id days_cont long_format
# 1 1 1 20
# 2 1 2 30
# 3 1 3 30
# 4 1 4 30
# 5 1 5 30
# 6 2 1 7
# 7 2 2 7
# 8 2 3 7
# 9 2 4 10
# 10 2 5 10
# 11 3 1 7
# 12 3 2 7
# 13 3 3 7
# 14 3 4 7
# 15 3 5 7
You could repeat each row 5 times with tidyr::uncount():
library(dplyr)
df %>%
  tidyr::uncount(5) %>%    # repeat each row 5 times
  group_by(id) %>%
  transmute(days_cont = 1:n(),
            temp = ifelse(row_number() < n_days | is.na(n_days), temp_b, temp_v)) %>%
  ungroup()
# # A tibble: 15 × 3
# id days_cont temp
# <int> <int> <int>
# 1 1 1 20
# 2 1 2 30
# 3 1 3 30
# 4 1 4 30
# 5 1 5 30
# 6 2 1 7
# 7 2 2 7
# 8 2 3 7
# 9 2 4 10
# 10 2 5 10
# 11 3 1 7
# 12 3 2 7
# 13 3 3 7
# 14 3 4 7
# 15 3 5 7
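For comparison, here is a dependency-free base R sketch of the same carry-forward rule (temp_b for the first n_days - 1 days, temp_v afterwards, and temp_b throughout when n_days is NA):
df_long2 <- data.frame(
  id = rep(df$id, each = 5),
  days_cont = rep(1:5, times = nrow(df)),
  long_format = unlist(Map(
    function(b, v, n) if (is.na(n)) rep(b, 5) else c(rep(b, n - 1), rep(v, 6 - n)),
    df$temp_b, df$temp_v, df$n_days
  ))
)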
Here's a possibility using tidyverse functions. First pivot_longer() and drop the values that will not appear in the final df (i.e. rows where temp_v is NA), then group by id and mutate n_days to the number of rows each value will occupy in the final df. Finally, uncount() the data frame.
library(tidyverse)
df %>%
  replace_na(list(n_days = 6)) %>%   # an NA n_days means temp_b fills all 5 days
  pivot_longer(-c(id, n_days)) %>%
  filter(!is.na(value)) %>%
  group_by(id) %>%
  mutate(n_days = case_when(name == "temp_b" ~ n_days - 1,
                            name == "temp_v" ~ 5 - (n_days - 1))) %>%
  uncount(n_days) %>%
  mutate(days_cont = row_number()) %>%
  select(id, days_cont, long_format = value)
id days_cont long_format
<int> <int> <int>
1 1 1 20
2 1 2 30
3 1 3 30
4 1 4 30
5 1 5 30
6 2 1 7
7 2 2 7
8 2 3 7
9 2 4 10
10 2 5 10
11 3 1 7
12 3 2 7
13 3 3 7
14 3 4 7
15 3 5 7

Count rows until criterion is reached, then start again

My data looks like this:
id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
time=c(20,30,40,1100,31,32,33,1005,22,23,24,1001,12,13,14,1002)
test <- data.frame(id,time)
I am now trying to count the rows until time > 1000, grouped by id. So far I have:
library(dplyr)
test %>%
  group_by(id, idx = cumsum(time >= 1000)) %>%
  mutate(trip_count = row_number()) %>%
  ungroup() %>%
  select(-idx)
This almost works, but instead of 1 on the rows where time > 1000 I want the count to go one further, starting again at 1 on the next row. Is this somehow possible?
Since each group has 4 rows in your data, we can use this:
library(tidyr)   # replace_na() comes from tidyr
test %>%
  left_join(test %>%
              filter(time < 1000) %>%
              group_by(id) %>%
              mutate(trip_count = row_number())) %>%
  group_by(id) %>%
  mutate(trip_count = replace_na(trip_count, 4))
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <dbl>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
If your data doesn't have 4 rows per group, you can use this:
id <- c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4)
time <- c(20,30,40,1100,31,32,33,1005,22,23,24,1001,12,13,14,15,1002)
test <- data.frame(id, time)
test %>%
  left_join(test %>%
              filter(time < 1000) %>%
              group_by(id) %>%
              mutate(trip_count = row_number())) %>%
  group_by(id) %>%
  mutate(across(trip_count, ~ replace(., is.na(.), n())))
Joining, by = c("id", "time")
# A tibble: 17 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 15 4
17 4 1002 5
I added an additional row to group 4.
Based on the new data shared by the OP, where the threshold row can occur mid-group:
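The updated test is reconstructed here from the output that follows:
id <- c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
time <- c(20,30,1100,40,31,32,33,1005,22,23,1001,24,12,13,14,1002)
test <- data.frame(id, time)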
test %>%
  left_join(test %>%
              group_by(id) %>%
              filter(row_number() < which(time >= 1000)) %>%
              mutate(trip_count = row_number())) %>%
  left_join(test %>%
              group_by(id) %>%
              filter(row_number() > which(time >= 1000)) %>%
              mutate(trip_count1 = row_number())) %>%
  mutate(trip_count = coalesce(trip_count, trip_count1)) %>%
  select(-trip_count1) %>%
  group_by(id) %>%
  mutate(rowid = row_number()) %>%
  rowwise() %>%
  mutate(trip_count = replace_na(trip_count, rowid)) %>%
  select(-rowid)
Joining, by = c("id", "time")
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Rowwise: id
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 1100 3
4 1 40 1
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 1001 3
12 3 24 1
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
You could use lag(): the comparison then looks at the previous row, so the row with time >= 1000 still belongs to the current trip and the counter restarts on the following row:
library(dplyr)
test %>%
  group_by(id, idx = cumsum(lag(time, default = 0) >= 1000)) %>%
  mutate(trip_count = row_number()) %>%
  ungroup() %>%
  select(-idx)
Output:
# A tibble: 16 x 3
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
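To see why this works, materialize the helper index before it is dropped (a sketch using the question's original test):
test %>%
  mutate(idx = cumsum(lag(time, default = 0) >= 1000)) %>%
  slice(1:6)
#   id time idx
# 1  1   20   0
# 2  1   30   0
# 3  1   40   0
# 4  1 1100   0   <- the threshold row still counts toward the current trip
# 5  2   31   1   <- the counter restarts on the next row
# 6  2   32   1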

How to balance a dataset in `dplyr` using `sample_n` automatically to the size of the smallest class?

I have a dataset like:
library(dplyr)
df <- tibble(
  id = 1:18,
  class = rep(c(rep(1, 3), rep(2, 2), 3), 3),
  var_a = rep(c("a", "b"), 9)
)
# A tibble: 18 x 3
id class var_a
<int> <dbl> <chr>
1 1 1 a
2 2 1 b
3 3 1 a
4 4 2 b
5 5 2 a
6 6 3 b
7 7 1 a
8 8 1 b
9 9 1 a
10 10 2 b
11 11 2 a
12 12 3 b
13 13 1 a
14 14 1 b
15 15 1 a
16 16 2 b
17 17 2 a
18 18 3 b
That dataset contains a number of observations in several classes. The classes are not balanced. In the sample above we can see, that only 3 observations are of class 3, while there are 6 observations of class 2 and 9 observations of class 1.
Now I want to automatically balance that dataset so that all classes are of the same size. So I want a dataset of 9 rows, 3 rows in each class. I can use the sample_n function from dplyr to do such a sampling.
I managed to do so by first calculating the smallest class size...
min_length <- as.numeric(df %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  summarise(min = min(n)))
...and then applying the sample_n function:
set.seed(1)
df %>% group_by(class) %>% sample_n(min_length)
# A tibble: 9 x 3
# Groups: class [3]
id class var_a
<int> <dbl> <chr>
1 15 1 a
2 7 1 a
3 13 1 a
4 4 2 b
5 5 2 a
6 17 2 a
7 18 3 b
8 6 3 b
9 12 3 b
I wondered If it's possible to do that (calculating the smallest class size and then sampling) in one go?
You can do it in one step, but it is cheating a little:
set.seed(42)
df %>%
  group_by(class) %>%
  sample_n(min(table(df$class))) %>%
  ungroup()
# # A tibble: 9 x 3
# id class var_a
# <int> <dbl> <chr>
# 1 1 1 a
# 2 8 1 b
# 3 15 1 a
# 4 4 2 b
# 5 5 2 a
# 6 11 2 a
# 7 12 3 b
# 8 18 3 b
# 9 6 3 b
I say "cheating" because normally you would not want to reference df$ from within the pipe. However, because they property we're looking for is of the whole frame but the table function only sees one group at a time, we need to side-step that a little.
One could do
df %>%
  mutate(mn = min(table(class))) %>%
  group_by(class) %>%
  sample_n(mn[1]) %>%
  ungroup()
# # A tibble: 9 x 4
# id class var_a mn
# <int> <dbl> <chr> <int>
# 1 14 1 b 3
# 2 13 1 a 3
# 3 7 1 a 3
# 4 4 2 b 3
# 5 16 2 b 3
# 6 5 2 a 3
# 7 12 3 b 3
# 8 18 3 b 3
# 9 6 3 b 3
Though I don't think that is any more elegant or readable.
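As a side note, sample_n() has been superseded by slice_sample() in dplyr >= 1.0.0; the same one-step idea looks like this:
set.seed(42)
df %>%
  group_by(class) %>%
  slice_sample(n = min(table(df$class))) %>%
  ungroup()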

Calculating median based on the group in long format

I have formatted my data in long format:
df1<-read.table(text=" ID Temp location
1 12 4
1 18 3
1 17 5
1 10 1
1 19 1
1 15 4
1 16 5
1 10 3
1 11 5
1 15 1
2 20 3
2 10 3
2 17 1
2 13 5
2 12 1
2 14 4
2 20 5
2 13 1
2 13 3
2 10 3
3 12 4
3 18 3
3 18 3
3 15 1
3 17 1
3 15 4
3 10 1
3 11 3
3 13 1
3 14 1",header=TRUE)
I want to calculate the median (rounded up) of Temp by location for the 3 groups (ID). The question is: what is the median for id 1, id 2 and id 3 when location = 1? For example, for id 1 the values 10, 19 and 15 give a median of 15; for id 2 the values 17, 12 and 13 give a median of 13; and so on.
So I need to get this data:
AM1 15
AM2 13
AM3 14
Thanks for your help and sorry I was unable to show my effort.
You could also use data.table:
library(data.table)
setDT(df1)[location == 1,
           .(Median = base::round(median(as.numeric(Temp)))),
           by = .(ID = paste0("AM", ID))]
One option is to filter first, then group by and take the median:
library(dplyr)
library(stringr)
df1 %>%
  filter(location == 1) %>%
  group_by(ID = str_c("AM", ID)) %>%
  summarise(Median = median(Temp))
# A tibble: 3 x 2
# ID Median
# <chr> <int>
#1 AM1 15
#2 AM2 13
#3 AM3 14
This can also be made more compact, though less efficient:
df1 %>%
  group_by(ID) %>%
  summarise(Median = median(Temp[location == 1]))
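Note that the OP asked for the median rounded up. With this data the location == 1 medians are whole numbers, but if a group had an even number of values you could wrap the median in ceiling() (or round(), as the data.table answer does):
df1 %>%
  filter(location == 1) %>%
  group_by(ID = str_c("AM", ID)) %>%
  summarise(Median = ceiling(median(Temp)))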

Sum of group but keep the same value for each row in R

I have a data frame and want to create new variables holding the sum of x and y for each ID and Group. If I summarize normally, the dimensions of the data are reduced; in my case I need to keep each row and repeat the group sum on it.
ID <- c(rep(1, 3), rep(3, 5), rep(4, 4))
Group <- c(1,1,2,1,1,1,2,2,1,1,1,2)
x <- c(1:12)
y <- c(12:23)
df <- data.frame(ID, Group, x, y)
ID Group x y
1 1 1 1 12
2 1 1 2 13
3 1 2 3 14
4 3 1 4 15
5 3 1 5 16
6 3 1 6 17
7 3 2 7 18
8 3 2 8 19
9 4 1 9 20
10 4 1 10 21
11 4 1 11 22
12 4 2 12 23
The desired output has 2 more variables, sumx and sumy, grouped by (ID, Group):
ID Group x y sumx sumy
1 1 1 1 12 3 25
2 1 1 2 13 3 25
3 1 2 3 14 3 14
4 3 1 4 15 15 48
5 3 1 5 16 15 48
6 3 1 6 17 15 48
7 3 2 7 18 15 37
8 3 2 8 19 15 37
9 4 1 9 20 30 63
10 4 1 10 21 30 63
11 4 1 11 22 30 63
12 4 2 12 23 12 23
Any idea?
As short as this, using base R's ave(), which returns the group statistic repeated for every row:
df$sumx <- with(df, ave(x, ID, Group, FUN = sum))
df$sumy <- with(df, ave(y, ID, Group, FUN = sum))
We can use dplyr. (mutate_each() and funs() were the idiom at the time; both are defunct in current dplyr, so see the across() sketch after the next block.)
library(dplyr)
df %>%
  group_by(ID, Group) %>%
  mutate_each(funs(sum)) %>%     # overwrites x and y with their group sums
  rename(sumx = x, sumy = y) %>%
  bind_cols(., df[c("x", "y")])  # bind the original x and y back on
If there are only two columns to sum, then:
df %>%
  group_by(ID, Group) %>%
  mutate(sumx = sum(x), sumy = sum(y))
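In current dplyr (>= 1.0.0), where mutate_each() and funs() are defunct, across() with the .names argument adds both sums in one step while keeping x and y:
df %>%
  group_by(ID, Group) %>%
  mutate(across(c(x, y), sum, .names = "sum{.col}")) %>%
  ungroup()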
You can use the code below if it is a single column; in case you have more than one column, add further mutate() calls accordingly. (Note that cumsum() gives a running total within each group; use sum() if you want the repeated group total from the question.)
library(dplyr)
data13 <- data12 %>%
  group_by(Category) %>%
  mutate(cum_Cat_GMR = cumsum(GrossMarginRs))
