Reformat Dataframe to start a new row at a certain column - r

I have a dataframe that looks like this:
ID x.2019 x.2020
1 10 20
2 20 30
3 30 40
4 40 50
5 50 60
and I would like to reformat it to look like this:
ID time x
1 2019 10
1 2020 20
2 2019 20
2 2020 30
3 2019 30
3 2020 40
4 2019 40
4 2020 50
5 2019 50
5 2020 60
Any idea how to achieve this?

This is a fairly common task, so you can probably find it covered in other answers. That said, you can achieve what you want with data.table as follows:
library(data.table)

# Example data in wide format: one value column per year
df <- data.table(
  ID = 1:5,
  x.2019 = seq(10, 50, by = 10),
  x.2020 = seq(20, 60, by = 10)
)

# Rename the value columns (in place) so their names are just the years
setnames(df, old = c("x.2019", "x.2020"), new = c("2019", "2020"))

# Wide -> long: one row per (ID, year); keep the year labels as character
out <- melt(
  df,
  id.vars = "ID",
  variable.name = "time",
  value.name = "x",
  variable.factor = FALSE
)

# The year labels are strings after melting; convert them to integer
out[, time := as.integer(time)]

# Sort the long table by ID (in place) and print it
setorder(out, ID)
out
#> ID time x
#> 1: 1 2019 10
#> 2: 1 2020 20
#> 3: 2 2019 20
#> 4: 2 2020 30
#> 5: 3 2019 30
#> 6: 3 2020 40
#> 7: 4 2019 40
#> 8: 4 2020 50
#> 9: 5 2019 50
#> 10: 5 2020 60
Created on 2022-01-20 by the reprex package (v2.0.1)

You can use pivot_longer:
library(dplyr)
library(tidyr)
# Example data in wide format: one value column per year
df <- data.frame(
  ID = 1:5,
  x.2019 = c(10, 20, 30, 40, 50),
  x.2020 = c(20, 30, 40, 50, 60)
)

# Wide -> long: one row per (ID, year)
df %>%
  pivot_longer(
    # Select the year columns by name rather than by position
    cols = starts_with("x."),
    names_to = "time",
    # names_prefix is a regex anchored at the start of the name; the dot is
    # escaped so that only a literal "x." prefix is stripped
    names_prefix = "x\\.",
    # Store the remaining year label as an integer
    names_transform = list(time = as.integer),
    values_to = "x"
  )
Result:
# A tibble: 10 x 3
ID time x
<int> <int> <dbl>
1 1 2019 10
2 1 2020 20
3 2 2019 20
4 2 2020 30
5 3 2019 30
6 3 2020 40
7 4 2019 40
8 4 2020 50
9 5 2019 50
10 5 2020 60

Related

Sum up with the next line into a new column

I'm having some trouble on figuring out how to create a new column with the sum of 2 subsequent cells.
I have :
df1<- tibble(Years=c(1990, 2000, 2010, 2020, 2030, 2050, 2060, 2070, 2080),
Values=c(1,2,3,4,5,6,7,8,9 ))
Now, I want a new column where the first line is the sum of 1+2, the second line is the sum of 1+2+3, the third line is the sum of 1+2+3+4, and so on.
The values 1, 2, 3, 4... are hypothetical. I need to measure the absolute growth from one decade to the next so that I can later create a new variable measuring the percentage change from one decade to the next.
library(tibble)
df1<- tibble(Years=c(1990, 2000, 2010, 2020, 2030, 2050, 2060, 2070, 2080),
Values=c(1,2,3,4,5,6,7,8,9 ))
library(slider)
# Spell out FALSE: the shorthand F is an ordinary variable and can be reassigned
library(dplyr, warn.conflicts = FALSE)

df1 %>%
  # Window = every earlier row, the current row and one row ahead, i.e. the
  # running total through the NEXT row; the last row has no next row, so it
  # gets the total of all values
  mutate(xx = slide_sum(Values, before = Inf, after = 1))
#> # A tibble: 9 x 3
#> Years Values xx
#> <dbl> <dbl> <dbl>
#> 1 1990 1 3
#> 2 2000 2 6
#> 3 2010 3 10
#> 4 2020 4 15
#> 5 2030 5 21
#> 6 2050 6 28
#> 7 2060 7 36
#> 8 2070 8 45
#> 9 2080 9 45
Created on 2021-08-12 by the reprex package (v2.0.0)
Assuming the last row is to be repeated. Otherwise the fill part can be skipped.
library(dplyr)
library(tidyr)
# Running total shifted up by one row, so each row holds the total through the
# next row; the last row has no successor (NA) and is filled from the row above
df1 %>%
  mutate(x = cumsum(Values), x = lead(x)) %>%
  fill(x, .direction = "down")
# Years Values x
# <dbl> <dbl> <dbl>
# 1 1990 1 3
# 2 2000 2 6
# 3 2010 3 10
# 4 2020 4 15
# 5 2030 5 21
# 6 2050 6 28
# 7 2060 7 36
# 8 2070 8 45
# 9 2080 9 45
Using base R
# Running total of Values
running <- cumsum(df1$Values)
# Drop the first element so that row i holds the total through row i + 1
v1 <- running[-1]
# Pad the last row with the grand total. Taking it from `running` rather than
# from `v1` keeps the result the same length as the data even when there is
# only one row (where `v1` is empty).
df1$new <- c(v1, running[length(running)])
You want the cumsum() function. Here are two ways to do it.
### Base R
df1$cumsum <- cumsum(df1$Values)
### Using dplyr
library(dplyr)
df1 <- df1 %>%
mutate(cumsum = cumsum(Values))
Here is the output in either case.
df1
# A tibble: 9 x 3
Years Values cumsum
<dbl> <dbl> <dbl>
1 1990 1 1
2 2000 2 3
3 2010 3 6
4 2020 4 10
5 2030 5 15
6 2050 6 21
7 2060 7 28
8 2070 8 36
9 2080 9 45
A data.table option
> setDT(df)[, newCol := shift(cumsum(Values), -1, fill = sum(Values))][]
Years Values newCol
1: 1990 1 3
2: 2000 2 6
3: 2010 3 10
4: 2020 4 15
5: 2030 5 21
6: 2050 6 28
7: 2060 7 36
8: 2070 8 45
9: 2080 9 45
or a base R option following a similar idea
# Cumulative sum shifted up by one row; the last row is padded with the total
# of all values.
# NOTE(review): `df` is not defined in this thread - presumably it is the
# question's `df1`; confirm before running.
transform(
df,
newCol = c(cumsum(Values)[-1],sum(Values))
)

How to add letter label to numeric values in a tibble?

I have a column in my tibble with the values 10 10 10 20 20 20 20 30 30, and I want to create a new column as follows: 10a 10b 10c 20a 20b 20c 20d 30a 30b.
A simpler and concise solution using only dplyr.
# import library
library(dplyr)
# data
df <- data.frame(id = c(10, 10, 10, 20, 20, 20, 20, 30, 30))
# solution
df %>%
  group_by(id) %>%
  # row_number() counts 1, 2, 3, ... within each id and is empty for empty
  # input, unlike 1:length(id), which yields c(1, 0) when there are no rows.
  # `letters` has 26 entries, so a group with more than 26 rows would index
  # past its end.
  mutate(new_id = paste0(id, letters[row_number()])) %>%
  ungroup()
Output
# A tibble: 9 x 2
id new_id
<dbl> <chr>
1 10 10a
2 10 10b
3 10 10c
4 20 20a
5 20 20b
6 20 20c
7 20 20d
8 30 30a
9 30 30b
BaseR
# For each group of equal x values, append a, b, c, ... by position in the group
df$x2 <- ave(
  df$x,
  df$x,
  FUN = function(grp) paste0(grp, letters[seq_along(grp)])
)
x x2
1 10 10a
2 10 10b
3 10 10c
4 20 20a
5 20 20b
6 20 20c
7 20 20d
8 30 30a
9 30 30b
dplyr
df <- data.frame(x = c(10, 10, 10, 20, 20, 20, 20, 30, 30))
library(dplyr)
# Number the rows within each group of x and use that index to pick the suffix
# letter; the result is still grouped by x (see the "Groups" line in the output)
df %>% group_by(x) %>%
mutate(x2 = paste0(x, letters[row_number()]))
# A tibble: 9 x 2
# Groups: x [3]
x x2
<dbl> <chr>
1 10 10a
2 10 10b
3 10 10c
4 20 20a
5 20 20b
6 20 20c
7 20 20d
8 30 30a
9 30 30b
Here's an economical solution with dplyr and data.table:
First convert to dataframe:
df <- data.frame(x = c(10, 10, 10, 20, 20, 20, 20, 30, 30))
Now transform in dplyr using data.table's function rowid and the built-in constant letters (for lower-case letters) subsetted on rowid (credit to @Henrik):
library(dplyr)
library(data.table)
# rowid(x) is used as the index into `letters`, so each repeated value of x
# gets the next letter (see the output below); no group_by() is needed
df %>%
mutate(x_new = paste0(x, letters[rowid(x)]))
# A tibble: 9 x 2
x x_new
<dbl> <chr>
1 10 10a
2 10 10b
3 10 10c
4 20 20a
5 20 20b
6 20 20c
7 20 20d
8 30 30a
9 30 30b

Calculating cumulative sum for multiple columns in R

R newb, I'm trying to calculate the cumulative sum grouped by year, month, group and subgroup, also having multiple columns to calculate.
Sample of the data:
df <- data.frame("Year"=2020,
"Month"=c("Jan","Jan","Jan","Jan","Feb","Feb","Feb","Feb"),
"Group"=c("A","A","A","B","A","B","B","B"),
"SubGroup"=c("a","a","b","b","a","b","a","b"),
"V1"=c(10,10,20,20,50,50,10,10),
"V2"=c(0,1,2,2,0,5,1,1))
Year Month Group SubGroup V1 V2
1 2020 Jan A a 10 0
2 2020 Jan A a 10 1
3 2020 Jan A b 20 2
4 2020 Jan B b 20 2
5 2020 Feb A a 50 0
6 2020 Feb B b 50 5
7 2020 Feb B a 10 1
8 2020 Feb B b 10 1
Resulting Table wanted:
Year Month Group SubGroup V1 V2
1 2020 Jan A a 20 1
2 2020 Feb A a 70 1
3 2020 Jan A b 20 2
4 2020 Feb A b 20 2
5 2020 Jan B a 0 0
6 2020 Feb B a 10 1
7 2020 Jan B b 20 2
8 2020 Feb B b 80 8
From Sample Table, on Jan 2020, the sum of Group 'A' Subgroup 'a' was 10+10 = 20... On Feb 2020, the value was 50, therefore 20 from Jan + 50 = 70, and so on...
If there is no value, it should consider 0.
I've tried a few approaches, but none got even close to the output I need. I would really appreciate it if someone could help me with some tips for this problem.
This is a simple group_by/mutate problem. The columns V1, V2 are chosen with across and cumsum applied to them.
# Make Month a factor with explicit levels so it sorts Jan < Feb instead of
# alphabetically
df$Month <- factor(df$Month, levels = c("Jan", "Feb"))

df %>%
  # cumsum() accumulates in row order, so sort chronologically within each
  # Year/Group/SubGroup BEFORE accumulating instead of relying on the input
  # already being in month order
  arrange(Year, Group, SubGroup, Month) %>%
  group_by(Year, Group, SubGroup) %>%
  # Running totals of V1 and V2 within each group
  mutate(across(V1:V2, cumsum)) %>%
  ungroup()
## A tibble: 8 x 6
# Year Month Group SubGroup V1 V2
# <chr> <fct> <chr> <chr> <dbl> <dbl>
#1 2020 Jan A a 10 0
#2 2020 Jan A a 20 1
#3 2020 Feb A a 70 1
#4 2020 Jan A b 20 2
#5 2020 Feb B a 10 1
#6 2020 Jan B b 20 2
#7 2020 Feb B b 70 7
#8 2020 Feb B b 80 8
If I understand what you are doing, you're taking the sum for each month, then doing the cumulative sums for the months. This is usually pretty easy in dplyr.
library(dplyr)
# Step 1: total V1 and V2 for every Year/Month/Group/SubGroup combination
# present in the data
df %>%
group_by(Year, Month, Group, SubGroup) %>%
summarize(
V1_sum = sum(V1),
V2_sum = sum(V2)
) %>%
# Step 2: running total of the monthly sums within each Year/Group/SubGroup.
# Month is still a plain character column here, so the rows come out with
# "Feb" before "Jan" (see the output below) and the running totals accumulate
# in that order.
group_by(Year, Group, SubGroup) %>%
mutate(
V1_cumsum = cumsum(V1_sum),
V2_cumsum = cumsum(V2_sum)
)
# A tibble: 6 x 8
# Groups: Year, Group, SubGroup [4]
# Year Month Group SubGroup V1_sum V2_sum V1_cumsum V2_cumsum
# <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2020 Feb A a 50 0 50 0
# 2 2020 Feb B a 10 1 10 1
# 3 2020 Feb B b 60 6 60 6
# 4 2020 Jan A a 20 1 70 1
# 5 2020 Jan A b 20 2 20 2
# 6 2020 Jan B b 20 2 80 8
But you'll notice that the monthly cumulative sums are backwards (i.e. January comes after February), because by default group_by groups alphabetically. Also, you don't see the empty values because dplyr doesn't fill them in.
To fix the order of the months, you can either make your months numeric (convert to dates) or turn them into factors. You can add back 'missing' combinations of the grouping variables by using aggregate in base R instead of dplyr::summarize. aggregate includes all combinations of the grouping factors. aggregate converts the missing values to NA, but you can replace the NA with 0 with tidyr::replace_na, for example.
library(dplyr)
library(tidyr)
df <- data.frame("Year"=2020,
"Month"=c("Jan","Jan","Jan","Jan","Feb","Feb","Feb","Feb"),
"Group"=c("A","A","A","B","A","B","B","B"),
"SubGroup"=c("a","a","b","b","a","b","a","b"),
"V1"=c(10,10,20,20,50,50,10,10),
"V2"=c(0,1,2,2,0,5,1,1))
# Ordered factor so that months sort chronologically (Jan before Feb)
df$Month <- factor(df$Month, levels = c("Jan", "Feb"), ordered = TRUE)

# Monthly totals. drop = FALSE keeps grouping combinations that have no rows;
# their sums come back as NA.
df1 <- aggregate(
  x = list(V1_sum = df$V1, V2_sum = df$V2),
  by = list(
    Year = df$Year,
    Month = df$Month,
    Group = df$Group,
    SubGroup = df$SubGroup
  ),
  FUN = sum,
  drop = FALSE
)

df1 <- df1 %>%
  # Combinations without data count as 0
  mutate(across(c(V1_sum, V2_sum), ~ replace_na(.x, 0))) %>%
  # Running total across months within each Year/Group/SubGroup
  group_by(Year, Group, SubGroup) %>%
  mutate(V1 = cumsum(V1_sum), V2 = cumsum(V2_sum)) %>%
  ungroup() %>%
  # Keep the grouping columns plus the running totals
  select(Year, Month, Group, SubGroup, V1, V2)
This gives the same result as your example:
# # A tibble: 8 x 6
# Year Month Group SubGroup V1 V2
# <dbl> <ord> <chr> <chr> <dbl> <dbl>
# 1 2020 Jan A a 20 1
# 2 2020 Feb A a 70 1
# 3 2020 Jan B a 0 0
# 4 2020 Feb B a 10 1
# 5 2020 Jan A b 20 2
# 6 2020 Feb A b 20 2
# 7 2020 Jan B b 20 2
# 8 2020 Feb B b 80 8
library(dplyr)
library(zoo)
# Sort chronologically first (Year + month abbreviation parsed as a yearmon),
# so the cumulative sums below accumulate in calendar order
# NOTE(review): '%b' parses abbreviated month names, which presumably depends
# on the session locale - confirm for non-English locales
df %>%
arrange(as.yearmon(paste0(Year, '-', Month), '%Y-%b'), Group, SubGroup) %>%
# Running totals of V1 and V2 within each Year/Group/SubGroup; the original
# V1 and V2 columns are overwritten
group_by(Year, Group, SubGroup) %>%
mutate(
V1 = cumsum(V1),
V2 = cumsum(V2)
) %>%
arrange(Year, Group, SubGroup, as.yearmon(paste0(Year, '-', Month), '%Y-%b')) #for desired output ordering
# A tibble: 8 x 6
# Groups: Year, Group, SubGroup [4]
# Year Month Group SubGroup V1 V2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 2020 Jan A a 10 0
# 2 2020 Jan A a 20 1
# 3 2020 Feb A a 70 1
# 4 2020 Jan A b 20 2
# 5 2020 Feb B a 10 1
# 6 2020 Jan B b 20 2
# 7 2020 Feb B b 70 7
# 8 2020 Feb B b 80 8

Merge between two files in terms of date and postcode with calculating the day-weighted average for the air pollution data in R

I have two data frames. One called Pollution. The second called Med. An example of both data frames as follows:
Pollution
year month postcode PM10 NO2
2001 1 12345 40 20
2001 2 12345 30 25
2001 3 12345 35 25
2001 4 12345 30 20
2001 1 62346 40 20
2001 2 62346 30 25
2001 3 62346 35 25
2001 4 62346 30 20
2002 1 12345 44 24
2002 2 12345 36 26
2002 3 12345 30 20
2002 4 12345 32 22
2002 1 62346 48 28
2002 2 62346 20 35
2002 3 62346 89 101
2002 4 62346 37 27
Med
ID postcode treatmentDate_start treatmentDate_end
1 12345 2001-01-15 2001-03-16
2 62346 2001-01-15 2001-02-16
3 12345 2002-02-21 2002-03-16
4 12345 2002-02-15 2002-04-16
4 62346 2002-03-16 2002-04-30
The idea is to link the (pollution) data with the (Med) data by the date and postcode.
To do that, I need to calculate an average pollution level based upon the number of days of exposure at a particular level (PM10, NO2).
First creating a column called num_of_day to calculate the day length in each month for the periods between the start and end date of treatment in the medical data frame. The subtracting idea between the end date to start date has been found not precise.
For Example ( I will take ID number 1 with a postcode of 12345 just for an explanation of how I calculated the pollution average for PM10 and NO2 by putting into consideration the days)
Med
ID postcode treatmentDate_start treatmentDate_end
1 12345 2001-01-15 2001-03-16
Pollution
year month postcode PM10 NO2
2001 1 12345 40 20
2001 2 12345 30 25
2001 3 12345 35 25
The air pollution of PM10 and NO2 values for the periods between 2001-01-15 and 2001-03-16 will be as follow:
The treatmentDate_start (2001-01-15): its PM10 = 40 and NO2 = 20.
The period in between (2001-02, the whole month): its PM10 = 30 and NO2 = 25.
The treatmentDate_end (2001-03-16): its PM10 = 35 and NO2 = 25.
I have to then calculate the day of exposure for those periods each:
the treatmentDate_start (2001-01-15) [January has a total of 31 days]
15/31 = 0.48 days of exposure
the period in between (2001-02, the whole month) [February 2001 has a total of 28 days]
the PM10 and NO2 values should stay the same here, because the pollution measurements in the file are on a monthly basis. So, it will be:
28/28 = 1 days of exposure
the treatmentDate_end (2001-03-16) [March has a total of 31 days]
16/31 = 0.51 days of exposure
Then I can calculate afterwards the pollution average based on the exposure days:
the treatmentDate_start (2001-01-15): exposure days 0.48 * 40 = 19.2 (for PM10) and 0.48 * 20 = 9.6 (for NO2)
the period in between (2001-02, the whole month): 1 * 30 = 30 for PM10 and 1 * 25 = 25 for NO2
the treatmentDate_end (2001-03-16): 0.51 * 35 = 17.85 for PM10 and 0.51 * 25 = 12.75 for NO2
Then add the PM10 together (19.2 + 30 + 17.85 = 67.05).
Then I will divide 67.05 by 3 months (3 months is the period during which the person was exposed to the air pollution during their first treatment), which equals 22.35.
The output should be like below:
ID postcode treatmentDate_start treatmentDate_end PM10 NO2
1 12345 2001-01-15 2001-03-16 22.35 15.78
zoowalk, created the code below based on my previous requirement before I updated the information with the precise day thing. It worked perfectly.
I saw this post: stackoverflow.com/questions/15569333/…. I think it could simplify calculating the precise number of days that I explained above, since it takes into account the fact that not all months and years have the same number of days (e.g., leap years). I still can't figure out how to put this into code together with the other steps, i.e. looking up the postcode, year and month.
I would appreciate extra help with this. I see it as too complex for me.
Does this help? I am not sure I fully understand for what kind of average you are looking for. Why is 70 / 32 days = 40.93?
library(tidyverse)
pollution <- data.frame(
year = c(2001L,2001L,2001L,2001L,2001L,
2001L,2001L,2001L,2002L,2002L,2002L,2002L,2002L,2002L,
2002L,2002L),
month = c(1L,2L,3L,4L,1L,2L,3L,4L,1L,2L,
3L,4L,1L,2L,3L,4L),
postcode = c(12345L,12345L,12345L,12345L,62346L,
62346L,62346L,62346L,12345L,12345L,12345L,12345L,
62346L,62346L,62346L,62346L),
PM10 = c(40L,30L,35L,30L,40L,30L,35L,30L,
44L,36L,30L,32L,48L,20L,89L,37L),
NO2 = c(20L,25L,25L,20L,20L,25L,25L,20L,
24L,26L,20L,22L,28L,35L,101L,27L)
) %>%
mutate(date_floor=paste(year,month, 01, sep="-") %>%
lubridate::ymd())
pollution
#> year month postcode PM10 NO2 date_floor
#> 1 2001 1 12345 40 20 2001-01-01
#> 2 2001 2 12345 30 25 2001-02-01
#> 3 2001 3 12345 35 25 2001-03-01
#> 4 2001 4 12345 30 20 2001-04-01
#> 5 2001 1 62346 40 20 2001-01-01
#> 6 2001 2 62346 30 25 2001-02-01
#> 7 2001 3 62346 35 25 2001-03-01
#> 8 2001 4 62346 30 20 2001-04-01
#> 9 2002 1 12345 44 24 2002-01-01
#> 10 2002 2 12345 36 26 2002-02-01
#> 11 2002 3 12345 30 20 2002-03-01
#> 12 2002 4 12345 32 22 2002-04-01
#> 13 2002 1 62346 48 28 2002-01-01
#> 14 2002 2 62346 20 35 2002-02-01
#> 15 2002 3 62346 89 101 2002-03-01
#> 16 2002 4 62346 37 27 2002-04-01
med <- data.frame(
stringsAsFactors = FALSE,
ID = c(1L, 2L, 3L, 4L, 4L),
postcode = c(12345L, 62346L, 12345L, 12345L, 62346L),
treatmentDate_start = c("15/01/2001",
"15/01/2001","21/02/2002","15/03/2002","16/04/2002"),
treatmentDate_end = c("16/02/2001",
"16/02/2001","16/03/2002","16/04/2002","30/04/2002")
)
# Reshape the treatment table to one row per (patient, date), and tag each
# date with the first day of its month so it can be matched against the
# monthly pollution table
med <- med %>%
mutate(across(.cols=contains("Date"), lubridate::dmy)) %>% #convert to class date
pivot_longer(cols=contains("treatment"),
names_to = "date_type",
values_to = "date") %>%
mutate(date_floor=lubridate::floor_date(date,
unit="month"))
# No `by` is given, so the join uses the columns the two tables share
# (postcode and date_floor, per the message printed below)
df_join <- med %>%
left_join(., pollution) %>%
select(-date_floor)
#> Joining, by = c("postcode", "date_floor")
df_join
#> # A tibble: 10 x 8
#> ID postcode date_type date year month PM10 NO2
#> <int> <int> <chr> <date> <int> <int> <int> <int>
#> 1 1 12345 treatmentDate_start 2001-01-15 2001 1 40 20
#> 2 1 12345 treatmentDate_end 2001-02-16 2001 2 30 25
#> 3 2 62346 treatmentDate_start 2001-01-15 2001 1 40 20
#> 4 2 62346 treatmentDate_end 2001-02-16 2001 2 30 25
#> 5 3 12345 treatmentDate_start 2002-02-21 2002 2 36 26
#> 6 3 12345 treatmentDate_end 2002-03-16 2002 3 30 20
#> 7 4 12345 treatmentDate_start 2002-03-15 2002 3 30 20
#> 8 4 12345 treatmentDate_end 2002-04-16 2002 4 32 22
#> 9 4 62346 treatmentDate_start 2002-04-16 2002 4 37 27
#> 10 4 62346 treatmentDate_end 2002-04-30 2002 4 37 27
# Put the start and end dates back into separate columns, one row per
# ID/postcode/month. A month row that only has an end date gets its start set
# to the first day of that month; a month row that only has a start date gets
# its end set by ceiling_date() (the first day of the following month in the
# output below). duration is the difference between the two dates.
df_join <- df_join %>%
pivot_wider(id_cols=c(ID, postcode, year, month, PM10, NO2),
names_from = date_type,
values_from = date) %>%
mutate(treatmentDate_start = case_when(is.na(treatmentDate_start) ~ lubridate::floor_date(treatmentDate_end, unit="month"),
TRUE ~ as.Date(treatmentDate_start ))) %>%
mutate(treatmentDate_end = case_when(is.na(treatmentDate_end) ~ lubridate::ceiling_date(treatmentDate_start, unit="month"),
TRUE ~ as.Date(treatmentDate_end ))) %>%
mutate(duration=treatmentDate_end-treatmentDate_start)
#this is basically all the info you need.
glimpse(df_join)
#> Rows: 9
#> Columns: 9
#> $ ID <int> 1, 1, 2, 2, 3, 3, 4, 4, 4
#> $ postcode <int> 12345, 12345, 62346, 62346, 12345, 12345, 12345...
#> $ year <int> 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002,...
#> $ month <int> 1, 2, 1, 2, 2, 3, 3, 4, 4
#> $ PM10 <int> 40, 30, 40, 30, 36, 30, 30, 32, 37
#> $ NO2 <int> 20, 25, 20, 25, 26, 20, 20, 22, 27
#> $ treatmentDate_start <date> 2001-01-15, 2001-02-01, 2001-01-15, 2001-02-01...
#> $ treatmentDate_end <date> 2001-02-01, 2001-02-16, 2001-02-01, 2001-02-16...
#> $ duration <drtn> 17 days, 15 days, 17 days, 15 days, 8 days, 15...
# Sum PM10, NO2 and duration per ID/postcode, then divide the summed pollutant
# values by the summed duration.
# NOTE(review): this divides a plain sum of monthly values by the total number
# of days; it does not weight each month's value by its days of exposure.
# Confirm that this is the intended statistic.
df_join %>%
group_by(ID, postcode) %>%
summarise(across(.cols=c(PM10, NO2, duration), .fns=sum)) %>%
mutate(across(.cols=c(PM10, NO2), .fns=function(x) x/as.numeric(duration)))
#> `summarise()` regrouping output by 'ID' (override with `.groups` argument)
#> # A tibble: 5 x 5
#> # Groups: ID [4]
#> ID postcode PM10 NO2 duration
#> <int> <int> <dbl> <dbl> <drtn>
#> 1 1 12345 2.19 1.41 32 days
#> 2 2 62346 2.19 1.41 32 days
#> 3 3 12345 2.87 2 23 days
#> 4 4 12345 1.94 1.31 32 days
#> 5 4 62346 2.64 1.93 14 days
Created on 2020-12-01 by the reprex package (v0.3.0)
I updated @zoowalk's code to meet my updated requirement, since no one has provided help so far. Still, I found that it does not generate the correct answer. However, my issue has been solved by using Excel; I found this case difficult to handle in R.
library(tidyverse)
library(lubridate)
pollution <- data.frame(
year = c(2001L,2001L,2001L,2001L,2001L,
2001L,2001L,2001L,2002L,2002L,2002L,2002L,2002L,2002L,
2002L,2002L),
month = c(1L,2L,3L,4L,1L,2L,3L,4L,1L,2L,
3L,4L,1L,2L,3L,4L),
postcode = c(12345L,12345L,12345L,12345L,62346L,
62346L,62346L,62346L,12345L,12345L,12345L,12345L,
62346L,62346L,62346L,62346L),
PM10 = c(40L,30L,35L,30L,40L,30L,35L,30L,
44L,36L,30L,32L,48L,20L,89L,37L),
NO2 = c(20L,25L,25L,20L,20L,25L,25L,20L,
24L,26L,20L,22L,28L,35L,101L,27L)
) %>%
mutate(date_floor=paste(year,month, 01, sep="-") %>%
lubridate::ymd())
med <- data.frame(
stringsAsFactors = FALSE,
ID = c(1L, 2L, 3L, 4L, 4L),
postcode = c(12345L, 62346L, 12345L, 12345L, 62346L),
treatmentDate_start = c("15/01/2001",
"15/01/2001","21/02/2002","15/02/2002","16/03/2002"),
treatmentDate_end = c("16/03/2001",
"16/02/2001","16/03/2002","16/04/2002","30/04/2002")
)
# Reshape the treatment table to one row per (patient, date), and tag each
# date with the first day of its month so it can be matched against the
# monthly pollution table
med <- med %>%
mutate(across(.cols=contains("Date"), lubridate::dmy)) %>% #convert to class date
pivot_longer(cols=contains("treatment"),
names_to = "date_type",
values_to = "date") %>%
mutate(date_floor=lubridate::floor_date(date,
unit="month"))
# No `by` is given, so the join uses the columns the two tables share
# (postcode and date_floor, per the message printed below)
df_join <- med %>%
left_join(., pollution) %>%
select(-date_floor)
#> Joining, by = c("postcode", "date_floor")
# Put the start and end dates back into separate columns, one row per
# ID/postcode/month, filling a missing start with the first day of the end
# date's month and a missing end with ceiling_date() of the start date.
# NOTE(review): only the months containing the start and end dates are joined
# above, so months lying entirely between them presumably get no row here -
# confirm.
df_join <- df_join %>%
pivot_wider(id_cols=c(ID, postcode, year, month, PM10, NO2),
names_from = date_type,
values_from = date) %>%
mutate(treatmentDate_start = case_when(is.na(treatmentDate_start) ~ lubridate::floor_date(treatmentDate_end, unit="month"),
TRUE ~ as.Date(treatmentDate_start ))) %>%
mutate(treatmentDate_end = case_when(is.na(treatmentDate_end) ~ lubridate::ceiling_date(treatmentDate_start, unit="month"),
TRUE ~ as.Date(treatmentDate_end ))) %>%
# duration is expressed in years here, not in days
mutate(duration= lubridate::time_length(difftime(treatmentDate_end, treatmentDate_start), "years"))

how to replace missing values with previous year's binned mean

I have a data frame as below
p1_bin and f1_bin are calculated by cut function by me with
# Assign each value of x to one of 200 numbered bins: (0, 1], (1, 6],
# (6, 11], ... Values outside (0, 996] become NA.
Bins <- function(x) {
  edges <- c(0, seq(1, 1000, by = 5))
  cut(x, breaks = edges, labels = seq_len(length(edges) - 1L))
}
# Apply Bins() to every column except the first
# NOTE(review): sapply() may simplify the factor results to a matrix, so the
# binned columns are presumably not factors - confirm the column types
binned <- as.data.frame (sapply(df[,-1], Bins))
# Prefix the binned columns with "Bin_" so they do not clash with the originals
colnames(binned) <- paste("Bin", colnames(binned), sep = "_")
# Append the binned columns to the original data
df<- cbind(df, binned)
Now, how do I calculate the mean of the previous two years' values within the same bin and use it to replace the NA values?
For example: at row 5, p1 is NA and f1 is 30, with corresponding bin 7. The NA should be replaced with the mean of the previous two years' values for the same bin (7), i.e.
df
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 NA 30 NA 7
6 2016 10 NA 2 NA
df1
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 **22** 30 NA 7
6 2016 10 **16.5** 2 NA
Thanks in advance
I believe the following code produces the desired output. There's probably a much more elegant way than using mean(rev(lag(f1))[1:2]) to get the average of the last two values of f1 but this should do the trick anyway.
library(dplyr)
df %>%
  arrange(year) %>%
  # Convert p1 and f1 to double
  mutate(across(c(p1, f1), as.double)) %>%
  # Within each p1 bin, replace a missing f1. lag() shifts the group down by
  # one row and rev() reverses it, so [1:2] picks the two values just before
  # the group's last row; their mean is the replacement.
  group_by(Bin_p1) %>%
  mutate(f1 = ifelse(is.na(f1), mean(rev(lag(f1))[1:2]), f1)) %>%
  # Same for a missing p1, within each f1 bin
  group_by(Bin_f1) %>%
  mutate(p1 = ifelse(is.na(p1), mean(rev(lag(p1))[1:2]), p1)) %>%
  ungroup()
and the output is:
# A tibble: 6 x 6
ID year p1 f1 Bin_p1 Bin_f1
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2013 20 30.0 5 7
2 2 2013 24 29.0 5 7
3 3 2014 10 16.0 2 3
4 4 2014 11 17.0 2 3
5 5 2015 22 30.0 NA 7
6 6 2016 10 16.5 2 NA

Resources