Add missing months for a range of date in R - r

Say I have a data.frame as follows, each month has one entry of data:
# Example data parsed from inline CSV text: one `date` and one `gmsl` value
# per observed month (several months of 2009 are absent).
df <- read.table(
  header = TRUE,
  sep = ",",
  text = "date,gmsl
2009-01-17,58.4
2009-02-17,59.1
2009-04-16,60.9
2009-06-16,62.3
2009-09-16,64.6
2009-12-16,68.3"
)
## > df
## date gmsl
## 1 2009-01-17 58.4
## 2 2009-02-17 59.1
## 3 2009-04-16 60.9
## 4 2009-06-16 62.3
## 5 2009-09-16 64.6
## 6 2009-12-16 68.3
How can I fill in the missing months, with gmsl set to NaN, for the date range from 2009-01 to 2009-12?
I have extracted year and month for date column by df$Month_Yr <- format(as.Date(df$date), "%Y-%m").

Here's a way to do this with tidyr::complete:
library(dplyr)
# Anchor every observation to the first day of its month, then have
# tidyr::complete() add a row for each month start between the earliest and
# latest observed month that has no observation; `date` and `gmsl` are NA in
# the added rows.
df %>%
  mutate(date = as.Date(date)) %>%
  mutate(first_date = as.Date(format(date, "%Y-%m-01"))) %>%
  tidyr::complete(
    first_date = seq(min(first_date), max(first_date), by = "1 month")
  )
# A tibble: 12 x 3
# first_date date gmsl
# <date> <date> <dbl>
# 1 2009-01-01 2009-01-17 58.4
# 2 2009-02-01 2009-02-17 59.1
# 3 2009-03-01 NA NA
# 4 2009-04-01 2009-04-16 60.9
# 5 2009-05-01 NA NA
# 6 2009-06-01 2009-06-16 62.3
# 7 2009-07-01 NA NA
# 8 2009-08-01 NA NA
# 9 2009-09-01 2009-09-16 64.6
#10 2009-10-01 NA NA
#11 2009-11-01 NA NA
#12 2009-12-01 2009-12-16 68.3
You can then decide which column to keep, either first_date or date or combine them both.
data
# Reproducible copy of the example data: `date` is stored as a factor (six
# levels, one per observation) and `gmsl` as a numeric vector.
df <- structure(list(date = structure(1:6, .Label = c("2009-01-17",
"2009-02-17", "2009-04-16", "2009-06-16", "2009-09-16", "2009-12-16"
), class = "factor"), gmsl = c(58.4, 59.1, 60.9, 62.3, 64.6,
68.3)), class = "data.frame", row.names = c(NA, -6L))

In base R you could match (using %in%) the substrings of a seq.Date.
# Every month start of the requested range. The endpoints are Date objects,
# so seq() steps one calendar month at a time and returns Dates.
dt.match <- seq(as.Date("2009-01-01"), as.Date("2009-12-01"), by = "month")
# Month starts whose "YYYY-MM" prefix does not occur in dat$date.
missing_months <-
  dt.match[!format(dt.match, "%Y-%m") %in% substr(dat$date, 1, 7)]
# Build the filler rows as a data.frame with a typed NA: cbind() would produce
# a character matrix and turn gmsl into text after the merge. rep() keeps the
# column lengths equal even when no month is missing.
sub <- data.frame(
  date = format(missing_months),
  gmsl = rep(NA_real_, length(missing_months)),
  stringsAsFactors = FALSE
)
# Both columns are shared, so all = TRUE keeps the rows of both inputs and
# returns them sorted by date.
merge(dat, sub, all = TRUE)
# date gmsl
# 1 2009-01-17 58.4
# 2 2009-02-17 59.1
# 3 2009-03-01 NA
# 4 2009-04-16 60.9
# 5 2009-05-01 NA
# 6 2009-06-16 62.3
# 7 2009-07-01 NA
# 8 2009-08-01 NA
# 9 2009-09-16 64.6
# 10 2009-10-01 NA
# 11 2009-11-01 NA
# 12 2009-12-16 68.3
Data
# Input for the base R answer: `date` is a character vector (so substr() can
# slice the "YYYY-MM" prefix directly) and `gmsl` is numeric.
dat <- structure(list(date = c("2009-01-17", "2009-02-17", "2009-04-16",
"2009-06-16", "2009-09-16", "2009-12-16"), gmsl = c(58.4, 59.1,
60.9, 62.3, 64.6, 68.3)), row.names = c(NA, -6L), class = "data.frame")

Related

calculate 5 year average of panel data with factors kept

I have a panel data set that may look like
set.seed(123)
df <- data.frame(
year = rep(2011:2020,5),
county = rep(c("a","b",'c','d','e'), each=10),
state = rep(c("A","B",'C','D','E'), each=10),
country = rep(c("AA","BB",'CC','DD','EE'), each=10),
var1 = runif(50, 0, 50),
var2 = runif(50, 50, 100)
)
I want to transform the panel data set to 5 year averages of the counties by
df <- df %>%
mutate(period = cut(df$year, seq(2011, 2021, by = 5),right = F)) %>%
group_by(county, period) %>%
summarise_all(mean)
The data set looks like
county period year state country var1 var2
<chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a [2011,2016) 2013 NA NA 33.1 69.7
2 a [2016,2021) 2018 NA NA 24.7 73.6
3 b [2011,2016) 2013 NA NA 27.6 72.3
4 b [2016,2021) 2018 NA NA 24.7 83.1
5 c [2011,2016) 2013 NA NA 38.7 75.7
6 c [2016,2021) 2018 NA NA 22.8 66.8
7 d [2011,2016) 2013 NA NA 33.8 72.2
8 d [2016,2021) 2018 NA NA 20.0 83.7
9 e [2011,2016) 2013 NA NA 14.9 71.0
10 e [2016,2021) 2018 NA NA 19.6 70.4
The warning messages are, for example
In mean.default(state) :
argument is not numeric or logical: returning NA
Is there a smart way (not by merging as actually, I have a lot of character columns) to keep the time-invariant character of each county after the transformation?
What I desire is
county period year state country var1 var2
<chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a [2011,2016) 2013 A AA 33.1 69.7
2 a [2016,2021) 2018 A AA 24.7 73.6
3 b [2011,2016) 2013 B BB 27.6 72.3
4 b [2016,2021) 2018 B BB 24.7 83.1
5 c [2011,2016) 2013 C CC 38.7 75.7
6 c [2016,2021) 2018 C CC 22.8 66.8
7 d [2011,2016) 2013 D DD 33.8 72.2
8 d [2016,2021) 2018 D DD 20.0 83.7
9 e [2011,2016) 2013 E EE 14.9 71.0
10 e [2016,2021) 2018 E EE 19.6 70.4
Thank you in advance!
The warning arises because summarise_all(mean) calculates averages not only of var1 and var2 but also of state and country. If you want to keep state and country as grouping columns, you should put them into group_by():
library(dplyr)
# Group by every time-invariant identifier plus the 5-year period, so the
# character columns are carried along as keys instead of being averaged.
df %>%
  group_by(
    county, state, country,
    period = cut(year, seq(2011, 2021, by = 5), right = FALSE)
  ) %>%
  # across(everything(), mean) averages all non-grouping columns, replacing
  # the superseded summarise_all(); .groups = "drop" returns the result
  # ungrouped, so no separate ungroup() step is needed.
  summarise(across(everything(), mean), .groups = "drop")
# # A tibble: 10 × 7
# county state country period year var1 var2
# <chr> <chr> <chr> <fct> <dbl> <dbl> <dbl>
# 1 a A AA [2011,2016) 2013 33.1 69.7
# 2 a A AA [2016,2021) 2018 24.7 73.6
# 3 b B BB [2011,2016) 2013 27.6 72.3
# 4 b B BB [2016,2021) 2018 24.7 83.1
# 5 c C CC [2011,2016) 2013 38.7 75.7
# 6 c C CC [2016,2021) 2018 22.8 66.8
# 7 d D DD [2011,2016) 2013 33.8 72.2
# 8 d D DD [2016,2021) 2018 20.0 83.7
# 9 e E EE [2011,2016) 2013 14.9 71.0
# 10 e E EE [2016,2021) 2018 19.6 70.4
If the grouping columns are simply county and period, and other categorical variables are unique in each group, you could keep them by just leaving the first values with first() while doing summarise().
# Aggregate by county and 5-year period only: non-numeric columns keep the
# first value found in each group, numeric columns are averaged.
df %>%
  mutate(period = cut(year, seq(2011, 2021, by = 5), right = FALSE)) %>%
  group_by(county, period) %>%
  summarise(
    across(!where(is.numeric), first),
    across(where(is.numeric), mean)
  ) %>%
  ungroup()

How to find mean value using multiple columns of a R data.frame?

I am trying to find the mean of A and Z for each row and save it as a separate column, but it seems the code only averages the first row and fills the rest of the rows with that value. Any suggestions on how to fix this?
library(tidyverse)
library(lubridate)
set.seed(123)
DF <- data.frame(Date = seq(as.Date("2001-01-01"), to = as.Date("2003-12-31"), by = "day"),
A = runif(1095, 1,60),
Z = runif(1095, 5,100)) %>%
mutate(MeanofAandZ= mean(A:Z))
Are you looking for this:
DF %>% rowwise() %>% mutate(MeanofAandZ = mean(c_across(A:Z)))
# A tibble: 1,095 x 4
# Rowwise:
Date A Z MeanofAandZ
<date> <dbl> <dbl> <dbl>
1 2001-01-01 26.5 7.68 17.1
2 2001-01-02 54.9 33.1 44.0
3 2001-01-03 37.1 82.0 59.5
4 2001-01-04 6.91 18.0 12.4
5 2001-01-05 53.0 8.76 30.9
6 2001-01-06 26.1 7.63 16.9
7 2001-01-07 59.3 30.8 45.0
8 2001-01-08 39.9 14.6 27.3
9 2001-01-09 59.2 93.6 76.4
10 2001-01-10 30.7 89.1 59.9
you can do it with Base R: rowMeans
Full Base R:
DF$MeanofAandZ <- rowMeans(DF[c("A", "Z")])
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
or inside a mutate:
library(dplyr)
DF <- DF %>% mutate(MeanofAandZ = rowMeans(cbind(A,Z)))
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
We can also do
DF$MeanofAandZ <- Reduce(`+`, DF[c("A", "Z")])/2
Or using apply
DF$MeanofAandZ <- apply(DF[c("A", "Z")], 1, mean)

how to fill the NA values by using known formula from another dataframe in r

I have a dataframe named 'test1' like below, (here 'day' are "POSIXt" objects)
day Rain SWC_11 SWC_12 SWC_13 SWC_14 SWC_21
01/01/2019 00:00:00 0.0 51 60 63 60 64
02/01/2019 00:00:00 0.2 51.5 60.3 63.4 60.8 64.4
03/01/2019 00:00:00 0.0 51.3 60.3 63.3 60.6 64.1
04/01/2019 00:00:00 0.4 NA NA NA NA NA
05/01/2019 00:00:00 0.0 NA NA NA NA NA
06/01/2019 00:00:00 0.0 NA NA NA NA NA
07/01/2019 00:00:00 0.0 NA NA NA NA NA
08/01/2019 00:00:00 0.0 NA NA NA NA NA
09/01/2019 00:00:00 0.0 NA NA NA NA NA
10/01/2019 00:00:00 0.0 NA NA NA NA NA
And another dataframe named 'test2', like below
SWC_11_(Intercept) SWC_11_slope SWC_12_(Intercept) SWC_12_slope SWC_13_(Intercept) SWC_13_slope SWC_14_(Intercept) SWC_14_slope SWC_21(Intercept) SWC_21_slope
10471.95 -6.563423e-06 4063.32 -2.525118e-06 75040.76 -4.726106e-05 7742.763 -4.842427e-06 22965.85 -1.443707e-05
What I want to do now is fill in the missing (NA) values with the corresponding coefficients. I would have a model like this:
missing variables of SWC_11= SWC_11_(Intercept) + SWC_11_slope*day
missing variables of SWC_12= SWC_12_(Intercept)+ SWC_12_slope*day
Other columns are in the same manner. I think here sapply function should help,
test1<- data.frame(sapply(test2, function(x) )))
But now I was kind of confused about how to write the function part then. Hope someone could help. Thanks.
I would suggest a tidyverse approach where you reshape your data and then merge in order to compute the values for the missing variables. I am not clear about the day so what I did is extract the day from your date variable but you can change that if it is necessary. You have to do some cleaning steps for your variable names but all is in the code. Here the solution:
library(tidyverse)
# Reshape the one-row coefficient table `test2` into one row per SWC column
# (name, Intercept, slope), attach it to `test1` in long form, and fill each
# missing reading with Intercept + slope * Day.
test2 %>%
  pivot_longer(everything()) %>%
  # Tag each coefficient by kind before the suffix is stripped from `name`.
  mutate(name2 = ifelse(grepl('Intercept', name), 'Intercept', 'slope')) %>%
  # Drop the coefficient suffix; the first six characters are the SWC id.
  mutate(
    name = gsub('Intercept|slope', '', name),
    name = substr(name, 1, 6)
  ) %>%
  # One row per SWC id with Intercept and slope side by side.
  pivot_wider(names_from = name2, values_from = value) %>%
  # Join on the SWC id. The key is stated explicitly so it is not inferred
  # from whichever column names the two tables happen to share.
  left_join(
    test1 %>%
      pivot_longer(-c(day, Rain)) %>%
      # Day of month taken from the "dd/mm/YYYY ..." text in `day`.
      mutate(Day = as.numeric(format(as.Date(day, '%d/%m/%Y'), '%d'))),
    by = "name"
  ) %>%
  # Keep observed values; use the model value only where one is missing.
  mutate(value2 = ifelse(is.na(value), Intercept + slope * Day, value)) %>%
  select(name, day, Rain, value2) %>%
  # Back to one column per SWC id.
  pivot_wider(names_from = name, values_from = value2)
Output:
# A tibble: 10 x 7
day Rain SWC_11 SWC_12 SWC_13 SWC_14 SWC_21
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 01/01/2019 00:00:00 0 51 60 63 60 64
2 02/01/2019 00:00:00 0.2 51.5 60.3 63.4 60.8 64.4
3 03/01/2019 00:00:00 0 51.3 60.3 63.3 60.6 64.1
4 04/01/2019 00:00:00 0.4 10472. 4063. 75041. 7743. 22966.
5 05/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
6 06/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
7 07/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
8 08/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
9 09/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
10 10/01/2019 00:00:00 0 10472. 4063. 75041. 7743. 22966.
Some data used:
#Data 1
test1 <- structure(list(day = c("01/01/2019 00:00:00", "02/01/2019 00:00:00",
"03/01/2019 00:00:00", "04/01/2019 00:00:00", "05/01/2019 00:00:00",
"06/01/2019 00:00:00", "07/01/2019 00:00:00", "08/01/2019 00:00:00",
"09/01/2019 00:00:00", "10/01/2019 00:00:00"), Rain = c(0, 0.2,
0, 0.4, 0, 0, 0, 0, 0, 0), SWC_11 = c(51, 51.5, 51.3, NA, NA,
NA, NA, NA, NA, NA), SWC_12 = c(60, 60.3, 60.3, NA, NA, NA, NA,
NA, NA, NA), SWC_13 = c(63, 63.4, 63.3, NA, NA, NA, NA, NA, NA,
NA), SWC_14 = c(60, 60.8, 60.6, NA, NA, NA, NA, NA, NA, NA),
SWC_21 = c(64, 64.4, 64.1, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-10L), class = "data.frame")
#Data2
test2 <- structure(list(SWC_11_.Intercept. = 10471.95, SWC_11_slope = -6.563423e-06,
SWC_12_.Intercept. = 4063.32, SWC_12_slope = -2.525118e-06,
SWC_13_.Intercept. = 75040.76, SWC_13_slope = -4.726106e-05,
SWC_14_.Intercept. = 7742.763, SWC_14_slope = -4.842427e-06,
SWC_21.Intercept. = 22965.85, SWC_21_slope = -1.443707e-05), class = "data.frame", row.names = c(NA,
-1L))
Conceptually this one is similar to @Duck's solution, but in perhaps fewer steps.
library(dplyr)
library(tidyr)
library(lubridate)
# Fill each missing SWC reading in test1 with Intercept + slope * day-of-month,
# using the coefficients stored in the one-row table test2.
# NOTE(review): day(day) presumably needs `day` to be a date-time column (the
# output below prints it as <dttm>) - confirm before running on text dates.
test2 %>%
# Reshape to one row per SWC id: the first capture group becomes `name`, the
# second (slope / Intercept) supplies the output column names via '.value'
pivot_longer(cols = everything(), names_to = c('name', '.value'),
names_pattern = '(SWC_\\d+).*(slope|Intercept)') %>%
# Attach the coefficients to test1 in long form (one row per day and SWC id)
right_join(test1 %>% pivot_longer(cols = contains('SWC')), by = 'name') %>%
# Keep the observed value when present, otherwise use the model value
mutate(value = coalesce(value, Intercept + slope * day(day))) %>%
select(-Intercept, -slope) %>%
# Spread back to one column per SWC id (default names_from / values_from)
pivot_wider()
# A tibble: 10 x 7
# day Rain SWC_11 SWC_12 SWC_13 SWC_14 SWC_21
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2019-01-01 00:00:00 0 51 60 63 60 64
# 2 2019-01-02 00:00:00 0.2 51.5 60.3 63.4 60.8 64.4
# 3 2019-01-03 00:00:00 0 51.3 60.3 63.3 60.6 64.1
# 4 2019-01-04 00:00:00 0.4 10472. 4063. 75041. 7743. 22966.
# 5 2019-01-05 00:00:00 0 10472. 4063. 75041. 7743. 22966.
# 6 2019-01-06 00:00:00 0 10472. 4063. 75041. 7743. 22966.
# 7 2019-01-07 00:00:00 0 10472. 4063. 75041. 7743. 22966.
# 8 2019-01-08 00:00:00 0 10472. 4063. 75041. 7743. 22966.
# 9 2019-01-09 00:00:00 0 10472. 4063. 75041. 7743. 22966.
#10 2019-01-10 00:00:00 0 10472. 4063. 75041. 7743. 22966.

Replace all duplicated with na

My question is similar to replace duplicate values with NA in time series data using dplyr but while applying to other time series which are like below :
box_num date x y
6-WQ 2018-11-18 20.2 8
6-WQ 2018-11-25 500.75 7.2
6-WQ 2018-12-2 500.75 23
25-LR 2018-11-18 374.95 4.3
25-LR 2018-11-25 0.134 9.3
25-LR 2018-12-2 0.134 4
73-IU 2018-12-2 225.54 0.7562
73-IU 2018-12-9 28 0.7562
73-IU 2018-12-16 225.54 52.8
library(dplyr)
df %>%
group_by(box_num) %>%
mutate_at(vars(x:y), funs(replace(., duplicated(.), NA)))
The above code identifies duplicated values and replaces them with NA. The underlying problem is that, since this is a time series, I want to replace all NA values with a linear trend in the next step. For box_num 6-WQ there is a large shift right after 20.2, which suggests the repeated value is an imputed one, so I would like to replace both occurrences of the imputed value with NA. The other case is box_num 73-IU, where the imputed value was entered again with one week in between; there, too, I would like to replace every occurrence of the imputed value with NA.
Expected output :
box_num date x y
6-WQ 2018-11-18 20.2 8
6-WQ 2018-11-25 NA 7.2
6-WQ 2018-12-2 NA 23
25-LR 2018-11-18 374.95 4.3
25-LR 2018-11-25 NA 9.3
25-LR 2018-12-2 NA 4
73-IU 2018-12-2 NA NA
73-IU 2018-12-9 28 NA
73-IU 2018-12-16 NA 52.8
# Replace every value that occurs more than once in `x` with NA.
#
# All copies of a repeated value are blanked, including its first occurrence.
# duplicated() is evaluated from both ends, so values are compared directly
# and the type of `x` is left untouched.
#
# x: an atomic vector or a factor.
# Returns `x` with every repeated value set to NA.
foo <- function(x) {
  repeated <- duplicated(x) | duplicated(x, fromLast = TRUE)
  x[repeated] <- NA
  x
}
myCols = c("x", "y")
df1[myCols] = lapply(df1[myCols], foo)
df1
# box_num date x y
#1 6-WQ 2018-11-18 20.20 8.0
#2 6-WQ 2018-11-25 NA 7.2
#3 6-WQ 2018-12-2 NA 23.0
#4 25-LR 2018-11-18 374.95 4.3
#5 25-LR 2018-11-25 NA 9.3
#6 25-LR 2018-12-2 NA 4.0
#7 73-IU 2018-12-2 NA NA
#8 73-IU 2018-12-9 28.00 NA
#9 73-IU 2018-12-16 NA 52.8
#DATA
df1 = structure(list(box_num = c("6-WQ", "6-WQ", "6-WQ", "25-LR", "25-LR",
"25-LR", "73-IU", "73-IU", "73-IU"), date = c("2018-11-18", "2018-11-25",
"2018-12-2", "2018-11-18", "2018-11-25", "2018-12-2", "2018-12-2",
"2018-12-9", "2018-12-16"), x = c(20.2, 500.75, 500.75, 374.95,
0.134, 0.134, 225.54, 28, 225.54), y = c(8, 7.2, 23, 4.3, 9.3,
4, 0.7562, 0.7562, 52.8)), class = "data.frame", row.names = c(NA,
-9L))
With tidyverse you can do:
# Within each box_num, set every value of x and y that occurs more than once
# to NA (all copies, not only the later repeats). across() with a lambda
# replaces the superseded mutate_at() and the deprecated funs(); the repeated
# values are found with duplicated() instead of running rle(sort(.)) twice.
df %>%
  group_by(box_num) %>%
  mutate(across(x:y, ~ replace(.x, .x %in% .x[duplicated(.x)], NA)))
box_num date x y
<fct> <fct> <dbl> <dbl>
1 6-WQ 2018-11-18 20.2 8.00
2 6-WQ 2018-11-25 NA 7.20
3 6-WQ 2018-12-2 NA 23.0
4 25-LR 2018-11-18 375. 4.30
5 25-LR 2018-11-25 NA 9.30
6 25-LR 2018-12-2 NA 4.00
7 73-IU 2018-12-2 NA NA
8 73-IU 2018-12-9 28.0 NA
9 73-IU 2018-12-16 NA 52.8
First, it sorts the values in "x" and "y" and computes the run length of equal values. Second, it creates a subset for those values that have a run length > 1. Finally, it compares whether the values in "x" and "y" are in the subset, and if so, they get NA.

How to calculate difference between data in different rows? [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 6 years ago.
Improve this question
I have got monthly data in this format
PrecipMM Date
122.7 2004-01-01
54.2 2005-01-01
31.9 2006-01-01
100.5 2007-01-01
144.9 2008-01-01
96.4 2009-01-01
75.3 2010-01-01
94.8 2011-01-01
67.6 2012-01-01
93.0 2013-01-01
184.6 2014-01-01
101.0 2015-01-01
149.3 2016-01-01
50.2 2004-02-01
46.2 2005-02-01
57.7 2006-02-01
I want to calculate the difference in PrecipMM between the same month of consecutive years.
My dream output is like this:
PrecipMM Date PrecipMM_diff
122.7 2004-01-01 NA
54.2 2005-01-01 -68.5
31.9 2006-01-01 -22.3
100.5 2007-01-01 68.6
144.9 2008-01-01 44.4
96.4 2009-01-01 -48.5
75.3 2010-01-01 -21.2
94.8 2011-01-01 19.5
67.6 2012-01-01 -27.2
93.0 2013-01-01 25.4
184.6 2014-01-01 91.6
101.0 2015-01-01 -83.6
149.3 2016-01-01 48.3
50.2 2004-02-01 NA
46.2 2005-02-01 -4.0
57.7 2006-02-01 11.5
I think diff() can do this but I have no idea how.
I think you can do this with lag combined with group_by from dplyr. Here's how:
library(dplyr)
library(lubridate) # makes dealing with dates easier
# Load your example data
# Sixteen monthly precipitation readings. `Date` is a factor whose level set
# also lists unused numeric-looking strings (e.g. "101.7"); the data only
# reference codes 5:20, i.e. the month starts 1766-01-01 through 1767-04-01.
df <- structure(list(PrecipMM = c(4.4, 66.7, 48.2, 60.9, 108.1, 109.2,
101.7, 38.1, 53.8, 71.9, 75.4, 67.1, 92.7, 115.3, 68.9, 38.9),
Date = structure(5:20, .Label = c("101.7", "108.1", "109.2",
"115.3", "1766-01-01", "1766-02-01", "1766-03-01", "1766-04-01",
"1766-05-01", "1766-06-01", "1766-07-01", "1766-08-01", "1766-09-01",
"1766-10-01", "1766-11-01", "1766-12-01", "1767-01-01", "1767-02-01",
"1767-03-01", "1767-04-01", "38.1", "38.9", "4.4", "48.2",
"53.8", "60.9", "66.7", "67.1", "68.9", "71.9", "75.4", "92.7"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L), .Names = c("PrecipMM", "Date"))
# Difference between each month's precipitation and the same month one year
# earlier. Rows are sorted by month and then year before grouping: arrange()
# ignores grouping unless .by_group = TRUE, so sorting by `years` alone would
# list all of one year before the next instead of keeping each month together.
results <- df %>%
  mutate(years = year(Date), months = month(Date)) %>%
  arrange(months, years) %>%
  group_by(months) %>%
  # Within a month group the previous row is the previous year's reading.
  mutate(lagged.rain = lag(PrecipMM), rain.diff = PrecipMM - lagged.rain)
results
# Source: local data frame [16 x 6]
# Groups: months [12]
#
# PrecipMM Date years months lagged.rain rain.diff
# (dbl) (fctr) (dbl) (dbl) (dbl) (dbl)
# 1 4.4 1766-01-01 1766 1 NA NA
# 2 92.7 1767-01-01 1767 1 4.4 88.3
# 3 66.7 1766-02-01 1766 2 NA NA
# 4 115.3 1767-02-01 1767 2 66.7 48.6
# 5 48.2 1766-03-01 1766 3 NA NA
# 6 68.9 1767-03-01 1767 3 48.2 20.7
# 7 60.9 1766-04-01 1766 4 NA NA
# 8 38.9 1767-04-01 1767 4 60.9 -22.0
# 9 108.1 1766-05-01 1766 5 NA NA
# 10 109.2 1766-06-01 1766 6 NA NA
# 11 101.7 1766-07-01 1766 7 NA NA
# 12 38.1 1766-08-01 1766 8 NA NA
# 13 53.8 1766-09-01 1766 9 NA NA
# 14 71.9 1766-10-01 1766 10 NA NA
# 15 75.4 1766-11-01 1766 11 NA NA
# 16 67.1 1766-12-01 1766 12 NA NA

Resources