Calculating mean date by row - r

I wish to obtain the mean date by row, where each row contains two dates. Eventually I found a way, posted below. However, the approach I used seems rather cumbersome. Is there a better way?
# Example data: one observation per row, each carrying two dates that are
# split into separate month / day / year columns.
my.data <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE
1 3 6 2012 3 10 2012 1
2 3 10 2012 3 20 2012 1
3 3 16 2012 3 30 2012 1
4 3 20 2012 4 8 2012 1
5 3 20 2012 4 9 2012 1
6 3 20 2012 4 10 2012 1
7 3 20 2012 4 11 2012 1
8 4 4 2012 4 5 2012 1
9 4 6 2012 4 6 2012 1
10 4 6 2012 4 7 2012 1
"
)
my.data

# Glue each date's components into a "month day year" string and parse the
# result into a Date column.
my.data$MY.DATE1 <- with(
  my.data,
  as.Date(paste(MONTH1, DAY1, YEAR1), format = "%m %d %Y")
)
my.data$MY.DATE2 <- with(
  my.data,
  as.Date(paste(MONTH2, DAY2, YEAR2), format = "%m %d %Y")
)
my.data
desired.result = read.table(text = "
OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE MY.DATE1 MY.DATE2 mean.date
1 3 6 2012 3 10 2012 1 2012-03-06 2012-03-10 2012-03-08
2 3 10 2012 3 20 2012 1 2012-03-10 2012-03-20 2012-03-15
3 3 16 2012 3 30 2012 1 2012-03-16 2012-03-30 2012-03-23
4 3 20 2012 4 8 2012 1 2012-03-20 2012-04-08 2012-03-29
5 3 20 2012 4 9 2012 1 2012-03-20 2012-04-09 2012-03-30
6 3 20 2012 4 10 2012 1 2012-03-20 2012-04-10 2012-03-30
7 3 20 2012 4 11 2012 1 2012-03-20 2012-04-11 2012-03-31
8 4 4 2012 4 5 2012 1 2012-04-04 2012-04-05 2012-04-04
9 4 6 2012 4 6 2012 1 2012-04-06 2012-04-06 2012-04-06
10 4 6 2012 4 7 2012 1 2012-04-06 2012-04-07 2012-04-06
", header = TRUE, stringsAsFactors = FALSE)
Here is the approach that worked for me:
my.data$mean.date <- (my.data$MY.DATE1 + ((my.data$MY.DATE2 - my.data$MY.DATE1) / 2))
my.data
These approaches did not work:
my.data$mean.date <- mean(my.data$MY.DATE1, my.data$MY.DATE2)
my.data$mean.date <- mean(my.data$MY.DATE1, my.data$MY.DATE2, trim = 0)
my.data$mean.date <- mean(my.data$MY.DATE1, my.data$MY.DATE2, trim = 1)
my.data$mean.date <- mean(my.data$MY.DATE1, my.data$MY.DATE2, trim = 0.5)
my.data$mean.data <- apply(my.data, 1, function(x) {(x[9] + x[10]) / 2})
I think I am supposed to use the Ops.Date command, but have not found an example.
Thank you for any suggestions.

Keep things simple and use mean.Date in base R.
mean.Date(as.Date(c("01-01-2014", "01-07-2014"), format=c("%m-%d-%Y")))
[1] "2014-01-04"

Using the good advice of @jaysunice3401, I came up with this. If you want to keep the original columns, you can add remove = FALSE to the two lines with unite.
library(dplyr)
library(tidyr)
my.data %>%
unite(whatever1, matches("1"), sep = "-") %>%
unite(whatever2, matches("2"), sep = "-") %>%
mutate_each(funs(as.Date(., "%m-%d-%Y")), contains("whatever")) %>%
rowwise %>%
mutate(mean.date = mean.Date(c(whatever1, whatever2)))
# OBS whatever1 whatever2 STATE mean.date
#1 1 2012-03-06 2012-03-10 1 2012-03-08
#2 2 2012-03-10 2012-03-20 1 2012-03-15
#3 3 2012-03-16 2012-03-30 1 2012-03-23
#4 4 2012-03-20 2012-04-08 1 2012-03-29
#5 5 2012-03-20 2012-04-09 1 2012-03-30
#6 6 2012-03-20 2012-04-10 1 2012-03-30
#7 7 2012-03-20 2012-04-11 1 2012-03-31
#8 8 2012-04-04 2012-04-05 1 2012-04-04
#9 9 2012-04-06 2012-04-06 1 2012-04-06
#10 10 2012-04-06 2012-04-07 1 2012-04-06

Maybe something like that?
library(data.table)
setDT(my.data)[, `:=`(MY.DATE1 = as.Date(paste(DAY1 ,MONTH1, YEAR1), format = "%d %m %Y"),
MY.DATE2 = as.Date(paste(DAY2 ,MONTH2, YEAR2), format = "%d %m %Y"))][,
mean.date := MY.DATE2 - ceiling((MY.DATE2 - MY.DATE1)/2)]
my.data
# OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE MY.DATE1 MY.DATE2 mean.date
# 1: 1 3 6 2012 3 10 2012 1 2012-03-06 2012-03-10 2012-03-08
# 2: 2 3 10 2012 3 20 2012 1 2012-03-10 2012-03-20 2012-03-15
# 3: 3 3 16 2012 3 30 2012 1 2012-03-16 2012-03-30 2012-03-23
# 4: 4 3 20 2012 4 8 2012 1 2012-03-20 2012-04-08 2012-03-29
# 5: 5 3 20 2012 4 9 2012 1 2012-03-20 2012-04-09 2012-03-30
# 6: 6 3 20 2012 4 10 2012 1 2012-03-20 2012-04-10 2012-03-30
# 7: 7 3 20 2012 4 11 2012 1 2012-03-20 2012-04-11 2012-03-31
# 8: 8 4 4 2012 4 5 2012 1 2012-04-04 2012-04-05 2012-04-04
# 9: 9 4 6 2012 4 6 2012 1 2012-04-06 2012-04-06 2012-04-06
# 10: 10 4 6 2012 4 7 2012 1 2012-04-06 2012-04-07 2012-04-06
Or, if you insist on using mean.Date, here's an alternative solution:
library(data.table)
setDT(my.data)[, `:=`(MY.DATE1 = as.Date(paste(DAY1 ,MONTH1, YEAR1), format = "%d %m %Y"),
MY.DATE2 = as.Date(paste(DAY2 ,MONTH2, YEAR2), format = "%d %m %Y"))][,
mean.date := mean.Date(c(MY.DATE1, MY.DATE2)), by = OBS]

One-liner (split for readability), uses lubridate and dplyr and (of course) pipes:
> require(lubridate)
> require(dplyr)
> my.data = my.data %>%
mutate(
MY.DATE1=as.Date(mdy(paste(MONTH1,DAY1,YEAR1))),
MY.DATE2=as.Date(mdy(paste(MONTH2,DAY2,YEAR2)))) %>%
rowwise %>%
mutate(mean.data=mean.Date(c(MY.DATE1,MY.DATE2))) %>% data.frame()
> head(my.data)
OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE MY.DATE1 MY.DATE2
1 1 3 6 2012 3 10 2012 1 2012-03-06 2012-03-10
2 2 3 10 2012 3 20 2012 1 2012-03-10 2012-03-20
3 3 3 16 2012 3 30 2012 1 2012-03-16 2012-03-30
4 4 3 20 2012 4 8 2012 1 2012-03-20 2012-04-08
5 5 3 20 2012 4 9 2012 1 2012-03-20 2012-04-09
6 6 3 20 2012 4 10 2012 1 2012-03-20 2012-04-10
mean.data
1 2012-03-08
2 2012-03-15
3 2012-03-23
4 2012-03-29
5 2012-03-30
6 2012-03-30
As an afterthought, if you like pipes, you can put a pipe in your pipe so you can pipe while you pipe - rewriting the first mutate step thus:
my.data %>% mutate(
MY.DATE1 = paste(MONTH1,DAY1,YEAR1) %>% mdy %>% as.Date,
MY.DATE2 = paste(MONTH2,DAY2,YEAR2) %>% mdy %>% as.Date)

1) Create Date class columns and then it's easy. No external packages are used:
# Turn a count of days since 1970-01-01 (e.g. an averaged Date) back into a
# Date object.
asDate <- function(x) {
  epoch <- "1970-01-01"
  as.Date(x, epoch)
}
my.data2 <- transform(my.data,
date1 = as.Date(ISOdate(YEAR1, MONTH1, DAY1)),
date2 = as.Date(ISOdate(YEAR2, MONTH2, DAY2))
)
transform(my.data2, mean.date = asDate(rowMeans(cbind(date1, date2))))
If we did add a library(zoo) call then we could omit the asDate definition using as.Date in the last line instead of asDate since zoo adds a default origin to as.Date.
1a) A dplyr version would look like this (using asDate from above):
library(dplyr)
my.data %>%
mutate(
date1 = ISOdate(YEAR1, MONTH1, DAY1) %>% as.Date,
date2 = ISOdate(YEAR2, MONTH2, DAY2) %>% as.Date,
mean.date = cbind(date1, date2) %>% rowMeans %>% asDate)
2) Another way uses julian in the chron package. julian converts a month/day/year to the number of days since the Epoch. We can average the two julians and convert back to Date class:
library(zoo)
library(chron)
transform(my.data,
mean.date = as.Date( ( julian(MONTH1,DAY1,YEAR1) + julian(MONTH2,DAY2,YEAR2) )/2 )
)
We could omit library(zoo) if we used asDate from (1) in place of as.Date.
Update: Discussed use of zoo to shorten the solutions and made further reductions in solution (1).

what about :
apply(my.data[,c("MY.DATE1","MY.DATE2")],1,function(date){substr(strptime(mean(c(strptime(date[1],"%y%y-%m-%d"),strptime(date[2],"%y%y-%m-%d"))),format="%y%y-%m-%d"),1,10)})
?
(I just had to use substr because of CET and CEST that put my output as a list...)

This is a vectorized version of the answer posted by jaysunice3401. It seems fairly straight-forward, except that I had to use trial-and-error to identify the correct origin. I do not know how general origin = "1970-01-01" is or whether a different origin would have to be specified with each data set.
According to this website: http://www.ats.ucla.edu/stat/r/faq/dates.htm
When R looks at dates as integers, its origin is January 1, 1970.
Which seems to suggest that origin = "1970-01-01" is fairly general. Although, if I had dates prior to "1970-01-01" in my data set I would definitely test the code before using it.
my.data = read.table(text = "
OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE
1 3 6 2012 3 10 2012 1
2 3 10 2012 3 20 2012 1
3 3 16 2012 3 30 2012 1
4 3 20 2012 4 8 2012 1
5 3 20 2012 4 9 2012 1
6 3 20 2012 4 10 2012 1
7 3 20 2012 4 11 2012 1
8 4 4 2012 4 5 2012 1
9 4 6 2012 4 6 2012 1
10 4 6 2012 4 7 2012 1
", header = TRUE, stringsAsFactors = FALSE)
desired.result = read.table(text = "
OBS MONTH1 DAY1 YEAR1 MONTH2 DAY2 YEAR2 STATE MY.DATE1 MY.DATE2 mean.date
1 3 6 2012 3 10 2012 1 2012-03-06 2012-03-10 2012-03-08
2 3 10 2012 3 20 2012 1 2012-03-10 2012-03-20 2012-03-15
3 3 16 2012 3 30 2012 1 2012-03-16 2012-03-30 2012-03-23
4 3 20 2012 4 8 2012 1 2012-03-20 2012-04-08 2012-03-29
5 3 20 2012 4 9 2012 1 2012-03-20 2012-04-09 2012-03-30
6 3 20 2012 4 10 2012 1 2012-03-20 2012-04-10 2012-03-30
7 3 20 2012 4 11 2012 1 2012-03-20 2012-04-11 2012-03-31
8 4 4 2012 4 5 2012 1 2012-04-04 2012-04-05 2012-04-04
9 4 6 2012 4 6 2012 1 2012-04-06 2012-04-06 2012-04-06
10 4 6 2012 4 7 2012 1 2012-04-06 2012-04-07 2012-04-06
", header = TRUE, stringsAsFactors = FALSE)
my.data$MY.DATE1 <- do.call(paste, list(my.data$MONTH1,my.data$DAY1,my.data$YEAR1))
my.data$MY.DATE2 <- do.call(paste, list(my.data$MONTH2,my.data$DAY2,my.data$YEAR2))
my.data$MY.DATE1 <- as.Date(my.data$MY.DATE1, format=c("%m %d %Y"))
my.data$MY.DATE2 <- as.Date(my.data$MY.DATE2, format=c("%m %d %Y"))
my.data$mean.date2 <- as.Date( apply(my.data, 1, function(x) {
mean.Date(c(as.Date(x['MY.DATE1']), as.Date(x['MY.DATE2'])))
}) , origin = "1970-01-01")
my.data
desired.result

Related

How to calculate the number of months from the initial date for each individual

This is a representation of my dataset
ID<-c(rep(1,10),rep(2,8))
year<-c(2007,2007,2007,2008,2008,2009,2010,2009,2010,2011,
2008,2008,2009,2010,2009,2010,2011,2011)
month<-c(2,7,12,4,11,6,11,1,9,4,3,6,7,4,9,11,2,8)
mydata<-data.frame(ID,year,month)
I want to calculate for each individual the number of months from the initial date. I am using two variables: year and month.
I firstly order years and months:
mydata2<-mydata%>%group_by(ID,year)%>%arrange(year,month,.by_group=T)
Then I created the variable date, setting the day of every date to 01:
mydata2$date<-paste("01",mydata2$month,mydata2$year,sep = "-")
then I used lubridate to change this variable in date format
mydata2$date<-dmy(mydata2$date)
But after this, I really don't know what to do, in order to have such a dataset (preferably using dplyr code) below:
ID year month date dif_from_init
1 1 2007 2 01-2-2007 0
2 1 2007 7 01-7-2007 5
3 1 2007 12 01-12-2007 10
4 1 2008 4 01-4-2008 14
5 1 2008 11 01-11-2008 21
6 1 2009 1 01-1-2009 23
7 1 2009 6 01-6-2009 28
8 1 2010 9 01-9-2010 43
9 1 2010 11 01-11-2010 45
10 1 2011 4 01-4-2011 50
11 2 2008 3 01-3-2008 0
12 2 2008 6 01-6-2008 3
13 2 2009 7 01-7-2009 16
14 2 2009 9 01-9-2009 18
15 2 2010 4 01-4-2010 25
16 2 2010 11 01-11-2010 32
17 2 2011 2 01-2-2011 35
18 2 2011 8 01-8-2011 41
One way could be:
# Months elapsed since each ID's first row, computed exactly from the year and
# month columns rather than approximated from a day count (days / 365 * 12
# drifts as leap days and uneven month lengths accumulate over long spans).
mydata %>%
  group_by(ID) %>%
  mutate(date = as.Date(sprintf('%d-%d-01', year, month)),
         diff = (year - year[1]) * 12 + (month - month[1]))
# A tibble: 18 x 5
# Groups: ID [2]
ID year month date diff
<dbl> <dbl> <dbl> <date> <dbl>
1 1 2007 2 2007-02-01 0
2 1 2007 7 2007-07-01 5
3 1 2007 12 2007-12-01 10
4 1 2008 4 2008-04-01 14
5 1 2008 11 2008-11-01 21
6 1 2009 6 2009-06-01 28
7 1 2010 11 2010-11-01 45
8 1 2009 1 2009-01-01 23
9 1 2010 9 2010-09-01 43
10 1 2011 4 2011-04-01 50
11 2 2008 3 2008-03-01 0
12 2 2008 6 2008-06-01 3
13 2 2009 7 2009-07-01 16
14 2 2010 4 2010-04-01 25
15 2 2009 9 2009-09-01 18
16 2 2010 11 2010-11-01 32
17 2 2011 2 2011-02-01 35
18 2 2011 8 2011-08-01 41

How to create a new column using looping and rbind in r?

I have data similar to this. I would like to create 3 columns (date1, date2, date3) using a loop and rbind, because I am required to do it with that method only.
(all I was told is making a loop, subset the data, sort it make a new data frame then rbind it to make a new column.)
year month day id
2011 1 5 3101
2011 1 14 3101
2011 2 3 3101
2011 2 4 3101
2012 1 27 3153
2012 2 20 3153
2012 2 22 3153
2012 3 1 3153
2013 1 31 3103
2013 2 1 3103
2013 2 4 3103
2013 3 4 3103
2013 3 6 3103
The result I expect is:
date1: number of days from 2011, January 1st, start again from 1 in a new year.
date2: number of days of an id working in a year, start again from 1 in a new year.
date3: number of days open within a year, start again from 1 in a new year.
(all of the dates are in ascending order)
year month day id date1 date2 date3
2011 1 5 3101 5 1 1
2011 1 14 3101 14 2 2
2011 2 3 3101 34 3 3
2011 2 4 3101 35 4 4
2012 1 27 3153 27 1 1
2012 2 20 3153 51 2 2
2012 2 22 3153 53 3 3
2012 3 1 3153 60 4 4
2013 1 31 3103 31 1 1
2013 2 1 3103 32 2 2
2013 2 4 3103 35 3 3
2013 3 4 3103 94 4 4
2013 3 6 3103 96 5 5
Please help! Thank you.
You can do it without using unnecessary for loop and subset, here is the answer below
# Example data from the question: one row per (year, month, day, id) record.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
df <- read.table(text =" year month day id
2011 1 5 3101
2011 1 14 3101
2011 2 3 3101
2011 2 4 3101
2012 1 27 3153
2012 2 20 3153
2012 2 22 3153
2012 3 1 3153
2013 1 31 3103
2013 2 1 3103
2013 2 4 3103
2013 3 4 3103
2013 3 6 3103", header = TRUE)
library(lubridate)
df$date1 <- yday(mdy(paste0(df$month,"-",df$day,"-",df$year)))
df$date2 <- ave(df$year, df$id, FUN = seq_along)
df$date3 <- ave(df$year, df$year, FUN = seq_along)

How to lump sum the number of days of a data of several year?

I have data similar to this. I would like to lump sum the day (I'm not sure the word "lump sum" is correct or not) and create a new column "date" so that new column lump sum the number of 3 years data in ascending order.
year month day
2011 1 5
2011 2 14
2011 8 21
2012 2 24
2012 3 3
2012 4 4
2012 5 6
2013 2 14
2013 5 17
2013 6 24
I wrote this code, but the result was wrong and it is also too long. It doesn't count February correctly, since February has only 28 days. Are there any shorter ways?
# Day number of a date, counted from a start date (the start date is day 1).
#
# Args:
#   data:  vector whose first three elements are year, month and day, in that
#          order (e.g. one row handed over by apply()).
#   syear, smonth, sday: year, month and day of the start date; the default
#          start is 2011-01-01.
#
# Returns:
#   The number of days from the start date to the given date, inclusive, as an
#   unnamed numeric scalar. Negative or zero when the date precedes the start
#   date; NA when either date is not a valid calendar date.
cday <- function(data, syear = 2011, smonth = 1, sday = 1) {
  # Date arithmetic takes care of month lengths and leap years (including the
  # century rule), and lets the start month/day actually take effect.
  target <- as.Date(ISOdate(data[[1]], data[[2]], data[[3]]))
  start <- as.Date(ISOdate(syear, smonth, sday))
  # Subtracting Dates yields a difference in days; +1 makes the start day 1.
  as.numeric(target - start) + 1
}
op10$day.no <- apply(op10[,c("year","month","day")],1,cday)
I expect the result like this:
year month day date
2011 1 5 5
2011 1 14 14
2011 1 21 21
2011 1 24 24
2011 2 3 31
2011 2 4 32
2011 2 6 34
2011 2 14 42
2011 2 17 45
2011 2 24 52
Thank you for helping!!
Use Date classes. Dates and times are complicated, look for tools to do this for you rather than writing your own. Pick whichever of these you want:
df$date = with(df, as.Date(paste(year, month, day, sep = "-")))
df$julian_day = as.integer(format(df$date, "%j"))
df$days_since_2010 = as.integer(df$date - as.Date("2010-12-31"))
df
# year month day date julian_day days_since_2010
# 1 2011 1 5 2011-01-05 5 5
# 2 2011 2 14 2011-02-14 45 45
# 3 2011 8 21 2011-08-21 233 233
# 4 2012 2 24 2012-02-24 55 420
# 5 2012 3 3 2012-03-03 63 428
# 6 2012 4 4 2012-04-04 95 460
# 7 2012 5 6 2012-05-06 127 492
# 8 2013 2 14 2013-02-14 45 776
# 9 2013 5 17 2013-05-17 137 868
# 10 2013 6 24 2013-06-24 175 906
# using this data
df = read.table(text = "year month day
2011 1 5
2011 2 14
2011 8 21
2012 2 24
2012 3 3
2012 4 4
2012 5 6
2013 2 14
2013 5 17
2013 6 24", header = TRUE)
This is all using base R. If you handle dates and times frequently, you may also want to look at the lubridate package.

Inserting rows into a table

I have this table (visit_ts) -
Year Month Number_of_visits
2011 4 1
2011 6 3
2011 7 23
2011 12 32
2012 1 123
2012 11 3200
The aim is to insert rows with Number_of_visits as 0, for months which are missing in the table.
Do not insert rows for 2011 where month is 1,2,3 or 2012 where month is 12.
The following code works correctly -
# Add a row with Number_of_visits = 0 to visit_ts for every month that is
# missing between the table's first and last rows. New rows are appended with
# rbind(), so they end up after the existing rows rather than in date order.
# Candidate months and years are hard-coded.
vec_month=c(1,2,3,4,5,6,7,8,9,10,11,12)
vec_year=c(2011,2012,2013,2014,2015,2016)
# Incremented for each candidate year that is absent from visit_ts; the value
# is never read in this snippet.
i=1
# Year and month of the first and of the last row serve as the start and end
# of the range -- presumably visit_ts is expected to be in chronological order.
startyear=head(visit_ts$Year,n=1)
endyear=tail(visit_ts$Year,n=1)
x=head(visit_ts$Month,n=1)
y=tail(visit_ts$Month,n=1)
for (year in vec_year)
{
# Only years that already occur in the table are filled in.
if(year %in% visit_ts$Year)
{
a=subset(visit_ts,visit_ts$Year==year)
# Positions in vec_month whose month is absent for this year; because
# vec_month is 1..12, each position equals the month number itself.
index= which(!vec_month %in% a$Month)
for (j in index)
{
# First year: only months after the first recorded month are added.
# Last year: only months before the last recorded month are added.
# NOTE(review): if the first and last year coincide, this OR admits months
# on both sides of the range -- confirm that case cannot occur.
if((year==startyear & j>x )|(year==endyear & j<y))
visit_ts=rbind(visit_ts,c(year,j,0))
else
{
# Years strictly between the first and last year get every missing month.
if(year!=startyear & year!=endyear)
visit_ts=rbind(visit_ts,c(year,j,0))
}
}}
else
{
i=i+1
}}
As I am new to R I am looking for an alternative/better solution to the problem which would not involve hard-coding the year and month vectors. Also please feel free to point out best programming practices.
We can use expand.grid with merge or left_join
library(dplyr)
expand.grid(Year = min(df1$Year):max(df1$Year), Month = 1:12) %>%
filter(!(Year == min(df1$Year) & Month %in% 1:3|
Year == max(df1$Year) & Month == 12)) %>%
left_join(., df1) %>%
mutate(Number_of_visits=replace(Number_of_visits, is.na(Number_of_visits), 0))
# Year Month Number_of_visits
#1 2012 1 123
#2 2012 2 0
#3 2012 3 0
#4 2011 4 1
#5 2012 4 0
#6 2011 5 0
#7 2012 5 0
#8 2011 6 3
#9 2012 6 0
#10 2011 7 23
#11 2012 7 0
#12 2011 8 0
#13 2012 8 0
#14 2011 9 0
#15 2012 9 0
#16 2011 10 0
#17 2012 10 0
#18 2011 11 0
#19 2012 11 3200
#20 2011 12 32
We can make it more dynamic by grouping by 'Year', get the sequence of 'Month' from minimum to maximum in a list, unnest the column, join with the original dataset (left_join) and replace the NA values with 0.
library(tidyr)
df1 %>%
group_by(Year) %>%
summarise(Month = list(min(Month):max(Month))) %>%
unnest(Month) %>%
left_join(., df1) %>%
mutate(Number_of_visits=replace(Number_of_visits, is.na(Number_of_visits), 0))
# Year Month Number_of_visits
# <int> <int> <dbl>
#1 2011 4 1
#2 2011 5 0
#3 2011 6 3
#4 2011 7 23
#5 2011 8 0
#6 2011 9 0
#7 2011 10 0
#8 2011 11 0
#9 2011 12 32
#10 2012 1 123
#11 2012 2 0
#12 2012 3 0
#13 2012 4 0
#14 2012 5 0
#15 2012 6 0
#16 2012 7 0
#17 2012 8 0
#18 2012 9 0
#19 2012 10 0
#20 2012 11 3200
Or another option is data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'Year', we get the sequence of min to max 'Month', join with the original dataset on 'Year' and 'Month', replace the NA values to 0.
library(data.table)
setDT(df1)
df1[df1[, .(Month=min(Month):max(Month)), Year],
on = c("Year", "Month")][is.na(Number_of_visits), Number_of_visits := 0][]
# Year Month Number_of_visits
# 1: 2011 4 1
# 2: 2011 5 0
# 3: 2011 6 3
# 4: 2011 7 23
# 5: 2011 8 0
# 6: 2011 9 0
# 7: 2011 10 0
# 8: 2011 11 0
# 9: 2011 12 32
#10: 2012 1 123
#11: 2012 2 0
#12: 2012 3 0
#13: 2012 4 0
#14: 2012 5 0
#15: 2012 6 0
#16: 2012 7 0
#17: 2012 8 0
#18: 2012 9 0
#19: 2012 10 0
#20: 2012 11 3200

Correct previous year by id within R

I have data something like this:
df <- data.frame(Id=c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,9,9,9,9),Date=c("2013-04","2013-12","2013-01","2013-12","2013-11",
"2013-12","2012-04","2013-12","2012-08","2014-12","2013-08","2014-12","2013-08","2014-12","2011-01","2013-11","2013-12","2014-01","2014-04"))
To get the correct format:
df$Date <- paste0(df$Date,"-01")
I would need to obtain only years, so that each id contains 2 dates following on each other.
If I do something like this on the existing data:
require(lubridate)
df$Date <- year(as.Date(df$Date)-days(1))
I get sometimes same date for given id.
The desired output for the column Date is this:
2012 2013 2012 2013 2012 2013 2012 2013 2013 2014 2013 2014 2013 2014 2011 2013 2014
Please note that the last date for a given id is always correct, so only the preceding year has to be corrected, based on the last date. The dates have to be in a format that can be converted to years only, as shown.
EDIT Here is the case:
Id Date
1 2013-11-01
1 2013-12-01
1 2014-01-01
1 2014-04-01
Now I'm getting this: 2012,2013,2013,2013
I would need: 2012,2013,2013,2014
This is how I would solve this using data.table package (though it looks over complicated to me)
library(data.table)
setDT(df)[, year := year(Date)][,
year := if(.N == 2) (year[2] - 1):year[2] else year,
Id][]
# Id Date year indx
# 1: 1 2013-04-01 2012 2
# 2: 1 2013-12-01 2013 2
# 3: 2 2013-01-01 2012 2
# 4: 2 2013-12-01 2013 2
# 5: 3 2013-11-01 2012 2
# 6: 3 2013-12-01 2013 2
# 7: 4 2012-04-01 2012 2
# 8: 4 2013-12-01 2013 2
# 9: 5 2012-08-01 2013 2
# 10: 5 2014-12-01 2014 2
# 11: 6 2013-08-01 2013 2
# 12: 6 2014-12-01 2014 2
# 13: 7 2013-08-01 2013 2
# 14: 7 2014-12-01 2014 2
# 15: 8 2011-01-01 2011 1
Or all in one step (thanks to @Arun for providing this):
setDT(df)[, year := {tmp = year(Date);
if (.N == 2L) (tmp[2]-1L):tmp[2] else tmp},
Id]
Edit:
Per OPs new data, we can modify the code by adding additional index
setDT(df)[, indx := if(.N > 2) rep(seq_len(.N/2), each = 2) + 1L else .N, Id]
df[, year := {tmp = year(Date); if (.N > 1L) (tmp[2] - 1L):tmp[2] else tmp},
list(Id, indx)][]
# Id Date indx year
# 1: 1 2013-04-01 2 2012
# 2: 1 2013-12-01 2 2013
# 3: 2 2013-01-01 2 2012
# 4: 2 2013-12-01 2 2013
# 5: 3 2013-11-01 2 2012
# 6: 3 2013-12-01 2 2013
# 7: 4 2012-04-01 2 2012
# 8: 4 2013-12-01 2 2013
# 9: 5 2012-08-01 2 2013
# 10: 5 2014-12-01 2 2014
# 11: 6 2013-08-01 2 2013
# 12: 6 2014-12-01 2 2014
# 13: 7 2013-08-01 2 2013
# 14: 7 2014-12-01 2 2014
# 15: 8 2011-01-01 1 2011
# 16: 9 2013-11-01 2 2012
# 17: 9 2013-12-01 2 2013
# 18: 9 2014-01-01 3 2013
# 19: 9 2014-04-01 3 2014
Or another possible solution provided by @akrun
setDT(df)[, `:=`(year = year(Date), indx = .N, indx2 = as.numeric(gl(.N,2, .N))), Id]
df[indx > 1, year:=(year[2]-1):year[2], list(Id, indx2)][]
Using dplyr, with an approach similar to @David Arenburg's
library(dplyr)
df %>%
group_by(Id) %>%
mutate(year=as.numeric(sub('-.*', '', Date)),
year=replace(year, n()>1, c(year[2]-1, year[2])))
# Id Date year
#1 1 2013-04 2012
#2 1 2013-12 2013
#3 2 2013-01 2012
#4 2 2013-12 2013
#5 3 2013-11 2012
#6 3 2013-12 2013
#7 4 2012-04 2012
#8 4 2013-12 2013
#9 5 2012-08 2013
#10 5 2014-12 2014
#11 6 2013-08 2013
#12 6 2014-12 2014
#13 7 2013-08 2013
#14 7 2014-12 2014
#15 8 2011-01 2011
Or using base R
with(df, ave(as.numeric(sub('-.*', '', Date)), Id,
FUN=function(x) if(length(x)>1)(x[2]-1):x[2] else x))
#[1] 2012 2013 2012 2013 2012 2013 2012 2013 2013 2014 2013 2014 2013 2014 2011
Update
You can try
df$indx <- with(df, ave(Id, Id, FUN=function(x) (seq_along(x)-1)%/%2+1))
with(df, ave(as.numeric(sub('-.*', '', Date)), Id, indx,
FUN=function(x) if(length(x)>1)(x[2]-1):x[2] else x))
#[1] 2012 2013 2012 2013 2012 2013 2012 2013 2013 2014 2013 2014 2013 2014 2011
#[16] 2012 2013 2013 2014
Or
df %>%
group_by(Id) %>%
mutate(year=as.numeric(sub('-.*', '', Date))) %>%
group_by(indx=cumsum(rep(c(TRUE,FALSE), length.out=n())), add=TRUE) %>%
mutate(year=replace(year, n()>1, c(year[2]-1, year[2])))
Here's a dplyr solution. You can remove the intermediate fields last_year and year2, but I left them here for clarity:
library(stringr)
library(dplyr)
df %>%
group_by(Id) %>%
mutate(
last_year = last(as.integer(str_sub(Date, 1, 4))),
year2 = row_number() - n(),
year = last_year + year2
)

Resources