Condition for function and loop - r

I have a data frame simplified as follow:
head(dendro)
X DateTime ID diameter dendro ring DOY month mday year Rain_mm_Tot Through_Tot temp
1 1 2012-06-21 13:45:00 r1_1 5482 1 1 173 6 22 113 NA NA NA
2 2 2012-06-21 13:45:00 r2_3 NA 3 2 173 6 22 113 NA NA NA
3 3 2012-06-21 13:45:00 r1_2 5534 2 1 173 6 22 113 NA NA NA
4 4 2012-06-21 13:45:00 r2_4 NA 4 2 173 6 22 113 NA NA NA
5 5 2012-06-21 13:45:00 r1_3 5606 3 1 173 6 22 113 NA NA NA
6 6 2012-06-21 13:45:00 r2_5 NA 5 2 173 6 22 113 NA NA NA
The dataframe is first splitted by "ID", so it's a list of IDs
After that I apply a function, that includes a loop, and the result is a new column "Diameter2", with the result I want from the function, that works OK:
dendro_sp <- split(dendro, dendro$ID)
library(changepoint)
dendro_sp <- lapply(dendro_sp, function(x){
x <- subset(x, !is.na(diameter))
cpfit <- cpt.mean(x$diameter, method="BinSeg")
x$diameter2 <- x$diameter
cpts <- cpfit#cpts
means <- param.est(cpfit)$mean
meanZero <- means[1]
for(i in 1:(length(cpts)-1)){
x$diameter2[(cpts[i]+1):cpts[i+1]] <- x$diameter2[(cpts[i]+1):cpts[i+1]] + (meanZero - means[i+1])
}
return(x)
})
dendro2 <- do.call(rbind, dendro_sp)
rownames(dendro2) <- NULL
My problem is that I want it to apply it conditionally, for example to r1_1 and r1_3, and grab the "diameter" value for r3 in the new column "diameter2", instead of applying the function for the rest of IDs:
ifelse(diameter$ID==c("r1_1","r1_3"), apply_the_function_to_r11_and_r13_to_calculate_diameter2, otherwise_write_diameter_value_in_diameter2_column)
Remember that the dataframe "dendro" is splitted by ID, I don't know if that is important to define the condition for several IDs.
Thanks

I am not sure if I understand the problem correctly. I try to answer.
I assume you want to apply a function to the "diameter" field of the "diameter" data.frame, conditioning on the "ID" field and retunr the result in the corresponding diameter2 field. I don't know how the function works, so forgive me if this will not work.
Selected fields
diameter$diameter2[diameter$ID=="r1_1"|diameter$ID=="r1_3"]<- yourfun(diameter$diameter[diameter$ID=="r1_1"|diameter$ID=="r1_3"]
Unselected fields
diameter$diameter2[diameter$ID!="r1_1" & diameter$ID=="r1_3"]<- diameter$diameter[diameter$ID=="r1_1"|diameter$ID=="r1_3"]

Related

R: data.table using a for loop to wrangle multiple columns

I am currently working in R to build a for loop which will add the year to 7 columns that contain partial dates (dd/mm). I have been attempting to run the following for-loop and have not been successful. What am I doing wrong?
Here's a sample of what my data set looks like (The actual data set includes columns HomDate - HomDate_7 but I only included the first few as I know you'll get the point...)
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1 2012-04-25 18/04 19/04 20/04 NA
2 2012-01-04 28/12 29/12 30/12 1
3 2012-01-05 31/12 01/01 01/02 1
4 2012-06-13 06/06 07/06 08/06 NA
5 2012-02-12 05/02 06/02 07/02 NA
Here's the code I've been trying to use:
hom_date <- list("HomDate", "HomDate_2", "HomDate_3", "HomDate_4", "HomDate_5", "HomDate_6",
"HomDate_7")
set_dates <- function(x){
home_morbid[,x:=as.character(x)]
home_morbid[(substr(x, 4, 5)==12) & (year_flag==1), x:=paste(x, "/2011", sep="")]
home_morbid[(substr(x, 4, 5)==01) & (year_flag==1), x:=paste(x, "/2012", sep="")]
home_morbid[is.na(year_flag), x:=paste(x, "/", substr(DateVisit, 1, 4), sep="")]
}
for(i in 1:length(hom_date)){
x <- hom_date[i]
home_morbid_2<-set_dates(x)
}
I'm not sure what happens to those with an NA flag. Here is an approach:
to_replace<-grep("^Hom",names(df))
df[,(to_replace):=lapply(.SD, function(x) ifelse(is.na(year_flag),x,
ifelse(substr(x, 4, 5)==12,
paste0(x,"/","2011"),
paste0(x,"/","2012")))),
.SDcols=HomDate:HomeDate_3][]
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1: 1 2012-04-25 18/04 19/04 20/04 NA
2: 2 2012-01-04 28/12/2011 29/12/2011 30/12/2011 1
3: 3 2012-01-05 31/12/2011 01/01/2012 01/02/2012 1
4: 4 2012-06-13 06/06 07/06 08/06 NA
5: 5 2012-02-12 05/02 06/02 07/02 NA
To replace NA flagged years with the year from DateVisit:
library(lubridate)
to_replace<-grep("^Hom",names(df))
df[,(to_replace):=lapply(.SD, function(x) ifelse(is.na(year_flag),
paste0(x,"/",year(ymd(DateVisit))),
ifelse(substr(x, 4, 5)==12,
paste0(x,"/","2011"),
paste0(x,"/","2012")))),
.SDcols=HomDate:HomeDate_3][]
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1: 1 2012-04-25 18/04/2012 19/04/2012 20/04/2012 NA
2: 2 2012-01-04 28/12/2011 29/12/2011 30/12/2011 1
3: 3 2012-01-05 31/12/2011 01/01/2012 01/02/2012 1
4: 4 2012-06-13 06/06/2012 07/06/2012 08/06/2012 NA
5: 5 2012-02-12 05/02/2012 06/02/2012 07/02/2012 NA

Efficient way for insertion of multiple rows at given indices & with repetitions

I have a data frame (DATA) with > 2 million rows (observations at different time points) and another data frame (INSERTION) which gives info about missing observations. The latter object contains 2 columns: 1st column with row indices after which empty (NA) rows should be inserted into DATA, and 2nd column with the number of empty rows that should be inserted at that position.
Below is a minimum working example:
DATA <- data.frame(datetime=strptime(as.character(c(201301011700, 201301011701, 201301011703, 201301011704, 201301011705, 201301011708, 201301011710, 201301011711, 201301011715, 201301011716, 201301011718, 201301011719, 201301011721, 201301011722, 201301011723, 201301011724, 201301011725, 201301011726, 201301011727, 201301011729, 201301011730, 201301011731, 201301011732, 201301011733, 201301011734, 201301011735, 201301011736, 201301011737, 201301011738, 201301011739)), format="%Y%m%d%H%M"), var1=rnorm(30), var2=rnorm(30), var3=rnorm(30))
INSERTION <- data.frame(index=c(2, 5, 6, 8, 10, 12, 19), repetition=c(1, 2, 1, 3, 1, 1, 1))
Now I'm looking for an efficient (and thus fast) way to insert the n empty rows at given row indices of the original file. How can I additionally complement the correct datetimes for these empty rows (add 1 minute for every new row; however, every weekend and bank holidays there are some regular gaps which are not contained in INSERTION!)?
Any help is appreciated!
Looking at the pattern in INSERTION and matching it with DATA most probably you are trying to fill the missing minutes in datetime of DATA. You can create a dataframe with every minute sequence from min to max value of datetime from DATA and then merge
merge(data.frame(datetime = seq(min(DATA$datetime), max(DATA$datetime),
by = "1 min")),DATA, all.x = TRUE)
# datetime var1 var2 var3
#1 2013-01-01 17:00:00 -1.063326 0.11925 -0.788622
#2 2013-01-01 17:01:00 1.263185 0.24369 -0.502199
#3 2013-01-01 17:02:00 NA NA NA
#4 2013-01-01 17:03:00 -0.349650 1.23248 1.496061
#5 2013-01-01 17:04:00 -0.865513 -0.51606 -1.137304
#6 2013-01-01 17:05:00 -0.236280 -0.99251 -0.179052
#7 2013-01-01 17:06:00 NA NA NA
#8 2013-01-01 17:07:00 NA NA NA
#9 2013-01-01 17:08:00 -0.197176 1.67570 1.902362
#10 2013-01-01 17:09:00 NA NA NA
#...
#...
Or using similar logic with tidyr::complete
tidyr::complete(DATA, datetime = seq(min(datetime), max(datetime), by = "1 min"))
If performance is a factor on a large data frame, this approach avoids joins:
# Generate new data.frame containing missing datetimes
tmp <- data.frame(datetime = DATA$datetime[with(INSERTION, rep(index, repetition))] + sequence(INSERTION$repetition)*60)
# Create variables filled with NA to match main data.frame
tmp[setdiff(names(DATA), names(tmp))] <- NA
# Bind and sort
new_df <- rbind(DATA, tmp)
new_df <- new_df[order(new_df$datetime),]
head(new_df, 15)
datetime var1 var2 var3
1 2013-01-01 17:00:00 0.98789253 0.68364933 0.70526985
2 2013-01-01 17:01:00 -0.68307496 0.02947599 0.90731512
31 2013-01-01 17:02:00 NA NA NA
3 2013-01-01 17:03:00 -0.60189915 -1.00153188 0.06165694
4 2013-01-01 17:04:00 -0.87329313 -1.81532302 -2.04930719
5 2013-01-01 17:05:00 -0.58713154 -0.42313098 0.37402224
32 2013-01-01 17:06:00 NA NA NA
33 2013-01-01 17:07:00 NA NA NA
6 2013-01-01 17:08:00 2.41350911 -0.13691754 1.57618578
34 2013-01-01 17:09:00 NA NA NA
7 2013-01-01 17:10:00 -0.38961552 0.83838954 1.18283382
8 2013-01-01 17:11:00 0.02290672 -2.10825367 0.87441448
35 2013-01-01 17:12:00 NA NA NA
36 2013-01-01 17:13:00 NA NA NA
37 2013-01-01 17:14:00 NA NA NA

Compute the variance of a moving window in a dataframe

Hey I want to compute the variance of column. My dataframe is sorted by the as.Date() format. Here you can see a snippet of it:
Date USA ARG BRA CHL COL MEX PER
2012-04-01 1 0.2271531 0.4970299 0.001956865 0.0005341452 0.07341428 NA
2012-05-01 1 0.2218906 0.4675895 0.001911405 0.0005273186 0.07026524 NA
2012-06-01 1 0.2054076 0.4531661 0.001891352 0.0005292575 0.06897811 NA
2012-07-01 1 0.2033470 0.4596730 0.001950686 0.0005312600 0.07269619 NA
2012-08-01 1 0.1993882 0.4596039 0.001980537 0.0005271514 0.07268987 NA
2012-09-01 1 0.1967152 0.4593390 0.002011212 0.0005305549 0.07418838 NA
2012-10-01 1 0.1972730 0.4597584 0.002002203 0.0005284380 0.07428555 NA
2012-11-01 1 0.1937618 0.4519187 0.001979805 0.0005238670 0.07329656 NA
2012-12-01 1 0.1854037 0.4500448 0.001993309 0.0005323795 0.07453949 NA
2013-01-01 1 0.1866007 0.4607501 0.002013112 0.0005412329 0.07551040 NA
2013-02-01 1 0.1855950 0.4712956 0.002011067 0.0005359562 0.07554661 NA
The dataframe ranges from january 2004 up to dezember 2018. But I do not want to compute the compute the variance of the whole columnes.
I want to compute the variance of one year (or 12 values) which is moving month by month.
I do not really know how to start. I can imagine using the zoo package and the rollapply. But here the problem is (I think) that R computes uses the values around it and not past it?
I also found this question: R: create a data frame out of a rolling window, so my idea was to get rid of the date column. It is easy to build the matrix, but now I do not understand how to apply the variance function to my data...
Is there a smart way to compute it all in one and also using the information of the date? If not, I also appreciate any other solution from you!
We can use rollappyr to perform the rolling computations. Since there are only 11 rows in the data in the question we can't take 12 month averages but using 3 month averages instead we can illustrate it. Remove fill = NA if you want to omit the NA rows or replace it with partial = TRUE if you want variances using fewer than 12 near the beginning. If you want a data frame result use fortify.zoo(zv) .
library(zoo)
z <- read.zoo(DF)
zv <- rollapplyr(z, 3, var, fill = NA)
zv
giving this zoo object:
USA ARG BRA CHL COL MEX PER
2012-04-01 NA NA NA NA NA NA NA
2012-05-01 NA NA NA NA NA NA NA
2012-06-01 0 1.287083e-04 4.998008e-04 1.126781e-09 1.237524e-11 5.208793e-06 NA
2012-07-01 0 1.033001e-04 5.217420e-05 9.109406e-10 3.883996e-12 3.565057e-06 NA
2012-08-01 0 9.358558e-06 1.396497e-05 2.060928e-09 4.221043e-12 4.600220e-06 NA
2012-09-01 0 1.113297e-05 3.108380e-08 9.159058e-10 4.826929e-12 7.453672e-07 NA
2012-10-01 0 1.988357e-06 4.498977e-08 2.485889e-10 2.953403e-12 8.001948e-07 NA
2012-11-01 0 3.560373e-06 1.944961e-05 2.615387e-10 1.168389e-11 2.971477e-07 NA
2012-12-01 0 3.717777e-05 2.655440e-05 1.271886e-10 1.814869e-11 4.312436e-07 NA
2013-01-01 0 2.042867e-05 3.268476e-05 2.806455e-10 7.540331e-11 1.231438e-06 NA
2013-02-01 0 4.134729e-07 1.129013e-04 1.186146e-10 1.983651e-11 3.263780e-07 NA
We can plot the log of the variances like this:
library(ggplot2)
autoplot(log(zv), facet = NULL) + geom_point() + ylab("log(var(.))")
Note
We assume that the starting point is the data frame generated reproducibly below:
Lines <- "Date USA ARG BRA CHL COL MEX PER
2012-04-01 1 0.2271531 0.4970299 0.001956865 0.0005341452 0.07341428 NA
2012-05-01 1 0.2218906 0.4675895 0.001911405 0.0005273186 0.07026524 NA
2012-06-01 1 0.2054076 0.4531661 0.001891352 0.0005292575 0.06897811 NA
2012-07-01 1 0.2033470 0.4596730 0.001950686 0.0005312600 0.07269619 NA
2012-08-01 1 0.1993882 0.4596039 0.001980537 0.0005271514 0.07268987 NA
2012-09-01 1 0.1967152 0.4593390 0.002011212 0.0005305549 0.07418838 NA
2012-10-01 1 0.1972730 0.4597584 0.002002203 0.0005284380 0.07428555 NA
2012-11-01 1 0.1937618 0.4519187 0.001979805 0.0005238670 0.07329656 NA
2012-12-01 1 0.1854037 0.4500448 0.001993309 0.0005323795 0.07453949 NA
2013-01-01 1 0.1866007 0.4607501 0.002013112 0.0005412329 0.07551040 NA
2013-02-01 1 0.1855950 0.4712956 0.002011067 0.0005359562 0.07554661 NA"
DF <- read.table(text = Lines, header = TRUE)

Create multiple lagged variables using a zoo object

I need to create 'n' number of variables with lags of the original variable from 1 to 'n' on the fly. Something like so :-
OrigVar
DatePeriod, value
2/01/2018,6
3/01/2018,4
4/01/2018,0
5/01/2018,2
6/01/2018,4
7/01/2018,1
8/01/2018,6
9/01/2018,2
10/01/2018,7
Lagged 1 variable
2/01/2018,NA
3/01/2018,6
4/01/2018,4
5/01/2018,0
6/01/2018,2
7/01/2018,4
8/01/2018,1
9/01/2018,6
10/01/2018,2
11/01/2018,7
Lagged 2 variable
2/01/2018,NA
3/01/2018,NA
4/01/2018,6
5/01/2018,4
6/01/2018,0
7/01/2018,2
8/01/2018,4
9/01/2018,1
10/01/2018,6
11/01/2018,2
12/01/2018,7
Lagged 3 variable
2/01/2018,NA
3/01/2018,NA
4/01/2018,NA
5/01/2018,6
6/01/2018,4
7/01/2018,0
8/01/2018,2
9/01/2018,4
10/01/2018,1
11/01/2018,6
12/01/2018,2
13/01/2018,7
and so on
I tried using the shift function and various other functions. Wtih most of them that worked for me, the lagged variables finished at the last date of the original variable. In other words, the length of the lagged variable is the same as that of the original variable.
What I am looking for the new lagged variable to be shifted down by the 'kth' lag and the data series to be extended by 'k' elements including the index.
The reason I need this is to be able to compute the value of the dependent variable using the regression coeffficients and the corresponding lagged variable value beyond the in-sample period
y1 <- Lag(ciresL1_usage_1601_1612, shift = 1)
head(y1)
2016-01-02 2016-01-03 2016-01-04 2016-01-05 2016-01-06 2016-01-07
NA -5171.051 -6079.887 -3687.227 -3229.453 -2110.368
y2 <- Lag(ciresL1_usage_1601_1612, shift = 2)
head(y2)
2016-01-02 2016-01-03 2016-01-04 2016-01-05 2016-01-06 2016-01-07
NA NA -5171.051 -6079.887 -3687.227 -3229.453
tail(y2)
2016-12-26 2016-12-27 2016-12-28 2016-12-29 2016-12-30 2016-12-31
-2316.039 -2671.185 -4100.793 -2043.020 -1147.798 1111.674
tail(ciresL1_usage_1601_1612)
2016-12-26 2016-12-27 2016-12-28 2016-12-29 2016-12-30 2016-12-31
-4100.793 -2043.020 -1147.798 1111.674 3498.729 2438.739
Is there a way to do it relatively easily. I know I can do it by looping and adding 'k' rows in a new vector and reloading the data in to this new vector appropriately shifting the data values in the new vector but I don't want to use that method unless I have to. I am quietly confident that there has to be a better way to do it than this!
By the way, the object is a zoo object with daily dates as the index.
Best regards
Deepak
Convert the input zoo object to zooreg and then use lag.zooreg like this:
library(zoo)
# test input
z <- zoo(1:10, as.Date("2008-01-01") + 0:9)
zr <- as.zooreg(z)
lag(zr, -(0:3))
giving:
lag0 lag-1 lag-2 lag-3
2008-01-01 1 NA NA NA
2008-01-02 2 1 NA NA
2008-01-03 3 2 1 NA
2008-01-04 4 3 2 1
2008-01-05 5 4 3 2
2008-01-06 6 5 4 3
2008-01-07 7 6 5 4
2008-01-08 8 7 6 5
2008-01-09 9 8 7 6
2008-01-10 10 9 8 7
2008-01-11 NA 10 9 8
2008-01-12 NA NA 10 9
2008-01-13 NA NA NA 10

Combination of merge and aggregate in R

I have created the following 2 dummy datasets as follows:
id<-c(8,8,50,87,141,161,192,216,257,282)
date<-c("2011-03-03","2011-12-12","2010-08-18","2009-04-28","2010-11-29","2012-04-02","2013-01-08","2007-01-22","2009-06-03","2009-12-02")
data<-data.frame(cbind(id,date))
id<-c(3,8,11,11,11,11,11,11,19,19,19,19,19,50,50,50,50,50,87,87,87,87,87,87,282,282,282,282,282,282,282,282,282,282,288,288,288,288,288,288,288,288,288,288,288,288,288)
date<-c("2010-11-04","2011-02-25","2009-07-26","2009-07-27","2009-08-09","2009-08-10","2009-08-30","2004-01-20","2006-02-13","2006-07-18","2007-04-20","2008-05-12","2008-05-29","2009-06-10","2010-08-17","2010-08-15","2011-05-13","2011-06-08","2007-08-09","2008-01-19","2008-02-19","2009-04-28","2009-05-16","2009-05-20","2005-05-14","2007-04-15","2007-07-25","2007-10-12","2007-10-23","2007-10-27","2007-11-20","2009-11-28","2012-08-16","2012-08-16","2008-11-17","2009-10-23","2009-10-27","2009-10-27","2009-10-27","2009-10-27","2009-10-28","2010-06-15","2010-06-17","2010-06-23","2010-07-27","2010-07-27","2010-07-28")
ns<-data.frame(cbind(id,date))
Note that only some of the id in data are included in ns and viceversa.
For each of the values in data$id I am trying to find if there is a ns$date that is 14 days before the data$date where data$id==ns$id and report the number of days difference.
The output I need is a vector/column ("received") of the same number of rows of data, with a TRUE/FALSE whre ns$date[ns$id==data$id] is less than 14 days before the respective data$date and a similar vector with the actual number of days where "received" is TRUE. I hope this makes sense now.
This is where I got so far
# convert dates
data$date <- ymd(data$date)
ns$date <- ymd(ns$date)
# left join datasets
tmp <- merge(data, ns, by="id", all.x=TRUE)
#NOTE THAT this will automatically rename data$date as date.x and tmp$date as date.y
# create variable to say when there is a date difference less than 14 days
tmp$received <- with(tmp, difftime(date.x, date.y, units="days")<14&difftime(date.x, date.y, units="days")>0)
#create a variable that reports the days difference
tmp$dif<-ifelse(tmp$received==TRUE,difftime(tmp$date.x,tmp$date.y, units="days"),NA)
This link Find if date is within 14 days if id matches between datasets in R provides an idea but the result does not include the number of days difference in tmp$dif.
In the result table I need only the lowest difference for each data$id for those cases were tmp$received was TRUE.
Hope this makes more sense now? If not please let me know what needs further clarification.
M
PS: as requested I added what the desired output should look like (same number of rows of data = 10 - no rows for data in ns not in data). Should have thought this might help earlier.
id date received dif
1 8 2011-03-03 TRUE 6
2 8 2011-12-12 FALSE NA
3 50 2010-08-18 TRUE 1
4 87 2009-04-28 TRUE 0
5 141 2010-11-29 NA NA
6 161 2012-04-02 NA NA
7 192 2013-01-08 NA NA
8 216 2007-01-22 NA NA
9 257 2009-06-03 NA NA
10 282 2009-12-02 TRUE 4
Here's a data.table approach
Converting to data.table objects
library(data.table)
setkey(setDT(data), id)
setkey(setDT(ns), id)
Merging
ns <- ns[data]
Converting to Date class
ns[, c("date", "date.1") := lapply(.SD, as.Date), .SDcols = c("date", "date.1")]
Computing days differences and TRUE/FALSE
ns[, `:=`(timediff = date.1 - date,
Logical = (date.1 - date) < 14)]
Taking only the rows we are interested in
res <- ns[is.na(timediff) | timediff >= 0, list(received = any(Logical), dif = timediff[Logical]), by = list(id, date.1)]
Sorting by id and date
res[, id := as.numeric(as.character(id))]
setkey(res, id, date.1)
Subsetting by minimum dstance
res[, list(diff = min(dif)), by = list(id, date.1, received)]
# id date.1 received diff
# 1: 8 2011-03-03 TRUE 6 days
# 2: 8 2011-12-12 FALSE NA days
# 3: 50 2010-08-18 TRUE 1 days
# 4: 87 2009-04-28 TRUE 0 days
# 5: 141 2010-11-29 NA NA days
# 6: 161 2012-04-02 NA NA days
# 7: 192 2013-01-08 NA NA days
# 8: 216 2007-01-22 NA NA days
# 9: 257 2009-06-03 NA NA days
# 10: 282 2009-12-02 TRUE 4 days

Resources