Rolling multiple regressions with panel data [duplicate] - r

This question already has an answer here:
Rolling multiple regression panel data
(1 answer)
Closed 2 years ago.
I am trying to do a rolling multiple regression of a dataset grouped by stock. A sample of the dataset can be seen below. It goes from 1991 to 2019 and contains information on stocks like returns, etc. What I intend to do is regress dependant variable ExcessReturn on EPU_Paper for each stock using a 36 month rolling regression window. I also want the stocks to have at least 18 monthly return observations in the 36 month rolling windows. ISIN is the identifier of each stock in this case. I also want to include SIZE, INVEST, BM and OP as controls in the regression. The dataset does not contain any NAs except for the beta_monthly column which is all NAs. .
This is the code I have tried to run. I am able to get the for-loop working, but do not get the desired result. Optimally I would want the coefficient of EPU_Paper to be appended in df_Final in the correct row.
I am open for both completely new solutions and variants of my current attempt.
# Create date sequence
date <- seq(as.Date("1991-01-01"),as.Date("2019-12-31"), by = "month")
## Model
v <- 36 # No. of observations in rolling regression
w <- 18 # observations of stocks requred in period
df_Final$beta_monthly <- NA
for (i in 1:(length(date)-v)) {
beta.tab <- df_Final %>% filter(Date >= date[i] & Date < date[i+v]) %>%
group_by(ISIN) %>% filter(n() >= w) %>%
do(ols.model = lm(formula = ExcessReturn ~ EPU_Paper + SIZE, .)) %>%
mutate(beta_monthly = coefficients(ols.model)[2],
Date = date[v+1])
}
This is the dataframe df_Final. It contains all the data I need.
structure(list(Year = c(2002, 2004, 2011, 2011, 2012, 1993, 2005,
2019, 2005, 1998), Month = c(5, 4, 12, 11, 4, 11, 7, 2, 12, 12
), ISIN = c("NO0003172207", "NO0003072803", "NO0010001118", "NO0010096985",
"NO0010052350", "NO0004031303", "NO0003733800", "NO0003049405",
"NO0003028904", "NO0004684408"), SIZE = c(1143750000, 894618192,
257727844.92, 293346266180.2, 104014912.25, 1312826651.5, 51164845865,
535492777.6, 1.2465e+10, 8815671800), BM = c(2.69336652499494e-06,
6.25913195949328e-07, 3.0680673824874e-07, 9.99841307356348e-07,
3.99901247813628e-06, 3.6136784151303e-06, 6.27009692475242e-07,
6.397720392755e-07, 1.985559566787e-07, 9.2518383241951e-07),
OP = c(-0.0259646808923766, 0.197313839816668, 0.136649432305334,
0.594948150836374, -0.0018535993529254, -0.0801364023870418,
0.130539826349566, 0.0244477246423, 0.620295983086681, 0.103857566765579
), INVEST = c(0.129154816408376, 0.0321275661230328, -0.092547902189399,
0.142434794968375, -0.121033439243494, -0.00124744840099796,
-0.240237999927217, 0.0376008757633188, 0.060294968189705,
0.112664489390554), MonthlyReturn = c(-0.039797852179406,
-0.066030013642565, 0.019230769230769, 0.049271412097704,
-0.12516823687752, -0.02219755826859, 0.057851239669421,
-0.043636363636364, 0.05232436939754, 0.32743529766845),
RiskFreeRate = c(0.00558, 0.00163, 0.00209, 0.00251, 0.00163,
0.00467, 0.00181, 0.00086, 0.00208, 0.00726), ShareTurnover = c(69750L,
5250L, 369135L, 183793926L, 54869L, 2879656L, 7957362L, 367551L,
2478662L, 2245928L), MarketExcessReturn = c(-2.7155, -3.0781,
1.0322, -0.3552, -0.9447, -4.9307, 6.0359, 3.8371, 6.932,
-0.7896), ExcessReturn = c(-4.5377852179406, -6.7660013642565,
1.7140769230769, 4.6761412097704, -12.679823687752, -2.686755826859,
5.6041239669421, -4.4496363636364, 5.024436939754, 32.017529766845
), TradeDate = structure(c(11838, 12538, 15338, 15308, 15460,
8734, 12993, 17955, 13147, 10590), class = "Date"), GR_SIZE = structure(c(3L,
2L, 1L, 3L, 1L, 2L, 3L, 1L, 3L, 3L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(1L, 2L, 2L,
3L, 1L, 1L, 2L, 1L, 3L, 1L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(3L, 2L, 1L, 3L, 3L, 3L, 2L, 2L, 1L, 3L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(9L, 5L, 1L,
9L, 3L, 6L, 8L, 2L, 7L, 9L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(7L,
5L, 2L, 9L, 1L, 4L, 8L, 1L, 9L, 7L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(9L, 5L, 1L, 9L, 1L, 4L, 7L, 2L,
8L, 8L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(11808,
12509, 15309, 15279, 15431, 8705, 12965, 17928, 13118, 10561
), class = "Date"), EPU_Paper = c(53.995111032374, 68.0510031873012,
150.261825109363, 124.78265498286, 47.2994312059608, 164.273390295025,
27.168222382902, 181.297305839429, 29.292072793154, 139.423199892468
), beta_monthly = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), row.names = c(NA, -10L), class = "data.frame")

You could try a split-apply-combine approach.
.date <- sort(unique(df2$date))
i <- 1; v <- 24; w <- 3
res <- do.call(rbind, lapply(1:v, function(i) {
ds <- df2$date %in% .date[0:(v - 1) + i]
S <- split(df2[ds, ], df2[ds, "date"])
dat <- do.call(rbind, unname(S[sapply(S, nrow) >= w]))
fit <- lm(y ~ x1 + x2, dat)
rs <- dat$date %in% .date[i]
beta_monthly <- unname(fit$coef[2])
if (any(rs)) cbind(dat[rs, ], beta_monthly) else NULL
}))
head(res, 10)
# cl date y x1 x2 beta_monthly
# 6 A 2011-02-01 0.1947135 38.20878 62846231450 -0.01949786
# 7 B 2011-02-01 5.7908162 130.09371 73446134000 -0.01949786
# 8 C 2011-02-01 -1.6335241 63.67381 93917412861 -0.01949786
# 9 D 2011-02-01 -4.8414052 151.70718 76852791458 -0.01949786
# 10 E 2011-02-01 4.1640901 123.10342 16714132588 -0.01949786
# 11 A 2011-03-01 -2.0569659 104.46436 28101485893 -0.01935559
# 12 B 2011-03-01 9.2205063 24.58415 42584043997 -0.01935559
# 13 C 2011-03-01 -0.1572310 65.94721 83745620495 -0.01935559
# 14 D 2011-03-01 5.2782394 25.69336 15235322119 -0.01935559
# 15 E 2011-03-01 3.6096263 163.65887 66618792459 -0.01935559
Data:
set.seed(42)
df2 <- expand.grid(cl=LETTERS[1:5],
date=seq(as.Date("2011-01-01"), as.Date("2019-12-31"), by="month"))
df2 <- df2[-sample(1:nrow(df2), nrow(df2)*.1), ]
n <- nrow(df2)
df2 <- transform(df2,
y=rnorm(n, 2, 5),
x1=runif(n, 20, 180),
x2=runif(n, 1e8, 1e11))

Related

Rolling multiple regression panel data

I am trying to perform a rolling regression for time t over the last 36 months for companies with observations for 18 of these months, but I am not able to make the function work. I only want the coefficient for var1. X, y, z are control variables.
Here is a sample of the data and the code I am trying to run.
structure(list(Year = c(2018, 2014, 2008, 2004, 2005, 2002, 2010,
2008, 2013, 1998), Month = c(6, 12, 4, 6, 4, 8, 12, 11, 3, 3),
ISIN = c("NO0004895103", "NO0010571680", "NO0010010473",
"NO0003079709", "NO0003117202", "NO0003073801", "NO0010379266",
"NO0004913609", "NO0003072407", "NO0003679102"), SIZE = c(3637822300,
1.155e+10, 1925631048.5, 519688494.5, 790931587.08, 127597142.4,
2892152759.3, 554108244.7, 73484767, 375946560), BM = c(7.47698486077218e-07,
7.92202995594714e-07, 1.9900582030005e-07, 3.59385930497676e-07,
6.54209144403066e-07, 6.72227955087816e-07, 1.43273711611857e-06,
5.15824596832591e-07, 1.23252873661613e-06, 2.79726876987137e-07
), OP = c(0.145781283498513, 0.0433749257278669, 0.130477272126991,
0.211294037715838, 0.127523990874984, -0.0185266025249259,
-0.0822835066759631, 0.128844560922287, -0.0155446815227611,
0.416024745410667), INVEST = c(1.58608106515088, 0.00307252384303782,
0.33925195328069, -0.0984424241606425, 0.333479950150282,
0.13302885162465, -0.035773912311751, 0.408569401011161,
-0.206778240645154, -0.177187857233583), MonthlyReturn = c(-0.019509251810137,
0.095308641975309, 0.14864864864865, -0.034364261168385,
-0.064512147964095, -0.14080459770115, 0.13580244980708,
-0.1890214797136, -0.045226130653266, 0.39682527166336),
RiskFreeRate = c(0.00064, 0.00117, 0.00532, 0.00163, 0.00166,
0.00594, 0.00206, 0.00499, 0.00153, 0.00332), ShareTurnover = c(31649L,
907793L, 5318465L, 1831390L, 8956640L, 302000L, 7333090L,
516000L, 75553L, 2836550L), MarketExcessReturn = c(1.0813,
2.7638, 11.5701, 6.4672, -3.3108, -1.8598, 10.5445, -9.0969,
-0.0793, 10.8305), ExcessReturn = c(-2.0149251810137, 9.4138641975309,
14.332864864865, -3.5994261168385, -6.6172147964095, -14.674459770115,
13.374244980708, -19.40114797136, -4.6756130653266, 39.350527166336
), TradeDate = structure(c(17711, 16434, 13999, 12599, 12902,
11929, 14973, 14211, 15791, 10316), class = "Date"), GR_SIZE = structure(c(2L,
3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(2L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 2L, 1L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(5L, 8L, 4L,
4L, 5L, 2L, 9L, 2L, 2L, 1L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(5L,
7L, 5L, 5L, 4L, 1L, 7L, 2L, 1L, 3L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(6L, 7L, 6L, 4L, 6L, 3L, 8L, 3L,
1L, 1L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(17683,
16405, 13970, 12570, 12874, 11900, 14944, 14184, 15765, 10286
), class = "Date"), EPU_Paper = c(197.436482473082, 181.040599101032,
58.1799902251583, 62.5412044042803, 57.4799138334861, 52.420100605017,
61.1549363311955, 92.1818760618723, 76.2564063202547, 118.390708950295
)), row.names = c(NA, -10L), class = "data.frame")
#merge dataframes
df_Final <- merge(df_Final, EPU_Paper, by = c("Year", "Month"))
# Create empty dataframe
date <- seq(as.Date("1991-01-31"),as.Date("2019-12-30"), by = "month")
Beta <- data.frame(ISIN = character(), Beta = numeric(), date =
as.Date(character()), stringsAsFactors = FALSE)
## Model
Returns_length <- 36 # No. of observations in rolling regression
Returns_req <- 18 # observations of stocks requred in period
for (i in 1:(length(date)-Returns_length)) {
beta.tab <- df_Final %>% filter(TradeDate >= date[i] & TradeDate < date[i+Returns_length]) %>%
group_by(ISIN) %>% filter(n() >= Returns_req) %>%
do(ols.model = lm(data = df_Final, formula = ExcessReturn ~ var1 + x + y + z)) %>%
mutate(Beta = coef(ols.model)[2]) %>% select("ISIN", "Beta") %>%
mutate(TradeDate = date[Returns_length+i])
Beta <- rbind(Beta, beta.tab)}
There seems to be multiple undefined inputs, df_Final and EU_paper, in the question but assuming that the data is in time order within group and that we want to apply lm to w rows from the current row to the row w-1 prior then using some generated data in the Note at the end we define a coef.x function to calculate the desired coefficient and a rolling function roll which uses it (here with width 2) and then apply it by group g.
library(zoo)
w <- 2
nr <- nrow(DF)
coef.x <- function(ix) coef(lm(y ~ x + x1, DF, subset = ix))[["x"]]
roll <- function(ix) rollapplyr(ix, w, coef.x, fill = NA)
transform(DF, coef.x = ave(1:nr, g, FUN = roll))
giving:
y x x1 g coef.x
1 1 -0.56047564655 0.4609162060 1 NA
2 2 -0.23017748948 -1.2650612346 1 3.0275676040
3 3 1.55870831415 -0.6868528519 1 0.5590071753
4 4 0.07050839142 -0.4456619701 2 NA
5 5 0.12928773516 1.2240817974 2 17.0127792594
6 6 1.71506498688 0.3598138271 2 0.6306055904
This could also be written using dplyr where w, roll and coef.x are from above:
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
DF %>%
group_by(g) %>%
mutate(coef.x = roll(cur_group_rows())) %>%
ungroup
Note
# test data
set.seed(123)
DF <- data.frame(y = 1:6, x = rnorm(6), x1 = rnorm(6),
g = c(1, 1, 1, 2, 2, 2))

How to calculate mean base on quantile for one of the column

Let say I want to find out the mean for other column group by the another column quantile.
For my table, I have several columns, now I got the 10% quantile for SalePrice column, there are some other numeric columns in my table(there are also some other factor variables in this table to).
And I want to calculate these variables' mean group by SalePrice column.
Then after that, I want to save these result in to a data frame.
I want to use loop to construct this data frame, I have some basic idea about the loop, but don't know how to finish it. Or add the column in the data frame in the loop
for (i in 1:lenth(tr)){
if(tr$i == numeric){
Result <- data.frame()
}
}
here is what I got for the SalePrice 10% quantile
> quantile(tr$SalePrice, c(seq(0, 1,0.1)),na.rm = TRUE, names = TRUE)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
34900 106450 124000 135500 147000 163000 179360 198740 230000 278000 755000
And my data look like this:
> dput(head(tr, 5))
structure(list(
MSSubClass = structure(c(6L, 1L, 6L, 7L, 6L), .Label = c("20", "30", "40", "45", "50", "60", "70", "75", "80", "85", "90", "120", "160", "180", "190"), class = "factor"),
MSZoning = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("C (all)", "FV", "RH", "RL", "RM"), class = "factor"),
LotFrontage = c(65, 80, 68, 60, 84),
LotArea = c(8450, 9600, 11250, 9550, 14260),
Street = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "Pave"), class = "factor"),
Alley = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "NA", "Pave"), class = "factor"),
LotShape = structure(c(4L, 4L, 1L, 1L, 1L), .Label = c("IR1", "IR2", "IR3", "Reg"), class = "factor"),
LandContour = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("Bnk", "HLS", "Low", "Lvl"), class = "factor"),
Utilities = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("AllPub", "NoSeWa"), class = "factor"),
LotConfig = structure(c(5L, 3L, 5L, 1L, 3L), .Label = c("Corner", "CulDSac", "FR2", "FR3", "Inside"), class = "factor"),
LandSlope = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("Gtl", "Mod", "Sev"), class = "factor"),
Neighborhood = structure(c(6L, 25L, 6L, 7L, 14L), .Label = c("Blmngtn", "Blueste", "BrDale", "BrkSide", "ClearCr", "CollgCr", "Crawfor", "Edwards", "Gilbert", "IDOTRR", "MeadowV", "Mitchel", "NAmes", "NoRidge", "NPkVill", "NridgHt", "NWAmes", "OldTown", "Sawyer", "SawyerW", "Somerst", "StoneBr", "SWISU", "Timber", "Veenker"), class = "factor"),
Condition1 = structure(c(3L, 2L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNe", "RRNn"), class = "factor"),
Condition2 = structure(c(3L, 3L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNn"), class = "factor"),
BldgType = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1Fam", "2fmCon", "Duplex", "Twnhs","TwnhsE"), class = "factor"),
SalePrice = c(208500, 181500, 223500, 140000, 250000)), row.names = c(NA, 5L), class = "data.frame")
I only attach some variables here, not all of them.
You did not provide any data so I was left making a few assumptions. Assuming that your data is called df perhaps you can use dput(head(df, 100)) and copy and paste the output here?
If not does this work for you?
d1 <- runif(1000)
d2 <- runif(1000)
d3 <- runif(1000)
df <- data.frame(SalePrice = d1,
data2 = d2,
data3 = d3)
library(dplyr)
df %>%
mutate(Mydeciles = ntile(data2, 10)) %>%
group_by(Mydeciles) %>%
summarise(mean_sales_price = mean(SalePrice),
mean_data2 = mean(data2),
mean_data3 = mean(data3))
Output:
# A tibble: 10 x 4
Mydeciles mean_sales_price mean_data2 mean_data3
<int> <dbl> <dbl> <dbl>
1 1 0.497 0.0450 0.450
2 2 0.520 0.144 0.522
3 3 0.506 0.250 0.487
4 4 0.472 0.360 0.457
5 5 0.510 0.469 0.553
6 6 0.555 0.564 0.503
7 7 0.510 0.652 0.540
8 8 0.461 0.751 0.482
9 9 0.465 0.844 0.485
10 10 0.530 0.952 0.534
Solution 2:
df %>%
mutate(Mydeciles = ntile(SalePrice, 2)) %>%
group_by(Mydeciles) %>%
summarise_if(is.numeric, funs(mean))
Gives:
# A tibble: 2 x 4
Mydeciles LotFrontage LotArea SalePrice
<int> <dbl> <dbl> <dbl>
1 1 68.3 9200 176667.
2 2 76 12755 236750
A data.table answer:
library(data.table)
setDT(df)
df[, .(mean_price = mean(salesPrice), mean_r1 = mean(data1), mean_r2 = mean(data2)), by = .(qtl = quantile(salesPrice, seq(0, 1, 0.1)))]

Replacing loop in dplyr R

So I am trying to program function with dplyr withou loop and here is something I do not know how to do
Say we have tv stations (x,y,z) and months (2,3). If I group by this say we get
this output also with summarised numeric value
TV months value
x 2 52
y 2 87
z 2 65
x 3 180
y 3 36
z 3 99
This is for evaluated Brand.
Then I will have many Brands I need to filter to get only those which get value >=0.8*value of evaluated brand & <=1.2*value of evaluated brand
So for example from this down I would only want to filter first two, and this should be done for all months&TV combinations
brand TV MONTH value
sdg x 2 60
sdfg x 2 55
shs x 2 120
sdg x 2 11
sdga x 2 5000
As #akrun said, you need to use a combination of merging and subsetting. Here's a base R solution.
m <- merge(df, data, by.x=c("TV", "MONTH"), by.y=c("TV", "months"))
m[m$value.x >= m$value.y*0.8 & m$value.x <= m$value.y*1.2,][,-5]
# TV MONTH brand value.x
#1 x 2 sdg 60
#2 x 2 sdfg 55
Data
data <- structure(list(TV = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("x",
"y", "z"), class = "factor"), months = c(2L, 2L, 2L, 3L, 3L,
3L), value = c(52L, 87L, 65L, 180L, 36L, 99L)), .Names = c("TV",
"months", "value"), class = "data.frame", row.names = c(NA, -6L
))
df <- structure(list(brand = structure(c(2L, 1L, 4L, 2L, 3L), .Label = c("sdfg",
"sdg", "sdga", "shs"), class = "factor"), TV = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "x", class = "factor"), MONTH = c(2L,
2L, 2L, 2L, 2L), value = c(60L, 55L, 120L, 11L, 5000L)), .Names = c("brand",
"TV", "MONTH", "value"), class = "data.frame", row.names = c(NA,
-5L))

Preserve NA in output of ifelse statement using paste

I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurneces of number in the four columns and reports those numbers in M_NEW. Now, I would like to include the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far, I am unable to paste 0 and 21 to the observations, without also pasting them the NA values. The desired output is include in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
f <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
df$M_NEW2 <- apply(df[, 1:4], 1, f))
You can add another if else statement - rather inelegant but gets you there.
f2 <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
res <- ifelse(res %in% c("0", ""), "0,21", res)
if(res %in% c("NA","0,21")) res else paste(res, "0,21", sep=",")
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"

Compare dates across multiple rows and replace values if condition is met in R

I have a set of dates and times for several individuals (ID) that correspond to our primary outcome measure (Y) and a covariate (X1).
My objective is to replace missing X1 values for each of the Y rows if the X1 measurement was recorded within a +/- 24 hour period from the date/time that the Y variable was measured. To make this easier to visualize (and load into R), here is how the data are currently arranged:
structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), TIME = structure(1:15, .Label = c("01/01/2013 12:01",
"01/03/2013 08:49", "01/03/2013 20:52", "02/01/2013 05:00", "02/03/2013 05:30",
"02/03/2013 21:14", "02/05/2013 05:15", "02/12/2013 05:03", "02/15/2013 04:16",
"02/16/2013 04:12", "02/16/2013 21:02", "03/01/2010 17:58", "03/02/2010 00:10",
"03/03/2010 10:45", "03/04/2010 09:00"), class = "factor"), Y = structure(c(1L,
5L, 7L, 1L, 1L, 2L, 1L, 1L, 1L, 4L, 3L, 1L, 8L, 1L, 6L), .Label = c(".",
"22", "35", "4", "5", "6", "8", "9"), class = "factor"), X1 = structure(c(2L,
1L, 1L, 7L, 7L, 1L, 4L, 4L, 3L, 1L, 1L, 6L, 1L, 5L, 1L), .Label = c(".",
"0.1", "0.2", "0.4", "0.6", "0.9", "1.0"), class = "factor")), .Names = c("ID",
"TIME", "Y", "X1"), class = "data.frame", row.names = c(NA, -15L))
To simplify the desired output, I would like to only display the rows with non-missing Y values, such that the end product would look like this:
ID TIME Y X1
1 1 01/03/2013 08:49 5 .
2 1 01/03/2013 20:52 8 .
3 2 02/03/2013 21:14 22 .
4 2 02/16/2013 04:12 4 0.2
5 2 02/16/2013 21:02 35 .
6 3 03/02/2010 00:10 9 0.9
7 3 03/04/2010 09:00 6 0.6
Is it possible to (1) iterate across multiple rows and evaluate the absolute value of 24 hours to get the difference between the X1 and Y measurements and (2) to replace the missing values of X1 with those that are within the +/- 24 hour window?
Any thoughts on how to go about this would be greatly appreciated!
if you convert your data into xts then you can use xts's easy subsetting feature to get what you want.
PS: following code will work if you have exactly 1 value of X1 within 24 hour period of Y measurement.
require(xts)
xx <- xts(DF[, c(1, 4, 5)], as.POSIXct(paste0(DF$Date, " ", DF$TIME), format = "%m/%d/%Y %H:%M"))
sapply(index(xx[!is.na(xx$Y)]), FUN = function(tt) {
startTime <- tt - 24 * 60 * 60
endTime <- tt + 24 * 60 * 60
y <- xx[paste(startTime, endTime, sep = "/")]
if (nrow(y[!is.na(y$X1), "X1"]) != 0) {
return(as.vector(y[!is.na(y$X1), "X1"]))
} else {
return(NA)
}
})
## [1] 0.9 0.6 NA NA 1.0 0.2 NA
xx[!is.na(xx$Y), "X1"] <- sapply(index(xx[!is.na(xx$Y)]), FUN = function(tt) {
startTime <- tt - 24 * 60 * 60
endTime <- tt + 24 * 60 * 60
y <- xx[paste(startTime, endTime, sep = "/")]
if (nrow(y[!is.na(y$X1), "X1"]) != 0) {
return(as.vector(y[!is.na(y$X1), "X1"]))
} else {
return(NA)
}
})
xx[!is.na(xx$Y), "X1"]
## X1
## 2010-03-02 00:10:00 0.9
## 2010-03-04 09:00:00 0.6
## 2013-01-03 08:49:00 NA
## 2013-01-03 20:52:00 NA
## 2013-02-03 21:14:00 1.0
## 2013-02-16 04:12:00 0.2
## 2013-02-16 21:02:00 NA

Resources