I am trying to perform a rolling regression for time t over the last 36 months for companies with observations for 18 of these months, but I am not able to make the function work. I only want the coefficient for var1. X, y, z are control variables.
Here is a sample of the data and the code I am trying to run.
structure(list(Year = c(2018, 2014, 2008, 2004, 2005, 2002, 2010,
2008, 2013, 1998), Month = c(6, 12, 4, 6, 4, 8, 12, 11, 3, 3),
ISIN = c("NO0004895103", "NO0010571680", "NO0010010473",
"NO0003079709", "NO0003117202", "NO0003073801", "NO0010379266",
"NO0004913609", "NO0003072407", "NO0003679102"), SIZE = c(3637822300,
1.155e+10, 1925631048.5, 519688494.5, 790931587.08, 127597142.4,
2892152759.3, 554108244.7, 73484767, 375946560), BM = c(7.47698486077218e-07,
7.92202995594714e-07, 1.9900582030005e-07, 3.59385930497676e-07,
6.54209144403066e-07, 6.72227955087816e-07, 1.43273711611857e-06,
5.15824596832591e-07, 1.23252873661613e-06, 2.79726876987137e-07
), OP = c(0.145781283498513, 0.0433749257278669, 0.130477272126991,
0.211294037715838, 0.127523990874984, -0.0185266025249259,
-0.0822835066759631, 0.128844560922287, -0.0155446815227611,
0.416024745410667), INVEST = c(1.58608106515088, 0.00307252384303782,
0.33925195328069, -0.0984424241606425, 0.333479950150282,
0.13302885162465, -0.035773912311751, 0.408569401011161,
-0.206778240645154, -0.177187857233583), MonthlyReturn = c(-0.019509251810137,
0.095308641975309, 0.14864864864865, -0.034364261168385,
-0.064512147964095, -0.14080459770115, 0.13580244980708,
-0.1890214797136, -0.045226130653266, 0.39682527166336),
RiskFreeRate = c(0.00064, 0.00117, 0.00532, 0.00163, 0.00166,
0.00594, 0.00206, 0.00499, 0.00153, 0.00332), ShareTurnover = c(31649L,
907793L, 5318465L, 1831390L, 8956640L, 302000L, 7333090L,
516000L, 75553L, 2836550L), MarketExcessReturn = c(1.0813,
2.7638, 11.5701, 6.4672, -3.3108, -1.8598, 10.5445, -9.0969,
-0.0793, 10.8305), ExcessReturn = c(-2.0149251810137, 9.4138641975309,
14.332864864865, -3.5994261168385, -6.6172147964095, -14.674459770115,
13.374244980708, -19.40114797136, -4.6756130653266, 39.350527166336
), TradeDate = structure(c(17711, 16434, 13999, 12599, 12902,
11929, 14973, 14211, 15791, 10316), class = "Date"), GR_SIZE = structure(c(2L,
3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(2L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 2L, 1L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(5L, 8L, 4L,
4L, 5L, 2L, 9L, 2L, 2L, 1L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(5L,
7L, 5L, 5L, 4L, 1L, 7L, 2L, 1L, 3L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(6L, 7L, 6L, 4L, 6L, 3L, 8L, 3L,
1L, 1L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(17683,
16405, 13970, 12570, 12874, 11900, 14944, 14184, 15765, 10286
), class = "Date"), EPU_Paper = c(197.436482473082, 181.040599101032,
58.1799902251583, 62.5412044042803, 57.4799138334861, 52.420100605017,
61.1549363311955, 92.1818760618723, 76.2564063202547, 118.390708950295
)), row.names = c(NA, -10L), class = "data.frame")
#merge dataframes
df_Final <- merge(df_Final, EPU_Paper, by = c("Year", "Month"))
# Create empty dataframe
date <- seq(as.Date("1991-01-31"),as.Date("2019-12-30"), by = "month")
Beta <- data.frame(ISIN = character(), Beta = numeric(), date =
as.Date(character()), stringsAsFactors = FALSE)
## Model
Returns_length <- 36 # No. of observations in rolling regression
Returns_req <- 18 # observations of stocks requred in period
for (i in 1:(length(date)-Returns_length)) {
beta.tab <- df_Final %>% filter(TradeDate >= date[i] & TradeDate < date[i+Returns_length]) %>%
group_by(ISIN) %>% filter(n() >= Returns_req) %>%
do(ols.model = lm(data = df_Final, formula = ExcessReturn ~ var1 + x + y + z)) %>%
mutate(Beta = coef(ols.model)[2]) %>% select("ISIN", "Beta") %>%
mutate(TradeDate = date[Returns_length+i])
Beta <- rbind(Beta, beta.tab)}
There seems to be multiple undefined inputs, df_Final and EU_paper, in the question but assuming that the data is in time order within group and that we want to apply lm to w rows from the current row to the row w-1 prior then using some generated data in the Note at the end we define a coef.x function to calculate the desired coefficient and a rolling function roll which uses it (here with width 2) and then apply it by group g.
library(zoo)
w <- 2
nr <- nrow(DF)
coef.x <- function(ix) coef(lm(y ~ x + x1, DF, subset = ix))[["x"]]
roll <- function(ix) rollapplyr(ix, w, coef.x, fill = NA)
transform(DF, coef.x = ave(1:nr, g, FUN = roll))
giving:
y x x1 g coef.x
1 1 -0.56047564655 0.4609162060 1 NA
2 2 -0.23017748948 -1.2650612346 1 3.0275676040
3 3 1.55870831415 -0.6868528519 1 0.5590071753
4 4 0.07050839142 -0.4456619701 2 NA
5 5 0.12928773516 1.2240817974 2 17.0127792594
6 6 1.71506498688 0.3598138271 2 0.6306055904
This could also be written using dplyr where w, roll and coef.x are from above:
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
DF %>%
group_by(g) %>%
mutate(coef.x = roll(cur_group_rows())) %>%
ungroup
Note
# test data
set.seed(123)
DF <- data.frame(y = 1:6, x = rnorm(6), x1 = rnorm(6),
g = c(1, 1, 1, 2, 2, 2))
Related
This question already has an answer here:
Rolling multiple regression panel data
(1 answer)
Closed 2 years ago.
I am trying to do a rolling multiple regression of a dataset grouped by stock. A sample of the dataset can be seen below. It goes from 1991 to 2019 and contains information on stocks like returns, etc. What I intend to do is regress dependant variable ExcessReturn on EPU_Paper for each stock using a 36 month rolling regression window. I also want the stocks to have at least 18 monthly return observations in the 36 month rolling windows. ISIN is the identifier of each stock in this case. I also want to include SIZE, INVEST, BM and OP as controls in the regression. The dataset does not contain any NAs except for the beta_monthly column which is all NAs. .
This is the code I have tried to run. I am able to get the for-loop working, but do not get the desired result. Optimally I would want the coefficient of EPU_Paper to be appended in df_Final in the correct row.
I am open for both completely new solutions and variants of my current attempt.
# Create date sequence
date <- seq(as.Date("1991-01-01"),as.Date("2019-12-31"), by = "month")
## Model
v <- 36 # No. of observations in rolling regression
w <- 18 # observations of stocks requred in period
df_Final$beta_monthly <- NA
for (i in 1:(length(date)-v)) {
beta.tab <- df_Final %>% filter(Date >= date[i] & Date < date[i+v]) %>%
group_by(ISIN) %>% filter(n() >= w) %>%
do(ols.model = lm(formula = ExcessReturn ~ EPU_Paper + SIZE, .)) %>%
mutate(beta_monthly = coefficients(ols.model)[2],
Date = date[v+1])
}
This is the dataframe df_Final. It contains all the data I need.
structure(list(Year = c(2002, 2004, 2011, 2011, 2012, 1993, 2005,
2019, 2005, 1998), Month = c(5, 4, 12, 11, 4, 11, 7, 2, 12, 12
), ISIN = c("NO0003172207", "NO0003072803", "NO0010001118", "NO0010096985",
"NO0010052350", "NO0004031303", "NO0003733800", "NO0003049405",
"NO0003028904", "NO0004684408"), SIZE = c(1143750000, 894618192,
257727844.92, 293346266180.2, 104014912.25, 1312826651.5, 51164845865,
535492777.6, 1.2465e+10, 8815671800), BM = c(2.69336652499494e-06,
6.25913195949328e-07, 3.0680673824874e-07, 9.99841307356348e-07,
3.99901247813628e-06, 3.6136784151303e-06, 6.27009692475242e-07,
6.397720392755e-07, 1.985559566787e-07, 9.2518383241951e-07),
OP = c(-0.0259646808923766, 0.197313839816668, 0.136649432305334,
0.594948150836374, -0.0018535993529254, -0.0801364023870418,
0.130539826349566, 0.0244477246423, 0.620295983086681, 0.103857566765579
), INVEST = c(0.129154816408376, 0.0321275661230328, -0.092547902189399,
0.142434794968375, -0.121033439243494, -0.00124744840099796,
-0.240237999927217, 0.0376008757633188, 0.060294968189705,
0.112664489390554), MonthlyReturn = c(-0.039797852179406,
-0.066030013642565, 0.019230769230769, 0.049271412097704,
-0.12516823687752, -0.02219755826859, 0.057851239669421,
-0.043636363636364, 0.05232436939754, 0.32743529766845),
RiskFreeRate = c(0.00558, 0.00163, 0.00209, 0.00251, 0.00163,
0.00467, 0.00181, 0.00086, 0.00208, 0.00726), ShareTurnover = c(69750L,
5250L, 369135L, 183793926L, 54869L, 2879656L, 7957362L, 367551L,
2478662L, 2245928L), MarketExcessReturn = c(-2.7155, -3.0781,
1.0322, -0.3552, -0.9447, -4.9307, 6.0359, 3.8371, 6.932,
-0.7896), ExcessReturn = c(-4.5377852179406, -6.7660013642565,
1.7140769230769, 4.6761412097704, -12.679823687752, -2.686755826859,
5.6041239669421, -4.4496363636364, 5.024436939754, 32.017529766845
), TradeDate = structure(c(11838, 12538, 15338, 15308, 15460,
8734, 12993, 17955, 13147, 10590), class = "Date"), GR_SIZE = structure(c(3L,
2L, 1L, 3L, 1L, 2L, 3L, 1L, 3L, 3L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(1L, 2L, 2L,
3L, 1L, 1L, 2L, 1L, 3L, 1L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(3L, 2L, 1L, 3L, 3L, 3L, 2L, 2L, 1L, 3L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(9L, 5L, 1L,
9L, 3L, 6L, 8L, 2L, 7L, 9L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(7L,
5L, 2L, 9L, 1L, 4L, 8L, 1L, 9L, 7L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(9L, 5L, 1L, 9L, 1L, 4L, 7L, 2L,
8L, 8L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(11808,
12509, 15309, 15279, 15431, 8705, 12965, 17928, 13118, 10561
), class = "Date"), EPU_Paper = c(53.995111032374, 68.0510031873012,
150.261825109363, 124.78265498286, 47.2994312059608, 164.273390295025,
27.168222382902, 181.297305839429, 29.292072793154, 139.423199892468
), beta_monthly = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), row.names = c(NA, -10L), class = "data.frame")
You could try a split-apply-combine approach.
.date <- sort(unique(df2$date))
i <- 1; v <- 24; w <- 3
res <- do.call(rbind, lapply(1:v, function(i) {
ds <- df2$date %in% .date[0:(v - 1) + i]
S <- split(df2[ds, ], df2[ds, "date"])
dat <- do.call(rbind, unname(S[sapply(S, nrow) >= w]))
fit <- lm(y ~ x1 + x2, dat)
rs <- dat$date %in% .date[i]
beta_monthly <- unname(fit$coef[2])
if (any(rs)) cbind(dat[rs, ], beta_monthly) else NULL
}))
head(res, 10)
# cl date y x1 x2 beta_monthly
# 6 A 2011-02-01 0.1947135 38.20878 62846231450 -0.01949786
# 7 B 2011-02-01 5.7908162 130.09371 73446134000 -0.01949786
# 8 C 2011-02-01 -1.6335241 63.67381 93917412861 -0.01949786
# 9 D 2011-02-01 -4.8414052 151.70718 76852791458 -0.01949786
# 10 E 2011-02-01 4.1640901 123.10342 16714132588 -0.01949786
# 11 A 2011-03-01 -2.0569659 104.46436 28101485893 -0.01935559
# 12 B 2011-03-01 9.2205063 24.58415 42584043997 -0.01935559
# 13 C 2011-03-01 -0.1572310 65.94721 83745620495 -0.01935559
# 14 D 2011-03-01 5.2782394 25.69336 15235322119 -0.01935559
# 15 E 2011-03-01 3.6096263 163.65887 66618792459 -0.01935559
Data:
set.seed(42)
df2 <- expand.grid(cl=LETTERS[1:5],
date=seq(as.Date("2011-01-01"), as.Date("2019-12-31"), by="month"))
df2 <- df2[-sample(1:nrow(df2), nrow(df2)*.1), ]
n <- nrow(df2)
df2 <- transform(df2,
y=rnorm(n, 2, 5),
x1=runif(n, 20, 180),
x2=runif(n, 1e8, 1e11))
Let say I want to find out the mean for other column group by the another column quantile.
For my table, I have several columns, now I got the 10% quantile for SalePrice column, there are some other numeric columns in my table(there are also some other factor variables in this table to).
And I want to calculate these variables' mean group by SalePrice column.
Then after that, I want to save these result in to a data frame.
I want to use loop to construct this data frame, I have some basic idea about the loop, but don't know how to finish it. Or add the column in the data frame in the loop
for (i in 1:lenth(tr)){
if(tr$i == numeric){
Result <- data.frame()
}
}
here is what I got for the SalePrice 10% quantile
> quantile(tr$SalePrice, c(seq(0, 1,0.1)),na.rm = TRUE, names = TRUE)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
34900 106450 124000 135500 147000 163000 179360 198740 230000 278000 755000
And my data look like this:
> dput(head(tr, 5))
structure(list(
MSSubClass = structure(c(6L, 1L, 6L, 7L, 6L), .Label = c("20", "30", "40", "45", "50", "60", "70", "75", "80", "85", "90", "120", "160", "180", "190"), class = "factor"),
MSZoning = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("C (all)", "FV", "RH", "RL", "RM"), class = "factor"),
LotFrontage = c(65, 80, 68, 60, 84),
LotArea = c(8450, 9600, 11250, 9550, 14260),
Street = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "Pave"), class = "factor"),
Alley = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("Grvl", "NA", "Pave"), class = "factor"),
LotShape = structure(c(4L, 4L, 1L, 1L, 1L), .Label = c("IR1", "IR2", "IR3", "Reg"), class = "factor"),
LandContour = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("Bnk", "HLS", "Low", "Lvl"), class = "factor"),
Utilities = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("AllPub", "NoSeWa"), class = "factor"),
LotConfig = structure(c(5L, 3L, 5L, 1L, 3L), .Label = c("Corner", "CulDSac", "FR2", "FR3", "Inside"), class = "factor"),
LandSlope = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("Gtl", "Mod", "Sev"), class = "factor"),
Neighborhood = structure(c(6L, 25L, 6L, 7L, 14L), .Label = c("Blmngtn", "Blueste", "BrDale", "BrkSide", "ClearCr", "CollgCr", "Crawfor", "Edwards", "Gilbert", "IDOTRR", "MeadowV", "Mitchel", "NAmes", "NoRidge", "NPkVill", "NridgHt", "NWAmes", "OldTown", "Sawyer", "SawyerW", "Somerst", "StoneBr", "SWISU", "Timber", "Veenker"), class = "factor"),
Condition1 = structure(c(3L, 2L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNe", "RRNn"), class = "factor"),
Condition2 = structure(c(3L, 3L, 3L, 3L, 3L), .Label = c("Artery", "Feedr", "Norm", "PosA", "PosN", "RRAe", "RRAn", "RRNn"), class = "factor"),
BldgType = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1Fam", "2fmCon", "Duplex", "Twnhs","TwnhsE"), class = "factor"),
SalePrice = c(208500, 181500, 223500, 140000, 250000)), row.names = c(NA, 5L), class = "data.frame")
I only attach some variables here, not all of them.
You did not provide any data so I was left making a few assumptions. Assuming that your data is called df perhaps you can use dput(head(df, 100)) and copy and paste the output here?
If not does this work for you?
d1 <- runif(1000)
d2 <- runif(1000)
d3 <- runif(1000)
df <- data.frame(SalePrice = d1,
data2 = d2,
data3 = d3)
library(dplyr)
df %>%
mutate(Mydeciles = ntile(data2, 10)) %>%
group_by(Mydeciles) %>%
summarise(mean_sales_price = mean(SalePrice),
mean_data2 = mean(data2),
mean_data3 = mean(data3))
Output:
# A tibble: 10 x 4
Mydeciles mean_sales_price mean_data2 mean_data3
<int> <dbl> <dbl> <dbl>
1 1 0.497 0.0450 0.450
2 2 0.520 0.144 0.522
3 3 0.506 0.250 0.487
4 4 0.472 0.360 0.457
5 5 0.510 0.469 0.553
6 6 0.555 0.564 0.503
7 7 0.510 0.652 0.540
8 8 0.461 0.751 0.482
9 9 0.465 0.844 0.485
10 10 0.530 0.952 0.534
Solution 2:
df %>%
mutate(Mydeciles = ntile(SalePrice, 2)) %>%
group_by(Mydeciles) %>%
summarise_if(is.numeric, funs(mean))
Gives:
# A tibble: 2 x 4
Mydeciles LotFrontage LotArea SalePrice
<int> <dbl> <dbl> <dbl>
1 1 68.3 9200 176667.
2 2 76 12755 236750
A data.table answer:
library(data.table)
setDT(df)
df[, .(mean_price = mean(salesPrice), mean_r1 = mean(data1), mean_r2 = mean(data2)), by = .(qtl = quantile(salesPrice, seq(0, 1, 0.1)))]
I have a table like following:
ID type group
A3EP 1 M
A3MA 2 M
A459 3 M
A3I1 5 M
A9D2 7 M
A3M9 4 M
A7XP 6 M
A4ZP 8 M
I want to make a color bar like following: Red color represents "group" and below that each color represents "type" and below that I want the "ID" names.
Can anyone please tell me how to do this? Thank you.
mypalette <- rainbow(8)
barplot(rep(0.5,8), width=1, space=0, col=mypalette, axes=F)
text(df$type-.5, .2, df$ID, srt=90)
rect(0, .4, 8, .5, col="red")
text(4, .45, "M")
Input data:
df <- structure(list(ID = structure(c(1L, 4L, 5L, 2L, 8L, 3L, 7L, 6L
), .Label = c("A3EP", "A3I1", "A3M9", "A3MA", "A459", "A4ZP",
"A7XP", "A9D2"), class = "factor"), type = 1:8, group = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), class = "factor", .Label = "M")), .Names =
c("ID",
"type", "group"), row.names = c(NA, -8L), class = "data.frame")
I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"
I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurneces of number in the four columns and reports those numbers in M_NEW. Now, I would like to include the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far, I am unable to paste 0 and 21 to the observations, without also pasting them the NA values. The desired output is include in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
f <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
df$M_NEW2 <- apply(df[, 1:4], 1, f))
You can add another if else statement - rather inelegant but gets you there.
f2 <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
res <- ifelse(res %in% c("0", ""), "0,21", res)
if(res %in% c("NA","0,21")) res else paste(res, "0,21", sep=",")
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"