I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurneces of number in the four columns and reports those numbers in M_NEW. Now, I would like to include the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far, I am unable to paste 0 and 21 to the observations, without also pasting them the NA values. The desired output is include in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
f <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
df$M_NEW2 <- apply(df[, 1:4], 1, f))
You can add another if else statement - rather inelegant but gets you there.
f2 <- function(x, n=3) {
tab <- table(strsplit(paste(x, collapse=","), ","))
res <- paste(names(tab[which(tab >= n)]), collapse=",")
res <- ifelse(res %in% c("0", ""), "0,21", res)
if(res %in% c("NA","0,21")) res else paste(res, "0,21", sep=",")
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"
Related
I am doing some tricky data cleaning. I have one dataset (first extract below) that is the output from the digitization of pdf tables. Unfortunately columns were not digitized properly. Sometimes, what shall be in column X3 ended up concatenated in column X2 with the last word of column X2...
What I am trying to do is to bring back what should be in column X3 to X3 and collapse the two rows in X2 together.
I have attached an extract of the output I am trying to create.
Any idea about how can I do this?
Thank you!
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L, NA), X2 = structure(c(7L, 1L, 8L,
1L, 1L, 9L, 1L, 1L, 6L, 3L, 1L, 5L, 2L, 1L, 10L, 4L), .Label = c("",
"A - BWHITE", "ASMITH", "B - DBURNEY", "Garden Harris", "House M. Aba",
"House M. Bab", "House M. Cac", "Street M. Bak", "Villa Thomas"
), class = "factor"), X3 = structure(c(2L, 1L, 3L, 1L, 1L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "A",
"A - C", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L), X2 = structure(c(4L, 1L, 5L, 1L,
1L, 6L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 7L), .Label = c("", "Garden Harris WHITE",
"House M. Aba SMITH", "House M. Bab", "House M. Cac", "Street M. Bak",
"Villa Thomas BURNEY"), class = "factor"), X3 = structure(c(2L,
1L, 4L, 1L, 1L, 6L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 5L), .Label = c("",
"A", "A - B", "A - C", "B - D", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
Follow up question here: Cleaning extract_tables conditional merge rows, systematic extraction
You could use tidyverse:
library(tidyr)
library(stringr)
library(dplyr)
df %>%
filter(X2 != "") %>%
mutate(
extract_name = lead(str_extract(X2, "(?<=[A-Z])[A-Z]+")),
extract_part = lead(str_extract(X2, "[A-Z](\\s-\\s[A-Z]){0,1}(?=[A-Z]+)")),
new_X2 = ifelse(!is.na(extract_name), paste(X2, extract_name), as.character(X2)),
new_X3 = ifelse(X3 != "", as.character(X3), extract_part)
) %>%
drop_na(X1) %>%
select(-extract_name, -extract_part)
which returns
X1 X2 X3 new_X2 new_X3
1 111 House M. Bab A House M. Bab A
2 2 House M. Cac A - C House M. Cac A - C
3 121 Street M. Bak D Street M. Bak D
4 121 House M. Aba House M. Aba SMITH A
5 141 Garden Harris Garden Harris WHITE A - B
6 141 Villa Thomas Villa Thomas BURNEY B - D
Note: I don't think this approach is really stable regarding the regex used. For readability I filtered out some annoying rows containing NA and empty strings, you should remove those parts if necessary.
Here is how we could do it:
Credits to MartinGal for the regex "(?<=[A-Z])[A-Z]+") (upvote!)
Replace empty values with NA
Use lead to move rows up in X3 conditional on NA else not
filter if is not NA in X1
Extract the important information with str_extract and regex "(?<=[A-Z])[A-Z]+" -> combine this info with column X2 with str_c and finally coalesce both.
Remove the string to keep relevant one with regex and str_remove
library(dyplr)
library(stringr)
df %>%
mutate(across(everything(), ~sub("^\\s*$", NA, .)),
X3= ifelse(is.na(X3), lead(X2), X3)) %>%
filter(!is.na(X1)) %>%
mutate(X2 = coalesce(str_c(X2," ", str_extract(X3, "(?<=[A-Z])[A-Z]+")), X2),
X3 = str_remove_all(X3, "(?<=[A-Z])[A-Z]+"))
Output:
X1 X2 X3
1 111 House M. Bab A
2 2 House M. Cac A - C
3 121 Street M. Bak D
4 121 House M. Aba SMITH A
5 141 Garden Harris WHITE A - B
6 141 Villa Thomas BURNEY B - D
This is a yucky one:
# Retype the data and nullify empty values;
# use X1 as a key: intermediateResult => data.frame
intermediateResult <- data.frame(
lapply(
transform(
replace(df, df == "", NA_character_),
X1 = na.omit(X1)[cumsum(!is.na(X1))]
),
as.character
)
)
# Re-structure the data:
# interemdiateResult2 => data.frame
intermediateResult2 <- do.call(
rbind,
Filter(
function(y){
nrow(y) > 0
},
Map(
function(x){
z <- x[!is.na(x$X2),]
if(nrow(z) > 1 & is.na(z$X3[1])){
z$X3[1] <- z$X2[2]
head(z, 1)
}else{
z
}
},
with(
intermediateResult,
split(
intermediateResult,
paste(
X1,
cumsum(
(is.na(X2)
)
),
sep = " - "
)
)
)
)
)
)
# Regex it and hope for the best:
# result => data.frame
result <- data.frame(
transform(
intermediateResult2,
X2 = paste0(
X2,
ifelse(
(nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3)),
"",
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
paste0(" ", substr(X3, 2, nchar(X3))),
paste0(" ", gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\2", X3))
)
)
),
X3 = ifelse(
nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3) ,
X3,
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
substr(X3, 1, 1),
gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\1", X3)
)
)
),
row.names = NULL
)
This question already has an answer here:
Rolling multiple regression panel data
(1 answer)
Closed 2 years ago.
I am trying to do a rolling multiple regression of a dataset grouped by stock. A sample of the dataset can be seen below. It goes from 1991 to 2019 and contains information on stocks like returns, etc. What I intend to do is regress dependant variable ExcessReturn on EPU_Paper for each stock using a 36 month rolling regression window. I also want the stocks to have at least 18 monthly return observations in the 36 month rolling windows. ISIN is the identifier of each stock in this case. I also want to include SIZE, INVEST, BM and OP as controls in the regression. The dataset does not contain any NAs except for the beta_monthly column which is all NAs. .
This is the code I have tried to run. I am able to get the for-loop working, but do not get the desired result. Optimally I would want the coefficient of EPU_Paper to be appended in df_Final in the correct row.
I am open for both completely new solutions and variants of my current attempt.
# Create date sequence
date <- seq(as.Date("1991-01-01"),as.Date("2019-12-31"), by = "month")
## Model
v <- 36 # No. of observations in rolling regression
w <- 18 # observations of stocks requred in period
df_Final$beta_monthly <- NA
for (i in 1:(length(date)-v)) {
beta.tab <- df_Final %>% filter(Date >= date[i] & Date < date[i+v]) %>%
group_by(ISIN) %>% filter(n() >= w) %>%
do(ols.model = lm(formula = ExcessReturn ~ EPU_Paper + SIZE, .)) %>%
mutate(beta_monthly = coefficients(ols.model)[2],
Date = date[v+1])
}
This is the dataframe df_Final. It contains all the data I need.
structure(list(Year = c(2002, 2004, 2011, 2011, 2012, 1993, 2005,
2019, 2005, 1998), Month = c(5, 4, 12, 11, 4, 11, 7, 2, 12, 12
), ISIN = c("NO0003172207", "NO0003072803", "NO0010001118", "NO0010096985",
"NO0010052350", "NO0004031303", "NO0003733800", "NO0003049405",
"NO0003028904", "NO0004684408"), SIZE = c(1143750000, 894618192,
257727844.92, 293346266180.2, 104014912.25, 1312826651.5, 51164845865,
535492777.6, 1.2465e+10, 8815671800), BM = c(2.69336652499494e-06,
6.25913195949328e-07, 3.0680673824874e-07, 9.99841307356348e-07,
3.99901247813628e-06, 3.6136784151303e-06, 6.27009692475242e-07,
6.397720392755e-07, 1.985559566787e-07, 9.2518383241951e-07),
OP = c(-0.0259646808923766, 0.197313839816668, 0.136649432305334,
0.594948150836374, -0.0018535993529254, -0.0801364023870418,
0.130539826349566, 0.0244477246423, 0.620295983086681, 0.103857566765579
), INVEST = c(0.129154816408376, 0.0321275661230328, -0.092547902189399,
0.142434794968375, -0.121033439243494, -0.00124744840099796,
-0.240237999927217, 0.0376008757633188, 0.060294968189705,
0.112664489390554), MonthlyReturn = c(-0.039797852179406,
-0.066030013642565, 0.019230769230769, 0.049271412097704,
-0.12516823687752, -0.02219755826859, 0.057851239669421,
-0.043636363636364, 0.05232436939754, 0.32743529766845),
RiskFreeRate = c(0.00558, 0.00163, 0.00209, 0.00251, 0.00163,
0.00467, 0.00181, 0.00086, 0.00208, 0.00726), ShareTurnover = c(69750L,
5250L, 369135L, 183793926L, 54869L, 2879656L, 7957362L, 367551L,
2478662L, 2245928L), MarketExcessReturn = c(-2.7155, -3.0781,
1.0322, -0.3552, -0.9447, -4.9307, 6.0359, 3.8371, 6.932,
-0.7896), ExcessReturn = c(-4.5377852179406, -6.7660013642565,
1.7140769230769, 4.6761412097704, -12.679823687752, -2.686755826859,
5.6041239669421, -4.4496363636364, 5.024436939754, 32.017529766845
), TradeDate = structure(c(11838, 12538, 15338, 15308, 15460,
8734, 12993, 17955, 13147, 10590), class = "Date"), GR_SIZE = structure(c(3L,
2L, 1L, 3L, 1L, 2L, 3L, 1L, 3L, 3L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(1L, 2L, 2L,
3L, 1L, 1L, 2L, 1L, 3L, 1L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(3L, 2L, 1L, 3L, 3L, 3L, 2L, 2L, 1L, 3L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(9L, 5L, 1L,
9L, 3L, 6L, 8L, 2L, 7L, 9L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(7L,
5L, 2L, 9L, 1L, 4L, 8L, 1L, 9L, 7L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(9L, 5L, 1L, 9L, 1L, 4L, 7L, 2L,
8L, 8L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(11808,
12509, 15309, 15279, 15431, 8705, 12965, 17928, 13118, 10561
), class = "Date"), EPU_Paper = c(53.995111032374, 68.0510031873012,
150.261825109363, 124.78265498286, 47.2994312059608, 164.273390295025,
27.168222382902, 181.297305839429, 29.292072793154, 139.423199892468
), beta_monthly = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), row.names = c(NA, -10L), class = "data.frame")
You could try a split-apply-combine approach.
.date <- sort(unique(df2$date))
i <- 1; v <- 24; w <- 3
res <- do.call(rbind, lapply(1:v, function(i) {
ds <- df2$date %in% .date[0:(v - 1) + i]
S <- split(df2[ds, ], df2[ds, "date"])
dat <- do.call(rbind, unname(S[sapply(S, nrow) >= w]))
fit <- lm(y ~ x1 + x2, dat)
rs <- dat$date %in% .date[i]
beta_monthly <- unname(fit$coef[2])
if (any(rs)) cbind(dat[rs, ], beta_monthly) else NULL
}))
head(res, 10)
# cl date y x1 x2 beta_monthly
# 6 A 2011-02-01 0.1947135 38.20878 62846231450 -0.01949786
# 7 B 2011-02-01 5.7908162 130.09371 73446134000 -0.01949786
# 8 C 2011-02-01 -1.6335241 63.67381 93917412861 -0.01949786
# 9 D 2011-02-01 -4.8414052 151.70718 76852791458 -0.01949786
# 10 E 2011-02-01 4.1640901 123.10342 16714132588 -0.01949786
# 11 A 2011-03-01 -2.0569659 104.46436 28101485893 -0.01935559
# 12 B 2011-03-01 9.2205063 24.58415 42584043997 -0.01935559
# 13 C 2011-03-01 -0.1572310 65.94721 83745620495 -0.01935559
# 14 D 2011-03-01 5.2782394 25.69336 15235322119 -0.01935559
# 15 E 2011-03-01 3.6096263 163.65887 66618792459 -0.01935559
Data:
set.seed(42)
df2 <- expand.grid(cl=LETTERS[1:5],
date=seq(as.Date("2011-01-01"), as.Date("2019-12-31"), by="month"))
df2 <- df2[-sample(1:nrow(df2), nrow(df2)*.1), ]
n <- nrow(df2)
df2 <- transform(df2,
y=rnorm(n, 2, 5),
x1=runif(n, 20, 180),
x2=runif(n, 1e8, 1e11))
I am trying to perform a rolling regression for time t over the last 36 months for companies with observations for 18 of these months, but I am not able to make the function work. I only want the coefficient for var1. X, y, z are control variables.
Here is a sample of the data and the code I am trying to run.
structure(list(Year = c(2018, 2014, 2008, 2004, 2005, 2002, 2010,
2008, 2013, 1998), Month = c(6, 12, 4, 6, 4, 8, 12, 11, 3, 3),
ISIN = c("NO0004895103", "NO0010571680", "NO0010010473",
"NO0003079709", "NO0003117202", "NO0003073801", "NO0010379266",
"NO0004913609", "NO0003072407", "NO0003679102"), SIZE = c(3637822300,
1.155e+10, 1925631048.5, 519688494.5, 790931587.08, 127597142.4,
2892152759.3, 554108244.7, 73484767, 375946560), BM = c(7.47698486077218e-07,
7.92202995594714e-07, 1.9900582030005e-07, 3.59385930497676e-07,
6.54209144403066e-07, 6.72227955087816e-07, 1.43273711611857e-06,
5.15824596832591e-07, 1.23252873661613e-06, 2.79726876987137e-07
), OP = c(0.145781283498513, 0.0433749257278669, 0.130477272126991,
0.211294037715838, 0.127523990874984, -0.0185266025249259,
-0.0822835066759631, 0.128844560922287, -0.0155446815227611,
0.416024745410667), INVEST = c(1.58608106515088, 0.00307252384303782,
0.33925195328069, -0.0984424241606425, 0.333479950150282,
0.13302885162465, -0.035773912311751, 0.408569401011161,
-0.206778240645154, -0.177187857233583), MonthlyReturn = c(-0.019509251810137,
0.095308641975309, 0.14864864864865, -0.034364261168385,
-0.064512147964095, -0.14080459770115, 0.13580244980708,
-0.1890214797136, -0.045226130653266, 0.39682527166336),
RiskFreeRate = c(0.00064, 0.00117, 0.00532, 0.00163, 0.00166,
0.00594, 0.00206, 0.00499, 0.00153, 0.00332), ShareTurnover = c(31649L,
907793L, 5318465L, 1831390L, 8956640L, 302000L, 7333090L,
516000L, 75553L, 2836550L), MarketExcessReturn = c(1.0813,
2.7638, 11.5701, 6.4672, -3.3108, -1.8598, 10.5445, -9.0969,
-0.0793, 10.8305), ExcessReturn = c(-2.0149251810137, 9.4138641975309,
14.332864864865, -3.5994261168385, -6.6172147964095, -14.674459770115,
13.374244980708, -19.40114797136, -4.6756130653266, 39.350527166336
), TradeDate = structure(c(17711, 16434, 13999, 12599, 12902,
11929, 14973, 14211, 15791, 10316), class = "Date"), GR_SIZE = structure(c(2L,
3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), GR_OP = structure(c(2L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor"),
GR_BM = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 2L, 1L
), .Label = c("1", "2", "3"), class = "factor"), GR_INVEST = structure(c(3L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L), .Label = c("1", "2",
"3"), class = "factor"), SIZE_BM = structure(c(5L, 8L, 4L,
4L, 5L, 2L, 9L, 2L, 2L, 1L), .Label = c("11", "12", "13",
"21", "22", "23", "31", "32", "33"), class = "factor"), SIZE_OP = structure(c(5L,
7L, 5L, 5L, 4L, 1L, 7L, 2L, 1L, 3L), .Label = c("11", "12",
"13", "21", "22", "23", "31", "32", "33"), class = "factor"),
SIZE_INVEST = structure(c(6L, 7L, 6L, 4L, 6L, 3L, 8L, 3L,
1L, 1L), .Label = c("11", "12", "13", "21", "22", "23", "31",
"32", "33"), class = "factor"), Date = structure(c(17683,
16405, 13970, 12570, 12874, 11900, 14944, 14184, 15765, 10286
), class = "Date"), EPU_Paper = c(197.436482473082, 181.040599101032,
58.1799902251583, 62.5412044042803, 57.4799138334861, 52.420100605017,
61.1549363311955, 92.1818760618723, 76.2564063202547, 118.390708950295
)), row.names = c(NA, -10L), class = "data.frame")
#merge dataframes
df_Final <- merge(df_Final, EPU_Paper, by = c("Year", "Month"))
# Create empty dataframe
date <- seq(as.Date("1991-01-31"),as.Date("2019-12-30"), by = "month")
Beta <- data.frame(ISIN = character(), Beta = numeric(), date =
as.Date(character()), stringsAsFactors = FALSE)
## Model
Returns_length <- 36 # No. of observations in rolling regression
Returns_req <- 18 # observations of stocks requred in period
for (i in 1:(length(date)-Returns_length)) {
beta.tab <- df_Final %>% filter(TradeDate >= date[i] & TradeDate < date[i+Returns_length]) %>%
group_by(ISIN) %>% filter(n() >= Returns_req) %>%
do(ols.model = lm(data = df_Final, formula = ExcessReturn ~ var1 + x + y + z)) %>%
mutate(Beta = coef(ols.model)[2]) %>% select("ISIN", "Beta") %>%
mutate(TradeDate = date[Returns_length+i])
Beta <- rbind(Beta, beta.tab)}
There seems to be multiple undefined inputs, df_Final and EU_paper, in the question but assuming that the data is in time order within group and that we want to apply lm to w rows from the current row to the row w-1 prior then using some generated data in the Note at the end we define a coef.x function to calculate the desired coefficient and a rolling function roll which uses it (here with width 2) and then apply it by group g.
library(zoo)
w <- 2
nr <- nrow(DF)
coef.x <- function(ix) coef(lm(y ~ x + x1, DF, subset = ix))[["x"]]
roll <- function(ix) rollapplyr(ix, w, coef.x, fill = NA)
transform(DF, coef.x = ave(1:nr, g, FUN = roll))
giving:
y x x1 g coef.x
1 1 -0.56047564655 0.4609162060 1 NA
2 2 -0.23017748948 -1.2650612346 1 3.0275676040
3 3 1.55870831415 -0.6868528519 1 0.5590071753
4 4 0.07050839142 -0.4456619701 2 NA
5 5 0.12928773516 1.2240817974 2 17.0127792594
6 6 1.71506498688 0.3598138271 2 0.6306055904
This could also be written using dplyr where w, roll and coef.x are from above:
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
DF %>%
group_by(g) %>%
mutate(coef.x = roll(cur_group_rows())) %>%
ungroup
Note
# test data
set.seed(123)
DF <- data.frame(y = 1:6, x = rnorm(6), x1 = rnorm(6),
g = c(1, 1, 1, 2, 2, 2))
I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"
I'm using R and I am trying to delete some rows from a data frame based on some constrains. so, if I got
dat <- data.frame(Cs=c("c1","c2","c3","c4","c5","c6"),
R1=sample(c("Y","N"),6,replace=TRUE), R2=sample(c("Y","N"),6,replace=TRUE),
R3=sample(c("Y","N"),6,replace=TRUE), R4=sample(c("Y","N"),6,replace=TRUE),
R5=sample(c("Y","N"),6,replace=TRUE), R6=sample(c("Y","N"),6,replace=TRUE))
I'd like to delete all the rows having a "N" at some given columns such as R1, R3, R4. For one single column, I found this solution: delete row for certain constrains
d <- dat[dat[,"R1"]!="N",]
which works fine. but if I put multiple columns as
d <- dat[dat[,c("R1","R3","R4")]!="N",]
I got lots of extra rows full of NA. So where am I wrong?
You can use
dat[rowSums(dat[, c("R1","R3","R4")] == "N") == 0, , drop=FALSE]
# Cs R1 R2 R3 R4 R5 R6
#5 c5 Y Y Y Y Y Y
Or, if you don't like excessive typing:
dat[!rowSums(dat[c('R1','R3','R4')]=='N'),]
This will first test each "cell" of columns "R1", "R3" and "R4" of your data whether it is equal to "N" and then calculate the sums of TRUE values per row. If no "N" is present in a row, the sum is equal to 0 and will be kept. I added drop=FALSE to keep the structure as a data.frame.
Note after a comment by OP:
If you subset only 1 column of a data.frame without specifying a drop=TRUE option, the default behavior of [.data.frame is to coerce the resulting 1-column-data.frame to an atomic vector. Then, rowSums wouldn't work on that resulting vector. To avoid that, change your code to:
dat[!rowSums(dat[,'R1', drop=FALSE]=='N'), ]
Sample data:
set.seed(5)
dat <- data.frame(Cs=c("c1","c2","c3","c4","c5","c6"),
R1=sample(c("Y","N"),6,replace=TRUE), R2=sample(c("Y","N"),6,replace=TRUE),
R3=sample(c("Y","N"),6,replace=TRUE), R4=sample(c("Y","N"),6,replace=TRUE),
R5=sample(c("Y","N"),6,replace=TRUE), R6=sample(c("Y","N"),6,replace=TRUE))
You could make a 'keep'-variable consisting of booleans for each row:
keep <- apply(dat[,c("R1","R3","R4")],
MARGIN=1,
FUN=function(x){all(x!='N')})
res <- dat[keep,]
> res
Cs R1 R2 R3 R4 R5 R6
1 c1 Y Y Y Y Y Y
data:
seed used: 1234
dat <- structure(list(Cs = structure(1:6, .Label = c("c1", "c2", "c3",
"c4", "c5", "c6"), class = "factor"), R1 = structure(c(2L, 1L,
1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), R2 = structure(c(2L,
2L, 1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
R3 = structure(c(2L, 1L, 2L, 1L, 2L, 2L), .Label = c("N",
"Y"), class = "factor"), R4 = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), R5 = structure(c(2L,
1L, 1L, 1L, 1L, 2L), .Label = c("N", "Y"), class = "factor"),
R6 = structure(c(2L, 2L, 2L, 1L, 2L, 1L), .Label = c("N",
"Y"), class = "factor")), .Names = c("Cs", "R1", "R2", "R3",
"R4", "R5", "R6"), row.names = c(NA, -6L), class = "data.frame")