Df <- data.frame(prop1 = c(NA, NA, NA, "French", NA, NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, "French", "-29 to -20", "Pop"),
prop1_rank = c(NA, NA, NA, 0, NA, NA,NA, 11, NA, NA, NA, 1, NA, NA, NA, 40, 0, 2),
prop2 = c(NA, NA, NA, "Spanish", NA, NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, "Spanish", "-19 to -10", "Rock"),
prop2_rank = c(NA, NA, NA, 10, NA, NA,NA, 4, NA, NA, NA, 1, NA, NA, NA, 1, 0, 2),
initOSF1 = c(NA, NA, NA, NA, NA, "French", NA,NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, NA),
initOSF1_freq = c(NA, NA, NA, NA, NA, 66, NA,NA,NA, 0, NA, NA, NA, 14, NA, NA, NA, NA),
initOSF2 = c(NA, NA, NA, NA, NA, "Spanish", NA,NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, NA),
initOSF2_freq = c(NA, NA, NA, NA, NA, 0, NA,NA,NA, 6, NA, NA, NA, 14, NA, NA, NA, NA))
Df
I would like to organize this into
3 columns consisting: c("propositions", "ranks", "freqs"),
where,
Propositions column has the values: "French", "Spanish", "-29 to -20", "19 to -10", "Pop", "Rock", and having a separate columns for the rank values e.g., 0 for French, 10 for Spanish, etc., and frequency values e.g., 66 for French, 0 for Spanish, etc.
This is not an easy one. Probably a better solution exists:
library(tidyverse)
library(data.table)
setDT(Df) %>%
select(contains(c('prop', 'rank', 'freq'))) %>%
filter(!if_all(everything(), is.na)) %>%
melt(measure.vars = patterns(c('prop.$', 'rank$', 'freq'))) %>%
group_by(gr=cumsum(!is.na(value1)))%>%
summarise(across(-variable, ~if(length(.x)>1) na.omit(.x) else .x))
# A tibble: 12 x 4
gr value1 value2 value3
<int> <chr> <dbl> <dbl>
1 1 French 0 66
2 2 -29 to -20 11 0
3 3 Pop 1 14
4 4 French 40 NA
5 5 -29 to -20 0 NA
6 6 Pop 2 NA
7 7 Spanish 10 0
8 8 -19 to -10 4 6
9 9 Rock 1 14
10 10 Spanish 1 NA
11 11 -19 to -10 0 NA
12 12 Rock 2 NA
I have a data frame that looks something like this:
> df
# A tibble: 5,427 x 3
cond desired inc
<chr> <dbl> <dbl>
1 <NA> 0 0
2 <NA> 5 5
3 X 10 5
4 X 7 7
5 <NA> 16 16
6 <NA> 21 5
7 <NA> 26 5
8 <NA> 31 5
9 X 37 6
10 <NA> 5 5
this already includes my desired output. What I want to do is sum up the values of inc, but reset the sum if there is an X in the cond-column of the previous row. So for example in row 9 I'd take the desired-value from the previous row (31) and add the inc-value from row 9 (6) which gives 37. And in row 5 I'd just take the inc-value because the cond-column of the previous row was X. I solved this using a loop, but I'd like to use a vectorized solution. So far I got this:
df$test <- 0
df <- df %>% mutate(test = ifelse(is.na(lag(df$cond)), lag(test) + inc, inc))
If I run the second line once I get this:
> df
# A tibble: 5,427 x 4
cond desired inc test
<chr> <dbl> <dbl> <dbl>
1 <NA> 0 0 NA
2 <NA> 5 5 5
3 X 10 5 5
4 X 7 7 7
5 <NA> 16 16 16
6 <NA> 21 5 5
7 <NA> 26 5 5
8 <NA> 31 5 5
9 X 37 6 6
10 <NA> 5 5 5
After the second run it looks like this:
> df
# A tibble: 5,427 x 4
cond desired inc test
<chr> <dbl> <dbl> <dbl>
1 <NA> 0 0 NA
2 <NA> 5 5 NA
3 X 10 5 10
4 X 7 7 7
5 <NA> 16 16 16
6 <NA> 21 5 21
7 <NA> 26 5 10
8 <NA> 31 5 10
9 X 37 6 11
10 <NA> 5 5 5
# ... with 5,417 more rows
Third time:
> df
# A tibble: 5,427 x 4
cond desired inc test
<chr> <dbl> <dbl> <dbl>
1 <NA> 0 0 NA
2 <NA> 5 5 NA
3 X 10 5 NA
4 X 7 7 7
5 <NA> 16 16 16
6 <NA> 21 5 21
7 <NA> 26 5 26
8 <NA> 31 5 15
9 X 37 6 16
10 <NA> 5 5 5
Then, after the fifth time:
> df
# A tibble: 5,427 x 4
cond desired inc test
<chr> <dbl> <dbl> <dbl>
1 <NA> 0 0 NA
2 <NA> 5 5 NA
3 X 10 5 NA
4 X 7 7 7
5 <NA> 16 16 16
6 <NA> 21 5 21
7 <NA> 26 5 26
8 <NA> 31 5 31
9 X 37 6 37
10 <NA> 5 5 5
I'm using the column I'm creating with mutate in the mutate-command itself and I guess that is causing this behaviour/problem. Is there any way to get to my desired result? Thanks in advance!
the dataframe:
structure(list(cond = c(NA, NA, "X", "X", NA, NA, NA, NA, "X",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, "X",
NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, "X", NA, NA, "X",
NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, "X", NA,
NA, NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA,
NA, "X", NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA,
NA, NA, NA, "X", NA, NA, NA, "X", NA, NA, NA, NA, "X", NA, NA,
NA, NA, NA, NA, NA, NA, "X", NA, NA, "X", NA, NA, NA, NA, "X",
NA, NA, NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA,
NA, "X", NA, "X", NA, NA, NA, NA, NA, NA, NA, NA, "X", NA, NA,
NA, NA, NA, NA, NA, "X", NA, NA, NA, "X", "X", NA, NA, NA, NA,
NA, NA, NA, NA, "X", "X", NA, "X", NA, NA, NA, NA, NA, NA, NA,
NA, "X", NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, NA, "X",
NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, "X", NA, NA, NA, NA,
"X", NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, NA, NA, NA, NA,
"X", NA, NA, NA, NA, NA, NA, "X", NA, NA, NA, NA, "X", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "X", NA, "X",
NA, "X", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "X", NA, NA, NA), desired = c(0, 5, 10, 7, 16, 21, 26,
31, 37, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 5, 10, 15, 20,
30, 7, 15, 21, 25, 40, 45, 55, 12, 20, 25, 30, 35, 40, 45, 50,
55, 60, 65, 70, 75, 5, 10, 15, 20, 22, 30, 35, 45, 50, 55, 60,
65, 70, 75, 9, 14, 19, 24, 29, 34, 39, 44, 5, 7, 10, 2, 7, 12,
17, 22, 27, 5, 10, 15, 20, 25, 30, 35, 38, 4, 7, 12, 17, 22,
27, 32, 37, 39, 13, 18, 23, 28, 33, 38, 43, 48, 53, 5, 10, 15,
20, 25, 30, 35, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 5, 10,
15, 20, 2, 10, 15, 20, 25, 5, 10, 15, 20, 25, 30, 35, 40, 45,
5, 8, 12, 5, 10, 14, 19, 24, 5, 10, 15, 20, 25, 30, 35, 40, 45,
5, 10, 15, 20, 25, 28, 33, 38, 5, 11, 5, 10, 15, 20, 25, 30,
35, 40, 45, 12, 17, 22, 27, 32, 37, 42, 47, 5, 10, 15, 20, 5,
5, 10, 15, 20, 25, 30, 35, 40, 45, 5, 5, 10, 5, 10, 15, 20, 25,
30, 35, 40, 45, 5, 10, 15, 20, 5, 10, 15, 20, 25, 30, 34, 39,
44, 5, 10, 15, 20, 25, 30, 5, 10, 15, 20, 25, 5, 10, 15, 20,
25, 5, 10, 15, 20, 25, 29, 5, 10, 15, 20, 23, 25, 30, 35, 40,
5, 15, 20, 25, 30, 35, 40, 5, 10, 15, 20, 25, 5, 10, 15, 20,
25, 28, 33, 38, 43, 48, 53, 58, 71, 76, 81, 5, 10, 5, 10, 5,
10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 5,
10, 15), inc = c(0, 5, 5, 7, 16, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 10, 7, 8, 6, 4, 15, 5, 10, 12, 8, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 8, 5, 10, 5, 5,
5, 5, 5, 5, 9, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 3, 4, 3, 5, 5, 5, 5, 5, 5, 2, 13, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 2, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 12, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4,
5, 5, 5, 5, 3, 2, 5, 5, 5, 5, 10, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 13, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), .Names = c("cond",
"desired", "inc"), row.names = c(NA, -300L), class = c("tbl_df",
"tbl", "data.frame"))
Here's an example using the ave() function and the df structure from above. I'm showing all the steps for clarity but these could be reduced if needed.
library(dplyr)
df %>%
mutate(prevcond = lag(cond)) %>%
mutate(flag = ifelse(is.na(prevcond) | prevcond !='X', 0, 1)) %>%
mutate(counter = cumsum(flag)) %>%
mutate(desired2 = ave(inc, counter, FUN = cumsum))
To arrive at your desired output, we must first create a grouping column that resets every time the previous row is equal to X. For this we use row_number() in combination with zoo::na.locf(). Then we can simply use cumsum():
library(dplyr)
library(zoo)
df %>% group_by(grp = na.locf(row_number(cond),
fromLast = TRUE,
na.rm = FALSE)) %>%
mutate(test = cumsum(inc))
# cond desired inc grp test
# <chr> <dbl> <dbl> <int> <dbl>
# 1 <NA> 0 0 1 0
# 2 <NA> 5 5 1 5
# 3 X 10 5 1 10
# 4 X 7 7 2 7
# 5 <NA> 16 16 3 16
# 6 <NA> 21 5 3 21
# 7 <NA> 26 5 3 26
# 8 <NA> 31 5 3 31
# 9 X 37 6 3 37
#10 <NA> 5 5 4 5
It's best explained with an example.
I have a vector, or column from data.frame named vec:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
I would like a vectorized process (not a for loop) to change the three trailing NA when a 1 is observed.
The end vector would be:
c(NA, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, NA)
If we had:
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
The end vector would look like:
c(NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA)
A very badly written solution is:
vec2 <- vec
for(i in index(v)){
if(!is.na(v[i])) vec2[i] <- 1
if(i>3){
if(!is.na(vec[i-1])) vec2[i] <- 1
if(!is.na(vec[i-2])) vec2[i] <- 1
if(!is.na(vec[i-3])) vec2[i] <- 1
}
if(i==3){
if(!is.na(vec[i-1])) vec2[i] <- 1
if(!is.na(vec[i-2])) vec2[i] <- 1
}
if(i==2){
if(!is.na(vec[i-1])) vec2[i] <- 1
}
}
Another option:
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Although the above works with the examples, it stretches the length of vec if a 1 is found in the last positions. Better to make a simple check and wrap into a function:
threeNAs<-function(vec) {
ind<-c(outer(which(vec==1),1:3,"+"))
ind<-ind[ind<=length(vec)]
`[<-`(vec,ind,1)
}
Another fast solution:
vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1
which gives:
> vec
[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarking is only really useful when done on larger datasets. A benchmark with a 10k larger vector and the several posted solutions:
library(microbenchmark)
microbenchmark(ans.jaap = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1},
ans.989 = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
r <- which(vec==1);
vec[c(mapply(seq, r, r+3))] <- 1},
ans.sotos = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1},
ans.gregor = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[is.na(vec)] <- 0;
n <- length(vec);
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)]);
vec[vec == 0] <- NA},
ans.moody = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))});
output[output] <- 1;
output[output==0] <- NA},
ans.nicola = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)})
which gives the following benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
ans.jaap 1778.905 1937.414 3064.686 2100.595 2257.695 86233.593 100 a
ans.989 87688.166 89638.133 96992.231 90986.269 93326.393 182431.366 100 c
ans.sotos 125344.157 127968.113 132386.664 130117.438 132951.380 214460.174 100 d
ans.gregor 4036.642 5824.474 10861.373 6533.791 7654.587 87806.955 100 b
ans.moody 173146.810 178369.220 183698.670 180318.799 184000.062 264892.878 100 e
ans.nicola 966.927 1390.486 1723.395 1604.037 1904.695 3310.203 100 a
What really is 'vectorised', if not a loop written in a C-language?
Here's a C++ loop that benchmarks well.
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
library(Rcpp)
cppFunction('NumericVector fixVec(NumericVector myVec){
int n = myVec.size();
int foundCount = 0;
for(int i = 0; i < n; i++){
if(myVec[i] == 1) foundCount = 1;
if(ISNA(myVec[i])){
if(foundCount >= 1 & foundCount <= 3){
myVec[i] = 1;
foundCount++;
}
}
}
return myVec;
}')
fixVec(vec)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarks
library(microbenchmark)
microbenchmark(
ans.jaap = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 4) + c(0:3)] <- 1
},
ans.nicola = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),0:3,"+")),1)
},
ans.symbolix = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec <- fixVec(vec)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# ans.jaap 2017.789 2264.318 2905.2437 2579.315 3588.4850 4667.249 100
# ans.nicola 1242.002 1626.704 3839.4768 2095.311 3066.4795 81299.962 100
# ans.symbolix 504.577 533.426 838.5661 718.275 966.9245 2354.373 100
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec <- fixVec(vec)
vec2 <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec2[rep(which(vec2 == 1), each = 4) + c(0:3)] <- 1
identical(vec, vec2)
# [1] TRUE
The following code does what you asked for. It involves "shifting" the vector and then adding the shifted versions
vec[is.na(vec)] <- 0
n <- length(vec)
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)])
vec[vec == 0] <- NA
vec[vec != 0] <- 1
# vec | 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 ,0, 0
# c(0, vec[1:(n-1)]) | + 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0, 0
# c(0, 0, vec[1:(n-2)]) | + 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0
# c(0,0,0,vec[1:(n-3)]) | + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0
# |-------------------------------------------
# | 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0
A non-Vectorized solution, but nevertheless, another option using base R,
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec1[unique(as.vector(t(sapply(which(vec1 == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec1
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
How about this:
r <- which(vec==1)
vec[c(mapply(seq, r, r+3))] <- 1
Examples:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
With sapply, any, and is.na:
output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))})
output[output] <- 1
output[output==0] <- NA