Related
I have a dataframe like this:
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
I have to create a function (gap1) that detects each 1 in Ones and than sums n-1, n and n+1 in Thats, with n being in the same row as 1.
For example in this dataset I have two 1.
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
dat
This should be the output:
Ones Thats gap1
1 4 17 #(8+4+5)
1 1 7 #(3+1+3)
I would like to extend this gap at will, for example:
Ones Thats gap1 gap2 gap3 ...
1 4 17 29 #(6+8+4+5+6)
1 1 7 9 #(8+3+1+3+4)
There is another problem I have to consider:
Suppose we have this data frame:
dat<- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
In case there is a 1 at the beginning (or at the end), or if there is an NA, the function should use available data.
In this case, for example:
Ones Thats gap1 gap2
1 0 5 (0+5) 8 #(0+5+3)
1 4 17 (8+4+5) 29 #(6+8+4+5+6)
1 1 4 (3+1+NA) 16 #(8+3+1+NA+4)
Do you have any advice?
Using tidyverse / collapse
For arbitrary number of lead and lags the collapse package offers a nice function flag, which has further arguments to specify columns (cols), or grouping variables g.
library(dplyr)
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
filter(Ones == 1)
}
x <- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
# we can now specify how many lags to count:
f(x, 1)
Ones Thats gap
1 1 0 5
2 1 4 17
3 1 1 4
f(x, 2)
Ones Thats gap
1 1 0 8
2 1 4 29
3 1 1 16
Or if you want to specify the number of gaps to compute, we can simplify the function to
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2)) %>%
filter(Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
2 1 4 17 29
3 1 1 4 16
Base R
If you like terse functions:
f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
6 1 4 17 29
11 1 1 4 16
With BaseR,
myfun <- function(data,gap=1) {
points <- which(data["Ones"]==1)
sapply(points, function(x) {
bottom <- ifelse(x-gap<=0,1,x -gap)
top <- ifelse(x+ gap > nrow(data),nrow(data),x +gap)
sum(data[bottom:top,"Thats"], na.rm=T)
})
}
#> myfun(dat,1)
#[1] 5 17 4
#> myfun(dat,2)
#[1] 8 29 16
Another base R solution
f <- function(dat, width = 1)
{
dat$gaps <- sapply(seq(nrow(dat)), function(x) {
if(dat$Ones[x] == 0) return(0)
i <- x + seq(2 * width + 1) - (width + 1)
i <- i[i > 0]
i <- i[i < nrow(dat)]
sum(dat$Thats[i])
})
dat[dat$Ones == 1,]
}
f(dat, 1)
#> Ones Thats gaps
#> 6 1 4 17
#> 11 1 1 7
f(dat, 2)
#> Ones Thats gaps
#> 6 1 4 29
#> 11 1 1 19
I am wondering if there isn’t a more elegant way to do this. I tried rollapply but could never get it to respond to more than the first column of the zoo object.
I want to access a 2-dimensional zoo or xts object, create a rolling window that includes all columns, perform some operation on each instance of the rolling window, and return a matrix containing the result of the operation on each of the rolling windows. I want the operation on a windowed snippet to be assignable to a function that I externally define.
Here is an example that works, but is not very elegant:
rolling_function <- function(my_data, w, FUN = my_func)
{
## Produce a rolling window of width w starting at
## w, ending at nrow(my_data), with window width w.
## FUN is some function passed that performs some
## operation on 'snippet' and returns a value for
## each column of snippet. That is assembled into
## a matrix and returned.
## Set up a matrix to hold results
results <- matrix(ncol = ncol(my_data),
nrow = (nrow(my_data) - w + 1))
nn <-nrow(my_data)
for(jstart in 1:(nn - w + 1))
{
snippet <- window(my_data,
start = index(my_data[jstart]),
end = index(my_data[jstart + w - 1]))
## Do something with snippet here
# print(my_func(snippet))
results[jstart, ] <- FUN(snippet)
}
return(results)
}
my_func <- function(x)
{
# An example function that takes the difference between
# the first and last rows of the snippet, x
result <- as.vector(x[1,]) - as.vector(x[nrow(x),])
return(result)
}
A small test case is given below:
## Main code
## Define a zoo object with dummy dates
my_data <-zoo(matrix(data = c(1,5,6,5,3,7,8,8,8,2,4,5),
nrow = 4, ncol = 3), order.by = as.Date(100:103))
## Define a window width of 2 and call the rolling function
width = 2
print(rolling_function(my_data, width))
The test zoo object is:
1970-04-11 1 3 8
1970-04-12 5 7 2
1970-04-13 6 8 4
1970-04-14 5 8 5
and the test output is:
[,1] [,2] [,3]
[1,] -4 -4 6
[2,] -1 -1 -2
[3,] 1 0 -1
Is there a more elegant/straightforward/faster way to perform this operation, perhaps using rollapply (I could not make this work)?
Assuming the input z shown reproducibly in the Note at the end, if the width is 2 then:
library(zoo)
-diff(z)
## V2 V3 V4
## 1970-04-12 -4 -4 6
## 1970-04-13 -1 -1 -2
## 1970-04-14 1 0 -1
and in general:
w <- 2 # modify as needed
-diff(z, w-1)
## V2 V3 V4
## 1970-04-12 -4 -4 6
## 1970-04-13 -1 -1 -2
## 1970-04-14 1 0 -1
or using rollapplyr:
w <- 2 # modify as needed
rollapplyr(z, w, function(x) x[1] - x[w])
## V2 V3 V4
## 1970-04-12 -4 -4 6
## 1970-04-13 -1 -1 -2
## 1970-04-14 1 0 -1
Note
Lines <- "
1970-04-11 1 3 8
1970-04-12 5 7 2
1970-04-13 6 8 4
1970-04-14 5 8 5"
library(zoo)
z <- read.zoo(text = Lines)
Is there a more elegant way to solve this problem?
For every TRUE value I'm looking for the positions of the closest previous and following FALSE values.
data:
vec <- c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE)
desired outcome: (something like)
pos start end
[1,] 2 1 4
[2,] 3 1 4
[3,] 5 4 6
explanation of the first row of the outcome:
pos = 2, position of the first TRUE,
start = 1, position of the closest FALSE in front of pos = 2
end = 4, position of the closest FALSE after pos = 2.
Already working solution:
pos = which(vec)
f_pos = which(!vec)
t(
sapply(pos, function(x){ s <- rev(f_pos[f_pos < x])[1]; e <- f_pos[x < f_pos][1]; return(data.frame(pos = x, start = s, end = e)) })
)
Using findInterval
pos <- which(vec)
b <- which(!vec)
ix <- findInterval(pos, b)
cbind(pos, from = b[ix], to = b[ix + 1])
# pos from to
# [1,] 2 1 4
# [2,] 3 1 4
# [3,] 5 4 6
If we stretch your "something like" slightly, a simple cut will do:
data.frame(pos, rng = cut(pos, b))
# pos rng
# 1 2 (1,4]
# 2 3 (1,4]
# 3 5 (4,6]
If the vector ends with TRUE, the findInterval solution will give NA in 'to' column. In cut, the last 'interval' is then coded as NA.
You can do as if FALSE defined intervals and use data.table::foverlaps to find the right ones:
library(data.table)
# put your objects in data.tables:
f_pos_inter <- data.table(start=head(f_pos, -1), end=tail(f_pos, -1))
pos_inter <- data.table(start=pos, end=pos)
# define the keys:
setkeyv(pos_inter, c("start", "end")); setkeyv(f_pos_inter, c("start", "end"))
res <- foverlaps(pos_inter, f_pos_inter)
# start end i.start i.end
#1: 1 4 2 2
#2: 1 4 3 3
#3: 4 6 5 5
You can further reorder the columns and keep only the ones you need:
res[, i.end:=NULL]
setcolorder(res, c(3, 1, 2))
setnames(res, "i.start", "pos")
res
# pos start end
#1: 2 1 4
#2: 3 1 4
#3: 5 4 6
N.B: this will give NA in both columns start and end if vec ends with TRUE
Here's my problem I couldn't solve it all.
Suppose that we have the following code as follows:
## A data frame named a
a <- data.frame(A = c(0,0,1,1,1), B = c(1,0,1,0,0), C = c(0,0,1,1,0), D = c(0,0,1,1,0), E = c(0,1,1,0,1))
## 1st function calculates all the combinaisons of colnames of a and the output is a character vector named item2
items2 <- c()
countI <- 1
while(countI <= ncol(a)){
for(i in countI){
countJ <- countI + 1
while(countJ <= ncol(a)){
for(j in countJ){
items2 <- c(items2, paste(colnames(a[i]), colnames(a[j]), collapse = '', sep = ""))
}
countJ <- countJ + 1
}
countI <- countI + 1
}
}
And here's my code I'm trying to solve (the output is a numeric vector called count_1):
## 2nd function
colnames(a) <- NULL ## just for facilitating the calculation
count_1 <- numeric(ncol(a)*2)
countI <- 1
while(countI <= ncol(a)){
for(i in countI){
countJ <- countI + 1
while(countJ <= ncol(a)){
for(j in countJ){
s <- a[, i]
p <- a[, j]
count_1[i*2] <- as.integer(s[i] == p[j] & s[i] == 1)
}
countJ <- countJ + 1
}
countI <- countI + 1
}
}
But when I execute this code in RStudio Console, a non-expectation result returned!:
count_1
[1] 0 0 0 0 0 1 0 1 0 0
However, I am expecting the following result:
count_1
[1] 1 2 2 2 1 1 1 1 2 1
You can see visit the following URL where you can find an image on Dropbox for detailed explanation.
https://www.dropbox.com/s/5ylt8h8wx3zrvy7/IMAG1074.jpg?dl=0
I'll try to explain a little more,
I posted the 1st function (code) just to show you what I'm looking for exactly that is an example that's all.
What I'm trying to get from the second function (code) is calculating the number of occurrences of number 1 (firstly we put counter = 0) in each row (while each row of two columns (AB, for example) must equal to one in both columns to say that counter = counter + 1) we continue by combing each column by all other columns (with AC, AD, AE, BC, BD, BE, CD, CE, and then DE), combination is n!/2!(n-2)!, that means for example if I have the following data frame:
a =
A B C D E
0 1 0 0 0
0 0 0 0 1
1 1 1 1 1
1 0 0 1 0
1 0 1 0 1
Then, the number of occurrences of the number 1 for each row by combining the two first columns is as follows: (Note that I put colnames(a) <- NULL just to facilitate the work and be more clear)
0 1 0 0 0
0 0 0 0 1
1 1 1 1 1
1 0 0 1 0
1 0 1 0 1
### Example 1: #####################################################
so from here I put (for columns A and B (AB))
s <- a[, i]
## s is equal to
## [1] 0 0 1 1 1
p <- a[, j]
## p is equal to
## [1] 1 0 1 0 0
Then I'll look for the occurrence of the number 1 in both vectors in condition it must be the same, i.e. a[, i] == 1 && a[, j] == 1 && a[, i] == a[, j], and for this example a numeric vector will be [1] 1
### Example 2: #####################################################
From here I put (for columns A and D (AD))
s <- a[, i]
## s is equal to
## [1] 0 0 1 1 1
p <- a[, j]
## p is equal to
## [1] 0 0 1 1 0
Then I'll look for the occurrence of the number 1 in both vectors in condition it must be the same, i.e. a[, i] == 1 && a[, j] == 1 && a[, i] == a[, j], and for this example a numeric vector will be [1] 2
And so on,
I'll have a numeric vector named count_1 equal to:
[1] 1 2 2 2 1 1 1 1 2 1
while each index of count_1 is a combination of each column by others (without the names of the data frame)
AB AC AD AE BC BD BE CD CE DE
1 2 2 2 1 1 1 1 2 1
Not clear what you're after at all.
As to the first code chunk, that is some ugly R coding involving a whole bunch of unnecessary while/for loops.
You can get the same result items2 in one single line.
items2 <- sort(toupper(unlist(sapply(1:4, function(i)
sapply(5:(i+1), function(j)
paste(letters[i], letters[j], sep = ""))))));
items2;
# [1] "AB" "AC" "AD" "AE" "BC" "BD" "BE" "CD" "CE" "DE"
As to the second code chunk, please explain what you're trying to calculate. It's likely that these while/for loops are as unnecessary as in the first case.
Update
Note that this is based on a as defined at the beginning of your post. Your expected output is based on a different a, that you changed further down the post.
There is no need for a for/while loop, both "functions" can be written in two one-liners.
# Your sample dataframe a
a <- data.frame(A = c(0,0,1,1,1), B = c(1,0,1,0,0), C = c(0,0,1,1,0), D = c(0,0,1,1,0), E = c(0,1,1,0,1))
# Function 1
items2 <- toupper(unlist(sapply(1:(ncol(a) - 1), function(i) sapply(ncol(a):(i+1), function(j)
paste(letters[i], letters[j], sep = "")))));
# Function 2
count_1 <- unlist(sapply(1:(ncol(a) - 1), function(i) sapply(ncol(a):(i+1), function(j)
sum(a[, i] + a[, j] == 2))));
# Add names and sort
names(count_1) <- items2;
count_1 <- count_1[order(names(count_1))];
# Output
count_1;
#AB AC AD AE BC BD BE CD CE DE
# 1 2 2 2 1 1 1 2 1 1
I have a dataframe that is n rows and 1 column. I can do this to find the 2nd row minus the 1st row:
> dif = df[2,1] - df[1,1]
How do I find df[n,1] - df[n-1, 1] for all rows in df?
head and tail are handy for this...
df <- data.frame( a = 1:5 , b = 5:1 )
tail(df,-1) - head(df,-1)
# a b
#2 1 -1
#3 1 -1
#4 1 -1
#5 1 -1
This will therefore accomplish what you are after, row by row, for all columns at the same time.
diff is also handy for this task
> set.seed(1)
> df <- data.frame( a = sample(5) , b = sample(5) ) # some data
> sapply(df, diff)
a b
[1,] 3 -1
[2,] -1 -2
[3,] -1 1
[4,] -2 -2
You can also use filter here :
filter(df,c(1,-1))